4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <arpa/inet.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
/* Maximum number of HW RX/TX queue pairs the app will manage. */
56 #define MAX_QUEUES 128
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
62 * Calculate the number of buffers needed per port
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
65 (num_switching_cores*MAX_PKT_BURST) + \
66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 (num_switching_cores*MBUF_CACHE_SIZE))
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
73 * No frame data buffer allocated from host are required for zero copy
74 * implementation, guest will allocate the frame data buffer, and vhost
/* Zero-copy mbufs only need to cover one standard Ethernet frame. */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
83 * RX and TX Prefetch, Host, and Write-back threshold values should be
84 * carefully set for optimal performance. Consult the network
85 * controller's datasheet and supporting DPDK documentation for guidance
86 * on how these parameters should be set.
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
93 * These default values are optimized for use with the Intel(R) 82599 10 GbE
94 * Controller and the DPDK ixgbe PMD. Consider using other values for other
95 * network controllers and/or network drivers.
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0 /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */
101 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
104 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
107 #define JUMBO_FRAME_MAX_SIZE 0x2600
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
112 #define DEVICE_SAFE_REMOVE 2
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
123 * Need refine these 2 macros for legacy and DPDK based front end:
124 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST
125 * And then adjust power 2.
128 * For legacy front end, 128 descriptors,
129 * half for virtio header, another half for mbuf.
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136 + sizeof(struct rte_mbuf)))
/* true if x is a power of 2 (note: also true for x == 0) */
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
141 #define INVALID_PORT_ID 0xFF
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
/* Mask covering the low 48 bits (one MAC address) of a uint64_t. */
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
164 /*Number of switching cores enabled*/
165 static uint32_t num_switching_cores = 0;
167 /* number of devices/queues to support*/
168 static uint32_t num_queues = 0;
/* Number of virtio devices; set from dev_info.max_vmdq_pools in port_init(). */
169 static uint32_t num_devices;
172 * Enable zero copy, pkts buffer will directly dma to hw descriptor,
173 * disabled on default.
175 static uint32_t zero_copy;
176 static int mergeable;
178 /* number of descriptors to apply*/
/* Descriptor counts used when zero-copy is enabled; settable via CLI. */
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
182 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
183 #define MAX_RING_DESC 4096
/* Per-queue mempool/ring pair used for zero-copy buffer management. */
186 struct rte_mempool *pool;
187 struct rte_ring *ring;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
200 /* The type of host physical address translated from guest physical address. */
/* Buffer fits entirely in one host-physical sub-region. */
202 PHYS_ADDR_CONTINUOUS = 0,
/* Buffer straddles a sub-region boundary. */
203 PHYS_ADDR_CROSS_SUBREG = 1,
/* Guest physical address not found in any region. */
204 PHYS_ADDR_INVALID = 2,
/* Statistics printing: 0 = disabled, N = print interval in seconds. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in useconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
221 /* Default configuration for rx and tx thresholds etc. */
222 static struct rte_eth_rxconf rx_conf_default = {
224 .pthresh = RX_PTHRESH,
225 .hthresh = RX_HTHRESH,
226 .wthresh = RX_WTHRESH,
232 * These default values are optimized for use with the Intel(R) 82599 10 GbE
233 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
234 * network controllers and/or network drivers.
236 static struct rte_eth_txconf tx_conf_default = {
238 .pthresh = TX_PTHRESH,
239 .hthresh = TX_HTHRESH,
240 .wthresh = TX_WTHRESH,
242 .tx_free_thresh = 0, /* Use PMD default values */
243 .tx_rs_thresh = 0, /* Use PMD default values */
/* empty vmdq configuration structure. Filled in programmatically */
246 static struct rte_eth_conf vmdq_conf_default = {
249 .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
251 .header_split = 0, /**< Header Split disabled */
252 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
253 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
255 * It is necessary for 1G NIC such as I350,
256 * this fixes bug of ipv4 forwarding in guest can't
/* forward packets from one virtio dev to another virtio dev. */
259 .hw_vlan_strip = 1, /**< VLAN strip enabled. */
260 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */
261 .hw_strip_crc = 0, /**< CRC stripped by hardware */
265 .mq_mode = ETH_MQ_TX_NONE,
269 * should be overridden separately in code with
273 .nb_queue_pools = ETH_8_POOLS,
274 .enable_default_pool = 0,
277 .pool_map = {{0, 0},},
/* Mapping from sequential core index to EAL lcore id. */
282 static unsigned lcore_ids[RTE_MAX_LCORE];
283 static uint8_t ports[RTE_MAX_ETHPORTS];
284 static unsigned num_ports = 0; /**< The number of ports specified in command line */
286 static const uint16_t external_pkt_default_vlan_tag = 2000;
/* One VLAN tag per potential VMDQ pool; indexed by device_fh. */
287 const uint16_t vlan_tags[] = {
288 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
289 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
290 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
291 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
292 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
293 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
294 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
295 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
298 /* ethernet addresses of ports */
299 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
301 /* heads for the main used and free linked lists for the data path. */
302 static struct virtio_net_data_ll *ll_root_used = NULL;
303 static struct virtio_net_data_ll *ll_root_free = NULL;
305 /* Array of data core structures containing information on individual core linked lists. */
306 static struct lcore_info lcore_info[RTE_MAX_LCORE];
308 /* Used for queueing bursts of TX packets. */
312 struct rte_mbuf *m_table[MAX_PKT_BURST];
315 /* TX queue for each data core. */
316 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
/* TX queue for each virtio device for zero copy. */
319 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
321 /* Vlan header struct used to insert vlan tags on TX. */
323 unsigned char h_dest[ETH_ALEN];
324 unsigned char h_source[ETH_ALEN];
327 __be16 h_vlan_encapsulated_proto;
/* IPv4 header layout (packed, network byte order fields). */
332 uint8_t version_ihl; /**< version and header length */
333 uint8_t type_of_service; /**< type of service */
334 uint16_t total_length; /**< length of packet */
335 uint16_t packet_id; /**< packet ID */
336 uint16_t fragment_offset; /**< fragmentation offset */
337 uint8_t time_to_live; /**< time to live */
338 uint8_t next_proto_id; /**< protocol ID */
339 uint16_t hdr_checksum; /**< header checksum */
340 uint32_t src_addr; /**< source address */
341 uint32_t dst_addr; /**< destination address */
342 } __attribute__((__packed__));
344 /* Header lengths. */
346 #define VLAN_ETH_HLEN 18
348 /* Per-device statistics struct */
349 struct device_statistics {
351 rte_atomic64_t rx_total_atomic;
354 rte_atomic64_t rx_atomic;
356 } __rte_cache_aligned;
357 struct device_statistics dev_statistics[MAX_DEVICES];
360 * Builds up the correct configuration for VMDQ VLAN pool map
361 * according to the pool & queue limits.
364 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
366 struct rte_eth_vmdq_rx_conf conf;
369 memset(&conf, 0, sizeof(conf));
370 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
371 conf.nb_pool_maps = num_devices;
372 conf.enable_loop_back =
373 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
375 for (i = 0; i < conf.nb_pool_maps; i++) {
376 conf.pool_map[i].vlan_id = vlan_tags[ i ];
377 conf.pool_map[i].pools = (1UL << i);
380 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
381 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf,
382 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
387 * Validate the device number according to the max pool number gotten form
388 * dev_info. If the device number is invalid, give the error message and
389 * return -1. Each device must have its own pool.
392 validate_num_devices(uint32_t max_nb_devices)
394 if (num_devices > max_nb_devices) {
395 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
402 * Initialises a given port using global settings and with the rx buffers
403 * coming from the mbuf_pool passed as parameter
406 port_init(uint8_t port)
408 struct rte_eth_dev_info dev_info;
409 struct rte_eth_conf port_conf;
410 uint16_t rx_rings, tx_rings;
411 uint16_t rx_ring_size, tx_ring_size;
415 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
416 rte_eth_dev_info_get (port, &dev_info);
418 /*configure the number of supported virtio devices based on VMDQ limits */
419 num_devices = dev_info.max_vmdq_pools;
420 num_queues = dev_info.max_rx_queues;
/* Zero-copy path: use the CLI-configured descriptor counts. */
423 rx_ring_size = num_rx_descriptor;
424 tx_ring_size = num_tx_descriptor;
425 tx_rings = dev_info.max_tx_queues;
/* Non-zero-copy path: use defaults; one TX queue per lcore. */
427 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
428 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
429 tx_rings = (uint16_t)rte_lcore_count();
432 retval = validate_num_devices(MAX_DEVICES);
436 /* Get port configuration. */
437 retval = get_eth_conf(&port_conf, num_devices);
441 if (port >= rte_eth_dev_count()) return -1;
/* NOTE(review): trailing comma below looks like it should be ';' — verify against upstream. */
443 rx_rings = (uint16_t)num_queues,
444 /* Configure ethernet device. */
445 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
449 /* Setup the queues. */
450 for (q = 0; q < rx_rings; q ++) {
451 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452 rte_eth_dev_socket_id(port), &rx_conf_default,
453 vpool_array[q].pool);
457 for (q = 0; q < tx_rings; q ++) {
458 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
459 rte_eth_dev_socket_id(port), &tx_conf_default);
464 /* Start the device. */
465 retval = rte_eth_dev_start(port);
467 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
/* Record the port MAC for later registration/printing. */
471 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
472 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
473 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
474 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
476 vmdq_ports_eth_addr[port].addr_bytes[0],
477 vmdq_ports_eth_addr[port].addr_bytes[1],
478 vmdq_ports_eth_addr[port].addr_bytes[2],
479 vmdq_ports_eth_addr[port].addr_bytes[3],
480 vmdq_ports_eth_addr[port].addr_bytes[4],
481 vmdq_ports_eth_addr[port].addr_bytes[5]);
487 * Set character device basename.
490 us_vhost_parse_basename(const char *q_arg)
492 /* parse number string */
494 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
497 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
503 * Parse the portmask provided at run time.
/* Returns the parsed mask; callers treat 0/negative results as invalid. */
506 parse_portmask(const char *portmask)
514 /* parse hexadecimal string; reject empty input, trailing junk, or range errors */
513 /* parse hexadecimal string */
514 pm = strtoul(portmask, &end, 16);
515 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
526 * Parse num options at run time.
/* Parses a base-10 unsigned value, rejecting anything above max_valid_value. */
529 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536 /* parse unsigned int string */
537 num = strtoul(q_arg, &end, 10);
/* Reject empty input, trailing junk, or conversion/range errors. */
538 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
541 if (num > max_valid_value)
552 us_vhost_usage(const char *prgname)
554 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
556 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
557 " --dev-basename <name>\n"
559 " -p PORTMASK: Set mask for ports to be used by application\n"
560 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
561 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
562 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
563 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
564 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
565 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
566 " --dev-basename: The basename to be used for the character device.\n"
567 " --zero-copy [0|1]: disable(default)/enable rx/tx "
569 " --rx-desc-num [0-N]: the number of descriptors on rx, "
570 "used only when zero copy is enabled.\n"
571 " --tx-desc-num [0-N]: the number of descriptors on tx, "
572 "used only when zero copy is enabled.\n",
577 * Parse the arguments given in the command line of the application.
/* Returns 0 on success, -1 on any invalid option or inconsistent combination. */
580 us_vhost_parse_args(int argc, char **argv)
585 const char *prgname = argv[0];
586 static struct option long_option[] = {
587 {"vm2vm", required_argument, NULL, 0},
588 {"rx-retry", required_argument, NULL, 0},
589 {"rx-retry-delay", required_argument, NULL, 0},
590 {"rx-retry-num", required_argument, NULL, 0},
591 {"mergeable", required_argument, NULL, 0},
592 {"stats", required_argument, NULL, 0},
593 {"dev-basename", required_argument, NULL, 0},
594 {"zero-copy", required_argument, NULL, 0},
595 {"rx-desc-num", required_argument, NULL, 0},
596 {"tx-desc-num", required_argument, NULL, 0},
600 /* Parse command line */
601 while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
/* -p PORTMASK: hex mask of physical ports to use. */
605 enabled_port_mask = parse_portmask(optarg);
606 if (enabled_port_mask == 0) {
607 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608 us_vhost_usage(prgname);
614 /* Enable/disable vm2vm comms. */
615 if (!strncmp(long_option[option_index].name, "vm2vm",
617 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
619 RTE_LOG(INFO, VHOST_CONFIG,
620 "Invalid argument for "
622 us_vhost_usage(prgname);
625 vm2vm_mode = (vm2vm_type)ret;
629 /* Enable/disable retries on RX. */
630 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
631 ret = parse_num_opt(optarg, 1);
633 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
634 us_vhost_usage(prgname);
641 /* Specify the retries delay time (in useconds) on RX. */
642 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
643 ret = parse_num_opt(optarg, INT32_MAX);
645 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
646 us_vhost_usage(prgname);
649 burst_rx_delay_time = ret;
653 /* Specify the retries number on RX. */
654 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
655 ret = parse_num_opt(optarg, INT32_MAX);
657 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
658 us_vhost_usage(prgname);
661 burst_rx_retry_num = ret;
665 /* Enable/disable RX mergeable buffers. */
666 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
667 ret = parse_num_opt(optarg, 1);
669 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
670 us_vhost_usage(prgname);
/* Mergeable buffers imply jumbo-frame support on the physical port. */
675 vmdq_conf_default.rxmode.jumbo_frame = 1;
676 vmdq_conf_default.rxmode.max_rx_pkt_len
677 = JUMBO_FRAME_MAX_SIZE;
682 /* Enable/disable stats. */
683 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
684 ret = parse_num_opt(optarg, INT32_MAX);
686 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
687 us_vhost_usage(prgname);
694 /* Set character device basename. */
695 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
696 if (us_vhost_parse_basename(optarg) == -1) {
697 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
698 us_vhost_usage(prgname);
703 /* Enable/disable rx/tx zero copy. */
704 if (!strncmp(long_option[option_index].name,
705 "zero-copy", MAX_LONG_OPT_SZ)) {
706 ret = parse_num_opt(optarg, 1);
708 RTE_LOG(INFO, VHOST_CONFIG,
710 " for zero-copy [0|1]\n");
711 us_vhost_usage(prgname);
/* Zero copy is incompatible with mbuf reference counting. */
717 #ifdef RTE_MBUF_REFCNT
718 RTE_LOG(ERR, VHOST_CONFIG, "Before running "
719 "zero copy vhost APP, please "
720 "disable RTE_MBUF_REFCNT\n"
721 "in config file and then rebuild DPDK "
723 "Otherwise please disable zero copy "
724 "flag in command line!\n");
730 /* Specify the descriptor number on RX. */
731 if (!strncmp(long_option[option_index].name,
732 "rx-desc-num", MAX_LONG_OPT_SZ)) {
733 ret = parse_num_opt(optarg, MAX_RING_DESC);
734 if ((ret == -1) || (!POWEROF2(ret))) {
735 RTE_LOG(INFO, VHOST_CONFIG,
736 "Invalid argument for rx-desc-num[0-N],"
737 "power of 2 required.\n");
738 us_vhost_usage(prgname);
741 num_rx_descriptor = ret;
745 /* Specify the descriptor number on TX. */
746 if (!strncmp(long_option[option_index].name,
747 "tx-desc-num", MAX_LONG_OPT_SZ)) {
748 ret = parse_num_opt(optarg, MAX_RING_DESC);
749 if ((ret == -1) || (!POWEROF2(ret))) {
750 RTE_LOG(INFO, VHOST_CONFIG,
751 "Invalid argument for tx-desc-num [0-N],"
752 "power of 2 required.\n");
753 us_vhost_usage(prgname);
756 num_tx_descriptor = ret;
762 /* Invalid option - print options. */
764 us_vhost_usage(prgname);
/* Translate the port mask into the ports[] array. */
769 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
770 if (enabled_port_mask & (1 << i))
771 ports[num_ports++] = (uint8_t)i;
774 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
775 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
776 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
/* Cross-option consistency checks for zero copy. */
780 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
781 RTE_LOG(INFO, VHOST_PORT,
782 "Vhost zero copy doesn't support software vm2vm,"
783 "please specify 'vm2vm 2' to use hardware vm2vm.\n");
787 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
788 RTE_LOG(INFO, VHOST_PORT,
789 "Vhost zero copy doesn't support jumbo frame,"
790 "please specify '--mergeable 0' to disable the "
791 "mergeable feature.\n");
799 * Update the global var NUM_PORTS and array PORTS according to system ports number
800 * and return valid ports number
802 static unsigned check_ports_num(unsigned nb_ports)
804 unsigned valid_num_ports = num_ports;
/* Clamp the requested port count to what the system actually has. */
807 if (num_ports > nb_ports) {
808 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
809 num_ports, nb_ports);
810 num_ports = nb_ports;
/* Invalidate any port id beyond the system range. */
813 for (portid = 0; portid < num_ports; portid ++) {
814 if (ports[portid] >= nb_ports) {
815 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
816 ports[portid], (nb_ports - 1));
817 ports[portid] = INVALID_PORT_ID;
821 return valid_num_ports;
825 * Macro to print out packet contents. Wrapped in debug define so that the
/* data path is not affected when debug is disabled. */
829 #define PRINT_PACKET(device, addr, size, header) do { \
830 char *pkt_addr = (char*)(addr); \
831 unsigned int index; \
832 char packet[MAX_PRINT_BUFF]; \
835 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
837 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
838 for (index = 0; index < (size); index++) { \
839 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
840 "%02hhx ", pkt_addr[index]); \
842 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
844 LOG_DEBUG(VHOST_DATA, "%s", packet); \
/* Non-debug build: expands to a no-op statement. */
847 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
851 * Function to convert guest physical addresses to vhost physical addresses.
852 * This is used to convert virtio buffer addresses.
/* Sets *addr_type to CONTINUOUS, CROSS_SUBREG, or INVALID (no region match). */
854 static inline uint64_t __attribute__((always_inline))
855 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
856 uint32_t buf_len, hpa_type *addr_type)
858 struct virtio_memory_regions_hpa *region;
860 uint64_t vhost_pa = 0;
862 *addr_type = PHYS_ADDR_INVALID;
/* Linear scan of the per-device host-physical region table. */
864 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
865 region = &vdev->regions_hpa[regionidx];
866 if ((guest_pa >= region->guest_phys_address) &&
867 (guest_pa <= region->guest_phys_address_end)) {
868 vhost_pa = region->host_phys_addr_offset + guest_pa;
/* A buffer whose tail spills past the region end crosses a sub-region. */
869 if (likely((guest_pa + buf_len - 1)
870 <= region->guest_phys_address_end))
871 *addr_type = PHYS_ADDR_CONTINUOUS;
873 *addr_type = PHYS_ADDR_CROSS_SUBREG;
878 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
879 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
880 (void *)(uintptr_t)vhost_pa);
886 * Compares a packet destination MAC address to a device MAC address.
888 static inline int __attribute__((always_inline))
889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
891 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
895 * This function learns the MAC address of the device and registers this along with a
896 * vlan tag to a VMDQ.
899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
901 struct ether_hdr *pkt_hdr;
902 struct virtio_net_data_ll *dev_ll;
903 struct virtio_net *dev = vdev->dev;
906 /* Learn MAC address of guest device from packet */
907 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
909 dev_ll = ll_root_used;
/* Refuse registration when another known device already uses this MAC. */
911 while (dev_ll != NULL) {
912 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
913 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
916 dev_ll = dev_ll->next;
/* Copy the learned source MAC into the device record. */
919 for (i = 0; i < ETHER_ADDR_LEN; i++)
920 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
922 /* vlan_tag currently uses the device_id. */
923 vdev->vlan_tag = vlan_tags[dev->device_fh];
925 /* Print out VMDQ registration info. */
926 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
928 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
929 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
930 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
933 /* Register the MAC address. */
934 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
936 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
939 /* Enable stripping of the vlan tag as we handle routing. */
940 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
942 /* Set device as ready for RX. */
943 vdev->ready = DEVICE_RX;
949 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
950 * queue before disabling RX on the device.
953 unlink_vmdq(struct vhost_dev *vdev)
957 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
959 if (vdev->ready == DEVICE_RX) {
960 /*clear MAC and VLAN settings*/
961 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
962 for (i = 0; i < 6; i++)
963 vdev->mac_address.addr_bytes[i] = 0;
/* Drain and free anything still sitting in the device's VMDQ RX queue. */
967 /*Clear out the receive buffers*/
968 rx_count = rte_eth_rx_burst(ports[0],
969 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
972 for (i = 0; i < rx_count; i++)
973 rte_pktmbuf_free(pkts_burst[i]);
975 rx_count = rte_eth_rx_burst(ports[0],
976 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
/* Back to MAC-learning state: next TX from the guest re-registers it. */
979 vdev->ready = DEVICE_MAC_LEARNING;
984 * Check if the packet destination MAC address is for a local device. If so then put
985 * the packet on that devices RX queue. If not then return.
987 static inline int __attribute__((always_inline))
988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
990 struct virtio_net_data_ll *dev_ll;
991 struct ether_hdr *pkt_hdr;
993 struct virtio_net *dev = vdev->dev;
struct virtio_net *tdev; /* destination virtio device */
996 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
998 /*get the used devices list*/
999 dev_ll = ll_root_used;
/* Scan all ready devices for a destination-MAC match. */
1001 while (dev_ll != NULL) {
1002 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1003 &dev_ll->vdev->mac_address)) {
1005 /* Drop the packet if the TX packet is destined for the TX device. */
1006 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1007 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1011 tdev = dev_ll->vdev->dev;
1014 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1016 if (unlikely(dev_ll->vdev->remove)) {
1017 /*drop the packet if the device is marked for removal*/
1018 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1020 /*send the packet to the local virtio device*/
1021 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
/* Stats are atomic because another core may read them concurrently. */
1024 &dev_statistics[tdev->device_fh].rx_total_atomic,
1027 &dev_statistics[tdev->device_fh].rx_atomic,
1029 dev_statistics[tdev->device_fh].tx_total++;
1030 dev_statistics[tdev->device_fh].tx += ret;
1036 dev_ll = dev_ll->next;
1043 * Check if the destination MAC of a packet is one local VM,
1044 * and get its vlan tag, and offset if it is.
1046 static inline int __attribute__((always_inline))
1047 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1048 uint32_t *offset, uint16_t *vlan_tag)
1050 struct virtio_net_data_ll *dev_ll = ll_root_used;
1051 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
/* Walk the ready-device list looking for a destination-MAC match. */
1053 while (dev_ll != NULL) {
1054 if ((dev_ll->vdev->ready == DEVICE_RX)
1055 && ether_addr_cmp(&(pkt_hdr->d_addr),
1056 &dev_ll->vdev->mac_address)) {
1058 * Drop the packet if the TX packet is
1059 * destined for the TX device.
1061 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1062 LOG_DEBUG(VHOST_DATA,
1063 "(%"PRIu64") TX: Source and destination"
1064 " MAC addresses are the same. Dropping "
1066 dev_ll->vdev->dev->device_fh);
1071 * HW vlan strip will reduce the packet length
1072 * by minus length of vlan tag, so need restore
1073 * the packet length by plus it.
1075 *offset = VLAN_HLEN;
/* The destination's own VMDQ VLAN tag replaces the caller's tag. */
1078 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1080 LOG_DEBUG(VHOST_DATA,
1081 "(%"PRIu64") TX: pkt to local VM device id:"
1082 "(%"PRIu64") vlan tag: %d.\n",
1083 dev->device_fh, dev_ll->vdev->dev->device_fh,
1088 dev_ll = dev_ll->next;
1094 * This function routes the TX packet to the correct interface. This may be a local device
1095 * or the physical port.
1097 static inline void __attribute__((always_inline))
1098 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1100 struct mbuf_table *tx_q;
1101 struct rte_mbuf **m_table;
1102 unsigned len, ret, offset = 0;
1103 const uint16_t lcore_id = rte_lcore_id();
1104 struct virtio_net *dev = vdev->dev;
1106 /*check if destination is local VM*/
1107 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1108 rte_pktmbuf_free(m);
/* Hardware vm2vm: rewrite the VLAN tag so the NIC loops it back. */
1112 if (vm2vm_mode == VM2VM_HARDWARE) {
1113 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
1114 offset > rte_pktmbuf_tailroom(m)) {
1115 rte_pktmbuf_free(m);
1120 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1122 /*Add packet to the port tx queue*/
1123 tx_q = &lcore_tx_queue[lcore_id];
/* Offload VLAN insertion to the NIC; restore length eaten by HW strip. */
1126 m->ol_flags = PKT_TX_VLAN_PKT;
1128 m->data_len += offset;
1129 m->pkt_len += offset;
1131 m->vlan_tci = vlan_tag;
1133 tx_q->m_table[len] = m;
1136 dev_statistics[dev->device_fh].tx_total++;
1137 dev_statistics[dev->device_fh].tx++;
/* Flush the per-core queue once a full burst has accumulated. */
1140 if (unlikely(len == MAX_PKT_BURST)) {
1141 m_table = (struct rte_mbuf **)tx_q->m_table;
1142 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1143 /* Free any buffers not handled by TX and update the port stats. */
1144 if (unlikely(ret < len)) {
1146 rte_pktmbuf_free(m_table[ret]);
1147 } while (++ret < len);
1157 * This function is called by each data core. It handles all RX/TX registered with the
1158 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1159 * with all devices in the main linked list.
/* Data-core main loop for the non-zero-copy path. */
1162 switch_worker(__attribute__((unused)) void *arg)
1164 struct rte_mempool *mbuf_pool = arg;
1165 struct virtio_net *dev = NULL;
1166 struct vhost_dev *vdev = NULL;
1167 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1168 struct virtio_net_data_ll *dev_ll;
1169 struct mbuf_table *tx_q;
1170 volatile struct lcore_ll_info *lcore_ll;
/* TSC ticks between forced TX-queue drains (BURST_TX_DRAIN_US). */
1171 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1172 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1174 const uint16_t lcore_id = rte_lcore_id();
1175 const uint16_t num_cores = (uint16_t)rte_lcore_count();
1176 uint16_t rx_count = 0;
/* NOTE(review): "Procesing" typo lives in the runtime log string; left as-is. */
1180 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1181 lcore_ll = lcore_info[lcore_id].lcore_ll;
1184 tx_q = &lcore_tx_queue[lcore_id];
1185 for (i = 0; i < num_cores; i ++) {
1186 if (lcore_ids[i] == lcore_id) {
1193 cur_tsc = rte_rdtsc();
1195 * TX burst queue drain
1197 diff_tsc = cur_tsc - prev_tsc;
1198 if (unlikely(diff_tsc > drain_tsc)) {
1201 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1203 /* Transmit any packets still queued for the physical port. */
1204 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1205 (struct rte_mbuf **)tx_q->m_table,
1206 (uint16_t)tx_q->len);
/* Free whatever the PMD did not accept. */
1207 if (unlikely(ret < tx_q->len)) {
1209 rte_pktmbuf_free(tx_q->m_table[ret]);
1210 } while (++ret < tx_q->len);
1220 rte_prefetch0(lcore_ll->ll_root_used);
1222 * Inform the configuration core that we have exited the linked list and that no devices are
1223 * in use if requested.
1225 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1226 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
/* Walk every device assigned to this lcore. */
1231 dev_ll = lcore_ll->ll_root_used;
1233 while (dev_ll != NULL) {
1234 /* Get the virtio device for this list entry. */
1235 vdev = dev_ll->vdev;
/* Device scheduled for removal: mark it safe and skip it. */
1238 if (unlikely(vdev->remove)) {
1239 dev_ll = dev_ll->next;
1241 vdev->ready = DEVICE_SAFE_REMOVE;
1244 if (likely(vdev->ready == DEVICE_RX)) {
/* Pull packets destined for this guest from its VMDq queue. */
1246 rx_count = rte_eth_rx_burst(ports[0],
1247 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1251 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss
1252 * Here MAX_PKT_BURST must be less than virtio queue size
1254 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1255 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1256 rte_delay_us(burst_rx_delay_time);
1257 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1261 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
/* Atomic stats: the config core reads these concurrently. */
1264 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1267 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
/* Free mbufs the guest ring could not take. */
1269 while (likely(rx_count)) {
1271 rte_pktmbuf_free(pkts_burst[rx_count]);
1277 if (likely(!vdev->remove)) {
1278 /* Handle guest TX*/
1279 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1280 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1281 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1282 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1284 rte_pktmbuf_free(pkts_burst[tx_count]);
/* Route each dequeued guest packet; device_fh doubles as vlan index. */
1288 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1291 /* Move to the next device in the list. */
1292 dev_ll = dev_ll->next;
1300 * This function gets the number of available ring entries for zero copy rx.
1301 * Only one thread will call this function for a particular virtio device,
1302 * so, it is designed as non-thread-safe function.
1304 static inline uint32_t __attribute__((always_inline))
1305 get_available_ring_num_zcp(struct virtio_net *dev)
1307 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
/* Volatile read snapshots the guest-updated avail index once. */
1310 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
/* uint16_t subtraction handles index wraparound naturally. */
1311 return (uint32_t)(avail_idx - vq->last_used_idx_res);
1315 * This function gets the available ring index for zero copy rx;
1316 * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1317 * Only one thread will call this function for a particular virtio device,
1318 * so, it is designed as non-thread-safe function.
1320 static inline uint32_t __attribute__((always_inline))
1321 get_available_ring_index_zcp(struct virtio_net *dev,
1322 uint16_t *res_base_idx, uint32_t count)
1324 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1327 uint16_t free_entries;
/* Reserve from the last reserved index; report it to the caller. */
1329 *res_base_idx = vq->last_used_idx_res;
1330 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1331 free_entries = (avail_idx - *res_base_idx);
1333 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1335 "res base idx:%d, free entries:%d\n",
1336 dev->device_fh, avail_idx, *res_base_idx,
1340 * If retry is enabled and the queue is full then we wait
1341 * and retry to avoid packet loss.
1343 if (enable_retry && unlikely(count > free_entries)) {
1344 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1345 rte_delay_us(burst_rx_delay_time);
/* Re-read the avail index; the guest may have posted more buffers. */
1346 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1347 free_entries = (avail_idx - *res_base_idx);
1348 if (count <= free_entries)
/* Clamp the request to what is actually available. */
1353 if (unlikely(count > free_entries))
1354 count = free_entries;
1357 if (unlikely(count == 0)) {
1358 LOG_DEBUG(VHOST_DATA,
1359 "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1360 "avail idx: %d, res base idx:%d, free entries:%d\n",
1361 dev->device_fh, avail_idx,
1362 *res_base_idx, free_entries);
/* Commit the reservation (single caller, so no CAS needed). */
1366 vq->last_used_idx_res = *res_base_idx + count;
1372 * This function puts a descriptor back onto the used ring (zero-length entry).
1374 static inline void __attribute__((always_inline))
1375 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1377 uint16_t res_cur_idx = vq->last_used_idx;
/* Fill the used-ring slot; len 0 means no data was written. */
1378 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1379 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
/* Barrier: the ring entry must be visible before publishing used->idx. */
1380 rte_compiler_barrier();
1381 *(volatile uint16_t *)&vq->used->idx += 1;
1382 vq->last_used_idx += 1;
1384 /* Kick the guest if necessary. */
1385 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1386 eventfd_write((int)vq->kickfd, 1);
1390 * This function gets an available descriptor from the virtio vring and an
1391 * unattached mbuf from vpool->ring, and then attaches them together. It must
1392 * adjust the offset for buff_addr and phys_addr according to the PMD
1393 * implementation, otherwise the frame data may land at the wrong mbuf offset.
1395 static inline void __attribute__((always_inline))
1396 attach_rxmbuf_zcp(struct virtio_net *dev)
1398 uint16_t res_base_idx, desc_idx;
1399 uint64_t buff_addr, phys_addr;
1400 struct vhost_virtqueue *vq;
1401 struct vring_desc *desc;
1402 struct rte_mbuf *mbuf = NULL;
1403 struct vpool *vpool;
1405 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
/* One vpool per VMDq RX queue. */
1407 vpool = &vpool_array[vdev->vmdq_rx_q];
1408 vq = dev->virtqueue[VIRTIO_RXQ];
/* Reserve exactly one descriptor slot from the avail ring. */
1411 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1414 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1416 desc = &vq->desc[desc_idx];
/*
 * Chained descriptors: header and data live in separate buffers, so the
 * data buffer is the next descriptor. Otherwise skip vhost_hlen bytes of
 * virtio-net header inside the single buffer.
 */
1417 if (desc->flags & VRING_DESC_F_NEXT) {
1418 desc = &vq->desc[desc->next];
1419 buff_addr = gpa_to_vva(dev, desc->addr);
1420 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1423 buff_addr = gpa_to_vva(dev,
1424 desc->addr + vq->vhost_hlen);
1425 phys_addr = gpa_to_hpa(vdev,
1426 desc->addr + vq->vhost_hlen,
1427 desc->len, &addr_type);
/* Unusable guest address: recycle the descriptor and bail out. */
1430 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1431 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1432 " address found when attaching RX frame buffer"
1433 " address!\n", dev->device_fh);
1434 put_desc_to_used_list_zcp(vq, desc_idx);
1439 * Check if the frame buffer address from guest crosses
1440 * sub-region or not.
1442 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1443 RTE_LOG(ERR, VHOST_DATA,
1444 "(%"PRIu64") Frame buffer address cross "
1445 "sub-regioin found when attaching RX frame "
1446 "buffer address!\n",
1448 put_desc_to_used_list_zcp(vq, desc_idx);
1451 } while (unlikely(phys_addr == 0));
/* Take an unattached mbuf shell from the queue's vpool ring. */
1453 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1454 if (unlikely(mbuf == NULL)) {
1455 LOG_DEBUG(VHOST_DATA,
1456 "(%"PRIu64") in attach_rxmbuf_zcp: "
1457 "ring_sc_dequeue fail.\n",
1459 put_desc_to_used_list_zcp(vq, desc_idx);
/* Guest buffer too small for the configured mbuf data room: give both back. */
1463 if (unlikely(vpool->buf_size > desc->len)) {
1464 LOG_DEBUG(VHOST_DATA,
1465 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1466 "length(%d) of descriptor idx: %d less than room "
1467 "size required: %d\n",
1468 dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1469 put_desc_to_used_list_zcp(vq, desc_idx);
1470 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
/*
 * Point the mbuf directly at the guest buffer, backing the address off
 * by RTE_PKTMBUF_HEADROOM so data_off lands on the guest data.
 */
1474 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1475 mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1476 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1477 mbuf->data_len = desc->len;
/* Stash the descriptor index in the headroom for later completion. */
1478 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1480 LOG_DEBUG(VHOST_DATA,
1481 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1482 "descriptor idx:%d\n",
1483 dev->device_fh, res_base_idx, desc_idx);
1485 __rte_mbuf_raw_free(mbuf);
1491 * Detach an attached packet mbuf -
1492 * - restore original mbuf address and length values.
1493 * - reset pktmbuf data and data_len to their default values.
1494 * All other fields of the given packet mbuf will be left intact.
1497 * The attached packet mbuf.
1499 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1501 const struct rte_mempool *mp = m->pool;
/* Recover the mbuf's own buffer, which follows the mbuf header in the pool element. */
1502 void *buf = RTE_MBUF_TO_BADDR(m);
1504 uint32_t buf_len = mp->elt_size - sizeof(*m);
1505 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1508 m->buf_len = (uint16_t)buf_len;
/* Keep the default headroom unless the buffer is smaller than it. */
1510 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1511 RTE_PKTMBUF_HEADROOM : m->buf_len;
1512 m->data_off = buf_ofs;
1518 * This function is called after packets have been transmitted. It fetches each
1519 * mbuf from vpool->pool, detaches it and puts it into vpool->ring. It also
1520 * updates the used index and kicks the guest if necessary.
1522 static inline uint32_t __attribute__((always_inline))
1523 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1525 struct rte_mbuf *mbuf;
1526 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1527 uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1529 uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1531 LOG_DEBUG(VHOST_DATA,
1532 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1534 dev->device_fh, mbuf_count);
1535 LOG_DEBUG(VHOST_DATA,
1536 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1538 dev->device_fh, rte_ring_count(vpool->ring));
/*
 * NOTE(review): unlike mbuf_destroy_zcp, the __rte_mbuf_raw_alloc result
 * is not NULL-checked before RTE_MBUF_INDIRECT / the used-ring update —
 * presumably safe because mbuf_count was just read from the pool, but a
 * concurrent consumer would make this dereference NULL. Verify.
 */
1540 for (index = 0; index < mbuf_count; index++) {
1541 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1542 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1543 pktmbuf_detach_zcp(mbuf);
1544 rte_ring_sp_enqueue(vpool->ring, mbuf);
1546 /* Update used index buffer information. */
1547 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1548 vq->used->ring[used_idx].len = 0;
1550 used_idx = (used_idx + 1) & (vq->size - 1);
1553 LOG_DEBUG(VHOST_DATA,
1554 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1556 dev->device_fh, rte_mempool_count(vpool->pool));
1557 LOG_DEBUG(VHOST_DATA,
1558 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1560 dev->device_fh, rte_ring_count(vpool->ring));
1561 LOG_DEBUG(VHOST_DATA,
1562 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1563 "vq->last_used_idx:%d\n",
1564 dev->device_fh, vq->last_used_idx);
1566 vq->last_used_idx += mbuf_count;
1568 LOG_DEBUG(VHOST_DATA,
1569 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1570 "vq->last_used_idx:%d\n",
1571 dev->device_fh, vq->last_used_idx);
/* Publish all used entries at once, after a barrier. */
1573 rte_compiler_barrier();
1575 *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1577 /* Kick guest if required. */
1578 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1579 eventfd_write((int)vq->kickfd, 1);
1585 * This function is called when a virtio device is destroyed.
1586 * It fetches each mbuf from vpool->pool, detaches it, and puts it into vpool->ring.
1588 static void mbuf_destroy_zcp(struct vpool *vpool)
1590 struct rte_mbuf *mbuf = NULL;
1591 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1593 LOG_DEBUG(VHOST_CONFIG,
1594 "in mbuf_destroy_zcp: mbuf count in mempool before "
1595 "mbuf_destroy_zcp is: %d\n",
1597 LOG_DEBUG(VHOST_CONFIG,
1598 "in mbuf_destroy_zcp: mbuf count in ring before "
1599 "mbuf_destroy_zcp is : %d\n",
1600 rte_ring_count(vpool->ring));
/* Drain the pool, detaching any guest-attached mbufs back to the ring. */
1602 for (index = 0; index < mbuf_count; index++) {
1603 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1604 if (likely(mbuf != NULL)) {
1605 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1606 pktmbuf_detach_zcp(mbuf);
1607 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1611 LOG_DEBUG(VHOST_CONFIG,
1612 "in mbuf_destroy_zcp: mbuf count in mempool after "
1613 "mbuf_destroy_zcp is: %d\n",
1614 rte_mempool_count(vpool->pool));
1615 LOG_DEBUG(VHOST_CONFIG,
1616 "in mbuf_destroy_zcp: mbuf count in ring after "
1617 "mbuf_destroy_zcp is : %d\n",
1618 rte_ring_count(vpool->ring));
1622 * This function enqueues received packets into the guest RX virtqueue
1623 * (zero-copy path) and updates the used ring and counters.
1624 static inline uint32_t __attribute__((always_inline))
1625 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1628 struct vhost_virtqueue *vq;
1629 struct vring_desc *desc;
1630 struct rte_mbuf *buff;
1631 /* The virtio_hdr is initialised to 0. */
1632 struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1633 = {{0, 0, 0, 0, 0, 0}, 0};
1634 uint64_t buff_hdr_addr = 0;
1635 uint32_t head[MAX_PKT_BURST], packet_len = 0;
1636 uint32_t head_idx, packet_success = 0;
1637 uint16_t res_cur_idx;
1639 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1644 vq = dev->virtqueue[VIRTIO_RXQ];
1645 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1647 res_cur_idx = vq->last_used_idx;
1648 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1649 dev->device_fh, res_cur_idx, res_cur_idx + count);
1651 /* Retrieve all of the head indexes first to avoid caching issues. */
/* Descriptor indexes were stashed in each mbuf's headroom by attach_rxmbuf_zcp. */
1652 for (head_idx = 0; head_idx < count; head_idx++)
1653 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1655 /* Prefetch descriptor index. */
1656 rte_prefetch0(&vq->desc[head[packet_success]]);
1658 while (packet_success != count) {
1659 /* Get descriptor from available ring */
1660 desc = &vq->desc[head[packet_success]];
1662 buff = pkts[packet_success];
1663 LOG_DEBUG(VHOST_DATA,
1664 "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1665 "pkt[%d] descriptor idx: %d\n",
1666 dev->device_fh, packet_success,
1667 MBUF_HEADROOM_UINT32(buff));
1670 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1671 + RTE_PKTMBUF_HEADROOM),
1672 rte_pktmbuf_data_len(buff), 0);
1674 /* Buffer address translation for virtio header. */
1675 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1676 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1679 * If the descriptors are chained the header and data are
1680 * placed in separate buffers.
1682 if (desc->flags & VRING_DESC_F_NEXT) {
1683 desc->len = vq->vhost_hlen;
1684 desc = &vq->desc[desc->next];
1685 desc->len = rte_pktmbuf_data_len(buff);
1687 desc->len = packet_len;
1690 /* Update used ring with desc information */
1691 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1692 = head[packet_success];
1693 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1698 /* A header is required per buffer. */
1699 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1700 (const void *)&virtio_hdr, vq->vhost_hlen);
1702 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1704 if (likely(packet_success < count)) {
1705 /* Prefetch descriptor index. */
1706 rte_prefetch0(&vq->desc[head[packet_success]]);
/* Barrier before publishing the batched used-index update to the guest. */
1710 rte_compiler_barrier();
1712 LOG_DEBUG(VHOST_DATA,
1713 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1714 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1715 dev->device_fh, vq->last_used_idx, vq->used->idx);
1717 *(volatile uint16_t *)&vq->used->idx += count;
1718 vq->last_used_idx += count;
1720 LOG_DEBUG(VHOST_DATA,
1721 "(%"PRIu64") in dev_rx_zcp: after update used idx: "
1722 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1723 dev->device_fh, vq->last_used_idx, vq->used->idx);
1725 /* Kick the guest if necessary. */
1726 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1727 eventfd_write((int)vq->kickfd, 1);
1733 * This function routes the TX packet to the correct interface.
1734 * This may be a local device or the physical port.
1736 static inline void __attribute__((always_inline))
1737 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1738 uint32_t desc_idx, uint8_t need_copy)
1740 struct mbuf_table *tx_q;
1741 struct rte_mbuf **m_table;
1742 struct rte_mbuf *mbuf = NULL;
1743 unsigned len, ret, offset = 0;
1744 struct vpool *vpool;
1745 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1746 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1748 /* Add the packet to the per-VMDq-queue TX queue. */
1749 tx_q = &tx_queue_zcp[vmdq_rx_q];
1752 /* Allocate an mbuf and populate the structure. */
1753 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1754 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1755 if (unlikely(mbuf == NULL)) {
1756 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1757 RTE_LOG(ERR, VHOST_DATA,
1758 "(%"PRIu64") Failed to allocate memory for mbuf.\n",
/* No mbuf available: return the descriptor to the guest unused. */
1760 put_desc_to_used_list_zcp(vq, desc_idx);
1764 if (vm2vm_mode == VM2VM_HARDWARE) {
1765 /* Avoid using a vlan tag from any vm for external pkt, such as
1766 * vlan_tags[dev->device_fh], otherwise it conflicts during pool
1767 * selection: the MAC address identifies it as an external pkt
1768 * which should go to the network, while the vlan tag identifies
1769 * it as a vm2vm pkt that should be forwarded to another vm. The
1770 * hardware cannot resolve such ambiguity, so the pkt would be lost.
1772 vlan_tag = external_pkt_default_vlan_tag;
1773 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1774 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1775 __rte_mbuf_raw_free(mbuf);
/* Mirror the guest mbuf's segment layout into the TX mbuf. */
1780 mbuf->nb_segs = m->nb_segs;
1781 mbuf->next = m->next;
1782 mbuf->data_len = m->data_len + offset;
1783 mbuf->pkt_len = mbuf->data_len;
1784 if (unlikely(need_copy)) {
1785 /* Copy the packet contents to the mbuf. */
1786 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1787 rte_pktmbuf_mtod(m, void *),
/* Zero-copy: point at the guest buffer instead of copying. */
1790 mbuf->data_off = m->data_off;
1791 mbuf->buf_physaddr = m->buf_physaddr;
1792 mbuf->buf_addr = m->buf_addr;
/* Offload VLAN insertion; l2/l3 lengths needed by the NIC for TX offloads. */
1794 mbuf->ol_flags = PKT_TX_VLAN_PKT;
1795 mbuf->vlan_tci = vlan_tag;
1796 mbuf->l2_len = sizeof(struct ether_hdr);
1797 mbuf->l3_len = sizeof(struct ipv4_hdr);
/* Remember the guest descriptor so completion can release it. */
1798 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1800 tx_q->m_table[len] = mbuf;
1803 LOG_DEBUG(VHOST_DATA,
1804 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1807 (mbuf->next == NULL) ? "null" : "non-null");
1810 dev_statistics[dev->device_fh].tx_total++;
1811 dev_statistics[dev->device_fh].tx++;
/* Flush once a full burst has accumulated. */
1814 if (unlikely(len == MAX_PKT_BURST)) {
1815 m_table = (struct rte_mbuf **)tx_q->m_table;
1816 ret = rte_eth_tx_burst(ports[0],
1817 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1820 * Free any buffers not handled by TX and update
1823 if (unlikely(ret < len)) {
1825 rte_pktmbuf_free(m_table[ret]);
1826 } while (++ret < len);
/* Reclaim completed TX mbufs and complete guest descriptors. */
1830 txmbuf_clean_zcp(dev, vpool);
1839 * This function TX all available packets in virtio TX queue for one
1840 * virtio-net device. If it is first packet, it learns MAC address and
1843 static inline void __attribute__((always_inline))
1844 virtio_dev_tx_zcp(struct virtio_net *dev)
1847 struct vhost_virtqueue *vq;
1848 struct vring_desc *desc;
1849 uint64_t buff_addr = 0, phys_addr;
1850 uint32_t head[MAX_PKT_BURST];
1852 uint16_t free_entries, packet_success = 0;
1854 uint8_t need_copy = 0;
1856 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1858 vq = dev->virtqueue[VIRTIO_TXQ];
1859 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1861 /* If there are no available buffers then return. */
1862 if (vq->last_used_idx_res == avail_idx)
1865 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1867 /* Prefetch available ring to retrieve head indexes. */
1868 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1870 /* Get the number of free entries in the ring */
1871 free_entries = (avail_idx - vq->last_used_idx_res);
1873 /* Limit to MAX_PKT_BURST. */
1875 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1877 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1878 dev->device_fh, free_entries);
1880 /* Retrieve all of the head indexes first to avoid caching issues. */
1881 for (i = 0; i < free_entries; i++)
1883 = vq->avail->ring[(vq->last_used_idx_res + i)
/* Reserve the whole batch up front (single consumer). */
1886 vq->last_used_idx_res += free_entries;
1888 /* Prefetch descriptor index. */
1889 rte_prefetch0(&vq->desc[head[packet_success]]);
1890 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1892 while (packet_success < free_entries) {
1893 desc = &vq->desc[head[packet_success]];
1895 /* Discard first buffer as it is the virtio header */
1896 desc = &vq->desc[desc->next];
1898 /* Buffer address translation. */
1899 buff_addr = gpa_to_vva(dev, desc->addr);
1900 /* Need check extra VLAN_HLEN size for inserting VLAN tag */
1901 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1904 if (likely(packet_success < (free_entries - 1)))
1905 /* Prefetch descriptor index. */
1906 rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1908 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1909 RTE_LOG(ERR, VHOST_DATA,
1910 "(%"PRIu64") Invalid frame buffer address found"
1911 "when TX packets!\n",
1917 /* Prefetch buffer address. */
1918 rte_prefetch0((void *)(uintptr_t)buff_addr);
1921 * Setup dummy mbuf. This is copied to a real mbuf if
1922 * transmitted out the physical port.
1924 m.data_len = desc->len;
1928 m.buf_addr = (void *)(uintptr_t)buff_addr;
1929 m.buf_physaddr = phys_addr;
1932 * Check if the frame buffer address from guest crosses
1933 * sub-region or not.
1935 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1936 RTE_LOG(ERR, VHOST_DATA,
1937 "(%"PRIu64") Frame buffer address cross "
1938 "sub-regioin found when attaching TX frame "
1939 "buffer address!\n",
1945 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1948 * If this is the first received packet we need to learn
1949 * the MAC and setup VMDQ
1951 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1952 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1954 * Discard frame if device is scheduled for
1955 * removal or a duplicate MAC address is found.
1957 packet_success += free_entries;
1958 vq->last_used_idx += packet_success;
/* Route the packet out; the descriptor index rides along for completion. */
1963 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1969 * This function is called by each data core. It handles all RX/TX registered
1970 * with the core. For TX the specific lcore linked list is used. For RX, MAC
1971 * addresses are compared with all devices in the main linked list.
/* Data-core main loop for the zero-copy path. */
1974 switch_worker_zcp(__attribute__((unused)) void *arg)
1976 struct virtio_net *dev = NULL;
1977 struct vhost_dev *vdev = NULL;
1978 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1979 struct virtio_net_data_ll *dev_ll;
1980 struct mbuf_table *tx_q;
1981 volatile struct lcore_ll_info *lcore_ll;
/* TSC ticks between forced TX-queue drains (BURST_TX_DRAIN_US). */
1982 const uint64_t drain_tsc
1983 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1984 * BURST_TX_DRAIN_US;
1985 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1987 const uint16_t lcore_id = rte_lcore_id();
1988 uint16_t count_in_ring, rx_count = 0;
1990 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1992 lcore_ll = lcore_info[lcore_id].lcore_ll;
1996 cur_tsc = rte_rdtsc();
1998 /* TX burst queue drain */
1999 diff_tsc = cur_tsc - prev_tsc;
2000 if (unlikely(diff_tsc > drain_tsc)) {
2002 * Get mbuf from vpool.pool and detach mbuf and
2003 * put back into vpool.ring.
2005 dev_ll = lcore_ll->ll_root_used;
2006 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2007 /* Get virtio device ID */
2008 vdev = dev_ll->vdev;
2011 if (likely(!vdev->remove)) {
2012 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2014 LOG_DEBUG(VHOST_DATA,
2015 "TX queue drained after timeout"
2016 " with burst size %u\n",
2020 * Tx any packets in the queue
2022 ret = rte_eth_tx_burst(
2024 (uint16_t)tx_q->txq_id,
2025 (struct rte_mbuf **)
2027 (uint16_t)tx_q->len);
/* Free anything the PMD did not accept. */
2028 if (unlikely(ret < tx_q->len)) {
2031 tx_q->m_table[ret]);
2032 } while (++ret < tx_q->len);
/* Reclaim completed TX mbufs; TX vpools sit at MAX_QUEUES offset. */
2036 txmbuf_clean_zcp(dev,
2037 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2040 dev_ll = dev_ll->next;
2045 rte_prefetch0(lcore_ll->ll_root_used);
2048 * Inform the configuration core that we have exited the linked
2049 * list and that no devices are in use if requested.
2051 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2052 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2054 /* Process devices */
2055 dev_ll = lcore_ll->ll_root_used;
2057 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2058 vdev = dev_ll->vdev;
2060 if (unlikely(vdev->remove)) {
2061 dev_ll = dev_ll->next;
2063 vdev->ready = DEVICE_SAFE_REMOVE;
2067 if (likely(vdev->ready == DEVICE_RX)) {
2068 uint32_t index = vdev->vmdq_rx_q;
2071 = rte_ring_count(vpool_array[index].ring);
2072 uint16_t free_entries
2073 = (uint16_t)get_available_ring_num_zcp(dev);
2076 * Attach all mbufs in vpool.ring and put back
/* Pre-attach guest buffers, bounded by ring content, free entries, and burst size. */
2080 i < RTE_MIN(free_entries,
2081 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2083 attach_rxmbuf_zcp(dev);
2085 /* Handle guest RX */
2086 rx_count = rte_eth_rx_burst(ports[0],
2087 vdev->vmdq_rx_q, pkts_burst,
2091 ret_count = virtio_dev_rx_zcp(dev,
2092 pkts_burst, rx_count);
2094 dev_statistics[dev->device_fh].rx_total
2096 dev_statistics[dev->device_fh].rx
/* Return unused pre-attached mbufs to the vpool ring. */
2099 while (likely(rx_count)) {
2102 pkts_burst[rx_count]);
2103 rte_ring_sp_enqueue(
2104 vpool_array[index].ring,
2105 (void *)pkts_burst[rx_count]);
2110 if (likely(!vdev->remove))
2111 /* Handle guest TX */
2112 virtio_dev_tx_zcp(dev);
2114 /* Move to the next device in the list */
2115 dev_ll = dev_ll->next;
2124 * Add an entry to a used linked list. A free entry must first be found
2125 * in the free linked list using get_data_ll_free_entry();
2128 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2129 struct virtio_net_data_ll *ll_dev)
2131 struct virtio_net_data_ll *ll = *ll_root_addr;
2133 /* Set next as NULL and use a compiler barrier to avoid reordering. */
2134 ll_dev->next = NULL;
2135 rte_compiler_barrier();
2137 /* If ll == NULL then this is the first device. */
2139 /* Increment to the tail of the linked list. */
2140 while ((ll->next != NULL) )
/* Empty list: the new entry becomes the root. */
2145 *ll_root_addr = ll_dev;
2150 * Remove an entry from a used linked list. The entry must then be added to
2151 * the free linked list using put_data_ll_free_entry().
2154 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2155 struct virtio_net_data_ll *ll_dev,
2156 struct virtio_net_data_ll *ll_dev_last)
2158 struct virtio_net_data_ll *ll = *ll_root_addr;
2160 if (unlikely((ll == NULL) || (ll_dev == NULL)))
/* Removing the head: advance the root pointer. */
2164 *ll_root_addr = ll_dev->next;
/* Otherwise unlink by bypassing ll_dev from its predecessor. */
2166 if (likely(ll_dev_last != NULL))
2167 ll_dev_last->next = ll_dev->next;
2169 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2173 * Find and return an entry from the free linked list.
2175 static struct virtio_net_data_ll *
2176 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2178 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2179 struct virtio_net_data_ll *ll_dev;
2181 if (ll_free == NULL)
/* Pop the head of the free list. */
2185 *ll_root_addr = ll_free->next;
2191 * Place an entry back on to the free linked list.
2194 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2195 struct virtio_net_data_ll *ll_dev)
2197 struct virtio_net_data_ll *ll_free = *ll_root_addr;
/* Push onto the head of the free list. */
2202 ll_dev->next = ll_free;
2203 *ll_root_addr = ll_dev;
2207 * Creates a linked list of a given size.
2209 static struct virtio_net_data_ll *
2210 alloc_data_ll(uint32_t size)
2212 struct virtio_net_data_ll *ll_new;
2215 /* Malloc and then chain the linked list. */
2216 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2217 if (ll_new == NULL) {
2218 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
/* Chain each node to the next; the last node terminates the list. */
2222 for (i = 0; i < size - 1; i++) {
2223 ll_new[i].vdev = NULL;
2224 ll_new[i].next = &ll_new[i+1];
2226 ll_new[i].next = NULL;
2232 * Create the main linked list along with each individual core's linked list.
2233 * A used and a free list are created to manage entries.
/* NOTE(review): the enclosing function's signature is elided from this excerpt. */
2240 RTE_LCORE_FOREACH_SLAVE(lcore) {
2241 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2242 if (lcore_info[lcore].lcore_ll == NULL) {
2243 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2247 lcore_info[lcore].lcore_ll->device_num = 0;
2248 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2249 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
/* Distribute devices across switching cores, rounding up on remainder. */
2250 if (num_devices % num_switching_cores)
2251 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2253 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2256 /* Allocate devices up to a maximum of MAX_DEVICES. */
2257 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2263 * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2264 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2265 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2268 destroy_device (volatile struct virtio_net *dev)
2270 struct virtio_net_data_ll *ll_lcore_dev_cur;
2271 struct virtio_net_data_ll *ll_main_dev_cur;
2272 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2273 struct virtio_net_data_ll *ll_main_dev_last = NULL;
2274 struct vhost_dev *vdev;
2277 dev->flags &= ~VIRTIO_DEV_RUNNING;
2279 vdev = (struct vhost_dev *)dev->priv;
2280 /* Set the remove flag, then wait for the data core to acknowledge. */
2282 while(vdev->ready != DEVICE_SAFE_REMOVE) {
2286 /* Search for entry to be removed from lcore ll */
2287 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2288 while (ll_lcore_dev_cur != NULL) {
2289 if (ll_lcore_dev_cur->vdev == vdev) {
2292 ll_lcore_dev_last = ll_lcore_dev_cur;
2293 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2297 if (ll_lcore_dev_cur == NULL) {
2298 RTE_LOG(ERR, VHOST_CONFIG,
2299 "(%"PRIu64") Failed to find the dev to be destroy.\n",
2304 /* Search for entry to be removed from main ll */
2305 ll_main_dev_cur = ll_root_used;
2306 ll_main_dev_last = NULL;
2307 while (ll_main_dev_cur != NULL) {
2308 if (ll_main_dev_cur->vdev == vdev) {
2311 ll_main_dev_last = ll_main_dev_cur;
2312 ll_main_dev_cur = ll_main_dev_cur->next;
2316 /* Remove entries from the lcore and main ll. */
2317 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2318 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2320 /* Set the dev_removal_flag on each lcore. */
2321 RTE_LCORE_FOREACH_SLAVE(lcore) {
2322 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2326 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2327 * they can no longer access the device removed from the linked lists and that the devices
2328 * are no longer in use.
2330 RTE_LCORE_FOREACH_SLAVE(lcore) {
2331 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2336 /* Add the entries back to the lcore and main free ll.*/
2337 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2338 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2340 /* Decrement number of device on the lcore. */
2341 lcore_info[vdev->coreid].lcore_ll->device_num--;
2343 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
/* Zero-copy teardown: stop queues and return attached mbufs to the vpool rings. */
2346 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2348 /* Stop the RX queue. */
2349 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2350 LOG_DEBUG(VHOST_CONFIG,
2351 "(%"PRIu64") In destroy_device: Failed to stop "
2357 LOG_DEBUG(VHOST_CONFIG,
2358 "(%"PRIu64") in destroy_device: Start put mbuf in "
2359 "mempool back to ring for RX queue: %d\n",
2360 dev->device_fh, vdev->vmdq_rx_q);
2362 mbuf_destroy_zcp(vpool);
2364 /* Stop the TX queue. */
2365 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2366 LOG_DEBUG(VHOST_CONFIG,
2367 "(%"PRIu64") In destroy_device: Failed to "
2368 "stop tx queue:%d\n",
2369 dev->device_fh, vdev->vmdq_rx_q);
/* TX vpools live at MAX_QUEUES offset from the RX vpools. */
2372 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2374 LOG_DEBUG(VHOST_CONFIG,
2375 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2376 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2377 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2380 mbuf_destroy_zcp(vpool);
2381 rte_free(vdev->regions_hpa);
2388 * Calculate the number of physically contiguous sub-regions within one
2389 * particular region whose vhost virtual address range is contiguous. The
2390 * region starts at vva_start, with a length of 'size' bytes.
 *
 * NOTE(review): this excerpt is missing several original lines (the return
 * type, some closing braces and the final return of the region count), so
 * the comments below annotate only what is visible.
2393 check_hpa_regions(uint64_t vva_start, uint64_t size)
 /* Host-physical continuity is checked at page granularity. */
2395 uint32_t i, nregions = 0, page_size = getpagesize();
2396 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
 /* Warn if the start address is not page aligned: the page walk below
  * steps in whole pages from vva_start. */
2397 if (vva_start % page_size) {
2398 LOG_DEBUG(VHOST_CONFIG,
2399 "in check_countinous: vva start(%p) mod page_size(%d) "
2401 (void *)(uintptr_t)vva_start, page_size);
 /* Likewise warn if the size is not a whole number of pages. */
2404 if (size % page_size) {
2405 LOG_DEBUG(VHOST_CONFIG,
2406 "in check_countinous: "
2407 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
 /* Walk the region one page at a time, translating the current page and
  * the next page to physical addresses via rte_mem_virt2phy(). */
2411 for (i = 0; i < size - page_size; i = i + page_size) {
2413 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2414 next_phys_addr = rte_mem_virt2phy(
2415 (void *)(uintptr_t)(vva_start + i + page_size));
 /* A gap between the physical addresses of two consecutive pages
  * marks the start of a new physically contiguous sub-region
  * (presumably counted via nregions on a line missing from this
  * excerpt — TODO confirm against the full source). */
2416 if ((cur_phys_addr + page_size) != next_phys_addr) {
2418 LOG_DEBUG(VHOST_CONFIG,
2419 "in check_continuous: hva addr:(%p) is not "
2420 "continuous with hva addr:(%p), diff:%d\n",
2421 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2422 (void *)(uintptr_t)(vva_start + (uint64_t)i
2423 + page_size), page_size);
2424 LOG_DEBUG(VHOST_CONFIG,
2425 "in check_continuous: hpa addr:(%p) is not "
2426 "continuous with hpa addr:(%p), "
2427 "diff:(%"PRIu64")\n",
2428 (void *)(uintptr_t)cur_phys_addr,
2429 (void *)(uintptr_t)next_phys_addr,
2430 (next_phys_addr-cur_phys_addr));
2437 * Divide each region whose vhost virtual address range is contiguous into
2438 * sub-regions such that the host physical addresses within each sub-region
2439 * are contiguous, and fill the offset (relative to the GPA), size and other
2440 * information of each sub-region into regions_hpa.
 *
 * Returns the number of sub-regions written (regionidx_hpa).
 *
 * NOTE(review): this excerpt is missing several original lines (opening
 * braces, early-return value, parts of the inner for-statement, some size
 * arithmetic), so the comments below annotate only what is visible.
2443 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2445 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2446 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
 /* Nothing to fill without an output array. */
2448 if (mem_region_hpa == NULL)
 /* Process each virtually-contiguous guest memory region in turn. */
2451 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
 /* Host virtual start = guest physical address + mapping offset. */
2452 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2453 virtio_memory->regions[regionidx].address_offset;
 /* Start a new output sub-region at the region's guest physical base;
  * its host_phys_addr_offset converts GPA -> host physical address. */
2454 mem_region_hpa[regionidx_hpa].guest_phys_address
2455 = virtio_memory->regions[regionidx].guest_phys_address;
2456 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2457 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2458 mem_region_hpa[regionidx_hpa].guest_phys_address;
2459 LOG_DEBUG(VHOST_CONFIG,
2460 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2463 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2464 LOG_DEBUG(VHOST_CONFIG,
2465 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2468 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
 /* Walk the region page by page, comparing each page's physical
  * address with that of the following page. */
2470 i < virtio_memory->regions[regionidx].memory_size -
2473 cur_phys_addr = rte_mem_virt2phy(
2474 (void *)(uintptr_t)(vva_start + i));
2475 next_phys_addr = rte_mem_virt2phy(
2476 (void *)(uintptr_t)(vva_start +
 /* Physical discontinuity: close the current sub-region ... */
2478 if ((cur_phys_addr + page_size) != next_phys_addr) {
2479 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2480 mem_region_hpa[regionidx_hpa].guest_phys_address +
2482 mem_region_hpa[regionidx_hpa].memory_size
2484 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2485 "phys addr end [%d]:(%p)\n",
2488 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2489 LOG_DEBUG(VHOST_CONFIG,
2490 "in fill_hpa_regions: guest phys addr "
2494 (mem_region_hpa[regionidx_hpa].memory_size));
 /* ... and open the next sub-region immediately after it,
  * recomputing the GPA -> HPA offset for the new start. */
2495 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2496 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2498 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2500 mem_region_hpa[regionidx_hpa].guest_phys_address;
2501 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2502 " phys addr start[%d]:(%p)\n",
2505 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2506 LOG_DEBUG(VHOST_CONFIG,
2507 "in fill_hpa_regions: host phys addr "
2511 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
 /* After the page walk: close the final sub-region of this guest
  * region (k presumably holds the accumulated size — TODO confirm,
  * its assignment is on a line missing from this excerpt). */
2517 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2518 = mem_region_hpa[regionidx_hpa].guest_phys_address
2520 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2521 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2522 "[%d]:(%p)\n", regionidx_hpa,
2524 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2525 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2526 "[%d]:(%p)\n", regionidx_hpa,
2528 (mem_region_hpa[regionidx_hpa].memory_size));
 /* Total number of physically contiguous sub-regions produced. */
2531 return regionidx_hpa;
2535 * A new device is added to a data core. First the device is added to the main linked list
2536 * and then allocated to a specific data core.
 *
 * Registered as the vhost library's new_device callback (see
 * virtio_net_device_ops below). Returns 0 on success / negative on failure
 * per the callback contract — NOTE(review): the actual return statements are
 * on lines missing from this excerpt; comments annotate only what is visible.
2539 new_device (struct virtio_net *dev)
2541 struct virtio_net_data_ll *ll_dev;
2542 int lcore, core_add = 0;
2543 uint32_t device_num_min = num_devices;
2544 struct vhost_dev *vdev;
 /* Per-device bookkeeping structure, zero-initialized and cache aligned. */
2547 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2549 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
 /* Upper bound on sub-region count: at least one per guest memory
  * region, plus one extra per physical discontinuity found below. */
2557 vdev->nregions_hpa = dev->mem->nregions;
2558 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2560 += check_hpa_regions(
2561 dev->mem->regions[regionidx].guest_phys_address
2562 + dev->mem->regions[regionidx].address_offset,
2563 dev->mem->regions[regionidx].memory_size);
 /* Allocate the HPA sub-region table sized from the count above. */
2567 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2568 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2570 if (vdev->regions_hpa == NULL) {
2571 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
 /* The fill pass must produce exactly as many sub-regions as the
  * counting pass predicted; a mismatch means the mapping changed. */
2577 if (fill_hpa_memory_regions(
2578 vdev->regions_hpa, dev->mem
2579 ) != vdev->nregions_hpa) {
2581 RTE_LOG(ERR, VHOST_CONFIG,
2582 "hpa memory regions number mismatch: "
2583 "[%d]\n", vdev->nregions_hpa);
2584 rte_free(vdev->regions_hpa);
2591 /* Add device to main ll */
2592 ll_dev = get_data_ll_free_entry(&ll_root_free);
2593 if (ll_dev == NULL) {
2594 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2595 "of %d devices per core has been reached\n",
2596 dev->device_fh, num_devices);
2597 if (vdev->regions_hpa)
2598 rte_free(vdev->regions_hpa);
2602 ll_dev->vdev = vdev;
2603 add_data_ll_entry(&ll_root_used, ll_dev);
 /* Assign this device its own VMDq RX queue, spread evenly by device_fh
  * (LHS of this assignment is on a line missing from this excerpt). */
2605 = dev->device_fh * (num_queues / num_devices);
 /* Zero-copy setup: attach the pre-allocated mbufs of this queue's vpool
  * ring back into the mempool before starting the queues. */
2608 uint32_t index = vdev->vmdq_rx_q;
2609 uint32_t count_in_ring, i;
2610 struct mbuf_table *tx_q;
2612 count_in_ring = rte_ring_count(vpool_array[index].ring);
2614 LOG_DEBUG(VHOST_CONFIG,
2615 "(%"PRIu64") in new_device: mbuf count in mempool "
2616 "before attach is: %d\n",
2618 rte_mempool_count(vpool_array[index].pool));
2619 LOG_DEBUG(VHOST_CONFIG,
2620 "(%"PRIu64") in new_device: mbuf count in ring "
2621 "before attach is : %d\n",
2622 dev->device_fh, count_in_ring);
 /* Attach all mbufs in vpool.ring and put them back into vpool.pool. */
2627 for (i = 0; i < count_in_ring; i++)
2628 attach_rxmbuf_zcp(dev);
2630 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2631 "mempool after attach is: %d\n",
2633 rte_mempool_count(vpool_array[index].pool));
2634 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2635 "ring after attach is : %d\n",
2637 rte_ring_count(vpool_array[index].ring));
 /* The zero-copy TX queue mirrors the device's VMDq RX queue index. */
2639 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2640 tx_q->txq_id = vdev->vmdq_rx_q;
 /* Start the deferred TX queue; on failure drain the vpool and
  * release the HPA table before bailing out. */
2642 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2643 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2645 LOG_DEBUG(VHOST_CONFIG,
2646 "(%"PRIu64") In new_device: Failed to start "
2648 dev->device_fh, vdev->vmdq_rx_q);
2650 mbuf_destroy_zcp(vpool);
2651 rte_free(vdev->regions_hpa);
 /* Start the deferred RX queue; on failure also stop the TX queue
  * that was just started, then clean up as above. */
2656 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2657 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2659 LOG_DEBUG(VHOST_CONFIG,
2660 "(%"PRIu64") In new_device: Failed to start "
2662 dev->device_fh, vdev->vmdq_rx_q);
2664 /* Stop the TX queue. */
2665 if (rte_eth_dev_tx_queue_stop(ports[0],
2666 vdev->vmdq_rx_q) != 0) {
2667 LOG_DEBUG(VHOST_CONFIG,
2668 "(%"PRIu64") In new_device: Failed to "
2669 "stop tx queue:%d\n",
2670 dev->device_fh, vdev->vmdq_rx_q);
2673 mbuf_destroy_zcp(vpool);
2674 rte_free(vdev->regions_hpa);
2681 /*reset ready flag: device must re-learn its MAC before switching*/
2682 vdev->ready = DEVICE_MAC_LEARNING;
2685 /* Find a suitable lcore to add the device: pick the slave lcore
2686 currently serving the fewest devices (simple load balancing). */
2686 RTE_LCORE_FOREACH_SLAVE(lcore) {
2687 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2688 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2692 /* Add device to lcore ll */
2693 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2694 if (ll_dev == NULL) {
2695 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2696 vdev->ready = DEVICE_SAFE_REMOVE;
2697 destroy_device(dev);
2698 if (vdev->regions_hpa)
2699 rte_free(vdev->regions_hpa);
2703 ll_dev->vdev = vdev;
2704 vdev->coreid = core_add;
2706 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2708 /* Initialize device stats */
2709 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2711 /* Disable notifications on both virtqueues: the data cores poll. */
2712 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2713 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2714 lcore_info[vdev->coreid].lcore_ll->device_num++;
 /* Mark the device live so the data cores start servicing it. */
2715 dev->flags |= VIRTIO_DEV_RUNNING;
2717 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2723 * These callbacks allow devices to be added to the data core when configuration
2724 * has fully completed.
 /* Handed to the vhost library via rte_vhost_driver_callback_register()
  * below; invoked on guest device connect/disconnect. */
2726 static const struct virtio_net_device_ops virtio_net_device_ops =
2728 .new_device = new_device,
2729 .destroy_device = destroy_device,
2733 * This is a thread that wakes up periodically to print statistics if the user has
 * enabled them. It walks the main device linked list and prints per-device
 * TX/RX totals, successes and drops.
 *
 * NOTE(review): the function signature and some lines (sleep loop header,
 * printf arguments, closing braces) are missing from this excerpt; the
 * comments below annotate only what is visible.
2739 struct virtio_net_data_ll *dev_ll;
2740 uint64_t tx_dropped, rx_dropped;
2741 uint64_t tx, tx_total, rx, rx_total;
 /* ANSI escape sequences: clear screen and move cursor to top-left. */
2743 const char clr[] = { 27, '[', '2', 'J', '\0' };
2744 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
 /* enable_stats doubles as the refresh interval in seconds. */
2747 sleep(enable_stats);
2749 /* Clear screen and move to top left */
2750 printf("%s%s", clr, top_left);
2752 printf("\nDevice statistics ====================================");
 /* Walk every active device on the main linked list. */
2754 dev_ll = ll_root_used;
2755 while (dev_ll != NULL) {
2756 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2757 tx_total = dev_statistics[device_fh].tx_total;
2758 tx = dev_statistics[device_fh].tx;
2759 tx_dropped = tx_total - tx;
 /* Non-zero-copy RX counters are updated from multiple cores, so
  * they are read atomically; zero-copy uses plain counters. */
2760 if (zero_copy == 0) {
2761 rx_total = rte_atomic64_read(
2762 &dev_statistics[device_fh].rx_total_atomic);
2763 rx = rte_atomic64_read(
2764 &dev_statistics[device_fh].rx_atomic);
2766 rx_total = dev_statistics[device_fh].rx_total;
2767 rx = dev_statistics[device_fh].rx;
2769 rx_dropped = rx_total - rx;
2771 printf("\nStatistics for device %"PRIu32" ------------------------------"
2772 "\nTX total: %"PRIu64""
2773 "\nTX dropped: %"PRIu64""
2774 "\nTX successful: %"PRIu64""
2775 "\nRX total: %"PRIu64""
2776 "\nRX dropped: %"PRIu64""
2777 "\nRX successful: %"PRIu64"",
2786 dev_ll = dev_ll->next;
2788 printf("\n======================================================\n");
 /*
  * Create the mbuf mempool and companion ring for one entry of the
  * zero-copy vpool_array (slot 'index'), on NUMA node 'socket', holding
  * nb_mbuf buffers. Exits the application on any allocation failure.
  *
  * NOTE(review): the return type line and some closing braces are missing
  * from this excerpt; comments annotate only what is visible.
  */
2793 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2794 char *ring_name, uint32_t nb_mbuf)
 /* Data room must fit one virtio descriptor's payload plus headroom. */
2796 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2797 vpool_array[index].pool
2798 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2799 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2800 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2801 rte_pktmbuf_init, NULL, socket, 0);
2802 if (vpool_array[index].pool != NULL) {
 /* Single-producer/single-consumer ring sized to the next power of
  * two above nb_mbuf (rte_ring capacity must be a power of two). */
2803 vpool_array[index].ring
2804 = rte_ring_create(ring_name,
2805 rte_align32pow2(nb_mbuf + 1),
2806 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2807 if (likely(vpool_array[index].ring != NULL)) {
2808 LOG_DEBUG(VHOST_CONFIG,
2809 "in setup_mempool_tbl: mbuf count in "
2811 rte_mempool_count(vpool_array[index].pool));
2812 LOG_DEBUG(VHOST_CONFIG,
2813 "in setup_mempool_tbl: mbuf count in "
2815 rte_ring_count(vpool_array[index].ring));
2817 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2821 /* Need to account for head room: usable payload excludes it. */
2822 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2824 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2830 * Main function, does initialisation and calls the per-lcore functions. The CUSE
2831 * device is also registered here to handle the IOCTLs.
2834 MAIN(int argc, char *argv[])
2836 struct rte_mempool *mbuf_pool = NULL;
2837 unsigned lcore_id, core_id = 0;
2838 unsigned nb_ports, valid_num_ports;
2840 uint8_t portid, queue_id = 0;
2841 static pthread_t tid;
2844 ret = rte_eal_init(argc, argv);
2846 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2850 /* parse app arguments */
2851 ret = us_vhost_parse_args(argc, argv);
2853 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2855 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2856 if (rte_lcore_is_enabled(lcore_id))
2857 lcore_ids[core_id ++] = lcore_id;
2859 if (rte_lcore_count() > RTE_MAX_LCORE)
2860 rte_exit(EXIT_FAILURE,"Not enough cores\n");
2862 /*set the number of switching cores available*/
2863 num_switching_cores = rte_lcore_count()-1;
2865 /* Get the number of physical ports. */
2866 nb_ports = rte_eth_dev_count();
2867 if (nb_ports > RTE_MAX_ETHPORTS)
2868 nb_ports = RTE_MAX_ETHPORTS;
2871 * Update the global var NUM_PORTS and global array PORTS
2872 * and get value of var VALID_NUM_PORTS according to system ports number
2874 valid_num_ports = check_ports_num(nb_ports);
2876 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2877 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2878 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
2882 if (zero_copy == 0) {
2883 /* Create the mbuf pool. */
2884 mbuf_pool = rte_mempool_create(
2888 MBUF_SIZE, MBUF_CACHE_SIZE,
2889 sizeof(struct rte_pktmbuf_pool_private),
2890 rte_pktmbuf_pool_init, NULL,
2891 rte_pktmbuf_init, NULL,
2892 rte_socket_id(), 0);
2893 if (mbuf_pool == NULL)
2894 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2896 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2897 vpool_array[queue_id].pool = mbuf_pool;
2899 if (vm2vm_mode == VM2VM_HARDWARE) {
2900 /* Enable VT loop back to let L2 switch to do it. */
2901 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2902 LOG_DEBUG(VHOST_CONFIG,
2903 "Enable loop back for L2 switch in vmdq.\n");
2907 char pool_name[RTE_MEMPOOL_NAMESIZE];
2908 char ring_name[RTE_MEMPOOL_NAMESIZE];
2911 * Zero copy defers queue RX/TX start to the time when guest
2912 * finishes its startup and packet buffers from that guest are
2915 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2916 rx_conf_default.rx_drop_en = 0;
2917 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2918 nb_mbuf = num_rx_descriptor
2919 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2920 + num_switching_cores * MAX_PKT_BURST;
2922 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2923 snprintf(pool_name, sizeof(pool_name),
2924 "rxmbuf_pool_%u", queue_id);
2925 snprintf(ring_name, sizeof(ring_name),
2926 "rxmbuf_ring_%u", queue_id);
2927 setup_mempool_tbl(rte_socket_id(), queue_id,
2928 pool_name, ring_name, nb_mbuf);
2931 nb_mbuf = num_tx_descriptor
2932 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2933 + num_switching_cores * MAX_PKT_BURST;
2935 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2936 snprintf(pool_name, sizeof(pool_name),
2937 "txmbuf_pool_%u", queue_id);
2938 snprintf(ring_name, sizeof(ring_name),
2939 "txmbuf_ring_%u", queue_id);
2940 setup_mempool_tbl(rte_socket_id(),
2941 (queue_id + MAX_QUEUES),
2942 pool_name, ring_name, nb_mbuf);
2945 if (vm2vm_mode == VM2VM_HARDWARE) {
2946 /* Enable VT loop back to let L2 switch to do it. */
2947 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2948 LOG_DEBUG(VHOST_CONFIG,
2949 "Enable loop back for L2 switch in vmdq.\n");
2952 /* Set log level. */
2953 rte_set_log_level(LOG_LEVEL);
2955 /* initialize all ports */
2956 for (portid = 0; portid < nb_ports; portid++) {
2957 /* skip ports that are not enabled */
2958 if ((enabled_port_mask & (1 << portid)) == 0) {
2959 RTE_LOG(INFO, VHOST_PORT,
2960 "Skipping disabled port %d\n", portid);
2963 if (port_init(portid) != 0)
2964 rte_exit(EXIT_FAILURE,
2965 "Cannot initialize network ports\n");
2968 /* Initialise all linked lists. */
2969 if (init_data_ll() == -1)
2970 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2972 /* Initialize device stats */
2973 memset(&dev_statistics, 0, sizeof(dev_statistics));
2975 /* Enable stats if the user option is set. */
2977 pthread_create(&tid, NULL, (void*)print_stats, NULL );
2979 /* Launch all data cores. */
2980 if (zero_copy == 0) {
2981 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2982 rte_eal_remote_launch(switch_worker,
2983 mbuf_pool, lcore_id);
2986 uint32_t count_in_mempool, index, i;
2987 for (index = 0; index < 2*MAX_QUEUES; index++) {
2988 /* For all RX and TX queues. */
2990 = rte_mempool_count(vpool_array[index].pool);
2993 * Transfer all un-attached mbufs from vpool.pool
2996 for (i = 0; i < count_in_mempool; i++) {
2997 struct rte_mbuf *mbuf
2998 = __rte_mbuf_raw_alloc(
2999 vpool_array[index].pool);
3000 rte_ring_sp_enqueue(vpool_array[index].ring,
3004 LOG_DEBUG(VHOST_CONFIG,
3005 "in MAIN: mbuf count in mempool at initial "
3006 "is: %d\n", count_in_mempool);
3007 LOG_DEBUG(VHOST_CONFIG,
3008 "in MAIN: mbuf count in ring at initial is :"
3010 rte_ring_count(vpool_array[index].ring));
3013 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3014 rte_eal_remote_launch(switch_worker_zcp, NULL,
3019 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3021 /* Register CUSE device to handle IOCTLs. */
3022 ret = rte_vhost_driver_register((char *)&dev_basename);
3024 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3026 rte_vhost_driver_callback_register(&virtio_net_device_ops);
3028 /* Start CUSE session. */
3029 rte_vhost_driver_session_start();