/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <arpa/inet.h>
#include <errno.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"
#define MAX_QUEUES 128

/* The maximum number of external ports supported. */
#define MAX_SUP_PORTS 1
/* Calculate the number of buffers needed per port. */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) + \
		(num_switching_cores * MAX_PKT_BURST) + \
		(num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
		(num_switching_cores * MBUF_CACHE_SIZE))
#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
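/*
 * Illustrative arithmetic (assumed sizes; the struct sizes vary by DPDK
 * version): with a 128-byte struct rte_mbuf and RTE_PKTMBUF_HEADROOM of 128,
 * MBUF_SIZE is 2048 + 128 + 128 = 2304 bytes per mempool element.
 */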
/*
 * No frame data buffers allocated by the host are required for the zero copy
 * implementation; the guest allocates the frame data buffers, and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0
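/*
 * A cache size of 0 keeps every free mbuf visible to rte_mempool_count();
 * the zero copy clean-up paths below (txmbuf_clean_zcp(), mbuf_destroy_zcp())
 * iterate over that count, which a per-lcore cache would skew.
 */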
/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0 /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4 /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Config core flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refinement for the legacy and DPDK-based front ends:
 * take the max vring avail descriptors/entries from the guest minus
 * MAX_PKT_BURST, then round down to a power of 2.
 * For the legacy front end there are 128 descriptors,
 * half for the virtio header and the other half for the mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))
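/*
 * The zero copy path stashes the guest descriptor index in the first four
 * bytes behind the rte_mbuf header so it can be recovered at completion
 * time, e.g.:
 *   MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;  (on attach)
 *   desc_idx = MBUF_HEADROOM_UINT32(mbuf);            (on clean-up)
 */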
/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)
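/*
 * Note: POWEROF2(0) also evaluates to true, so zero must be rejected
 * separately wherever it is not a valid value.
 * Examples: POWEROF2(64) -> 1, POWEROF2(48) -> 0.
 */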
#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
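/*
 * struct vring_desc is 16 bytes (8-byte addr, 4-byte len, 2-byte flags,
 * 2-byte next), so a typical 64-byte cache line holds 4 descriptors.
 */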
/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled. */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support. */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the hardware
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Number of descriptors to use. */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
	.rx_thresh = {
		.pthresh = RX_PTHRESH,
		.hthresh = RX_HTHRESH,
		.wthresh = RX_WTHRESH,
	},
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
	.tx_thresh = {
		.pthresh = TX_PTHRESH,
		.hthresh = TX_HTHRESH,
		.wthresh = TX_WTHRESH,
	},
	.tx_free_thresh = 0, /* Use PMD default values */
	.tx_rs_thresh = 0, /* Use PMD default values */
};

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * Necessary for 1G NICs such as the I350; this fixes a bug
		 * where IPv4 forwarding in the guest could not forward
		 * packets from one virtio device to another.
		 */
		.hw_vlan_strip = 1, /**< VLAN strip enabled. */
		.jumbo_frame = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
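/*
 * Each vhost device is tagged with vlan_tags[device_fh] (see link_vmdq()
 * below), so device 0 maps to VLAN 1000, device 3 to VLAN 1003, and so on.
 */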
/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];
/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 header */
struct ipv4_hdr {
	uint8_t version_ihl; /**< version and header length */
	uint8_t type_of_service; /**< type of service */
	uint16_t total_length; /**< length of packet */
	uint16_t packet_id; /**< packet ID */
	uint16_t fragment_offset; /**< fragmentation offset */
	uint8_t time_to_live; /**< time to live */
	uint8_t next_proto_id; /**< protocol ID */
	uint16_t hdr_checksum; /**< header checksum */
	uint32_t src_addr; /**< source address */
	uint32_t dst_addr; /**< destination address */
} __attribute__((__packed__));
/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back =
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}
/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, give the error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}
/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
			rte_eth_dev_socket_id(port), &rx_conf_default,
			vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
			rte_eth_dev_socket_id(port), &tx_conf_default);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		(unsigned)port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}
/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* Reject basenames that do not fit (with the NUL) in the buffer. */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;

	snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}
/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}
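/*
 * Example: "-p 0x3" yields parse_portmask("0x3") == 3, enabling ports 0
 * and 1; a mask of 0 (or an unparsable string) is treated as invalid by
 * the caller.
 */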
/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}
/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	" --vm2vm [0|1|2]\n"
	" --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	" --dev-basename <name>\n"
	" --zero-copy [0|1] --rx-desc-num [0-N] --tx-desc-num [0-N]\n"
	" -p PORTMASK: Set mask for ports to be used by application\n"
	" --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	" --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	" --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if RX retries are enabled\n"
	" --rx-retry-num [0-N]: the number of retries on RX. Takes effect only if RX retries are enabled\n"
	" --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	" --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	" --dev-basename: The basename to be used for the character device.\n"
	" --zero-copy [0|1]: disable(default)/enable rx/tx "
	"zero copy\n"
	" --rx-desc-num [0-N]: the number of descriptors on rx, "
	"used only when zero copy is enabled.\n"
	" --tx-desc-num [0-N]: the number of descriptors on tx, "
	"used only when zero copy is enabled.\n",
	       prgname);
}
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"zero-copy", required_argument, NULL, 0},
		{"rx-desc-num", required_argument, NULL, 0},
		{"tx-desc-num", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};
	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;
		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}
			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}
			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}
			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}
			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}
			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}
			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}
			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
					"zero copy vhost APP, please "
					"disable RTE_MBUF_REFCNT\n"
					"in config file and then rebuild DPDK "
					"core lib!\n"
					"Otherwise please disable zero copy "
					"flag in command line!\n");
					return -1;
#endif
				}
			}
			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for rx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}
			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for tx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}
	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}
/*
 * Update the global vars num_ports and ports according to the system port
 * count, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}
/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
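/*
 * Illustrative use (hypothetical mbuf 'm'): dump the first 14 bytes of a
 * frame belonging to virtio device 'dev' as packet data (header == 0):
 *   PRINT_PACKET(dev, (uintptr_t)rte_pktmbuf_mtod(m, void *), 14, 0);
 */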
/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
	uint32_t buf_len, hpa_type *addr_type)
{
	struct virtio_memory_regions_hpa *region;
	uint32_t regionidx;
	uint64_t vhost_pa = 0;

	*addr_type = PHYS_ADDR_INVALID;

	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
		region = &vdev->regions_hpa[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_pa = region->host_phys_addr_offset + guest_pa;
			if (likely((guest_pa + buf_len - 1)
				<= region->guest_phys_address_end))
				*addr_type = PHYS_ADDR_CONTINUOUS;
			else
				*addr_type = PHYS_ADDR_CROSS_SUBREG;
			break;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
		(void *)(uintptr_t)vhost_pa);

	return vhost_pa;
}
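/*
 * For example, a buffer whose first byte sits 64 bytes before the end of a
 * sub-region but whose buf_len is 128 translates fine at its start yet does
 * not fit in one physically contiguous range, so it is reported as
 * PHYS_ADDR_CROSS_SUBREG and the zero copy TX path falls back to copying.
 */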
/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}
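/*
 * Only the low 48 bits take part in the comparison: each 8-byte load reads
 * 2 bytes past the 6-byte address, and MAC_ADDR_CMP masks those bytes off
 * (on a little-endian host the low 6 bytes are the MAC itself).
 */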
/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}
/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* Clear MAC and VLAN settings. */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers. */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}
/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct virtio_net *dev = vdev->dev;
	struct virtio_net *tdev; /* destination virtio device */

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* Get the used devices list. */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
			&dev_ll->vdev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
					dev->device_fh);
				return 0;
			}
			tdev = dev_ll->vdev->dev;

			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

			if (unlikely(dev_ll->vdev->remove)) {
				/* Drop the packet if the device is marked for removal. */
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
			} else {
				/* Send the packet to the local virtio device. */
				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
				if (enable_stats) {
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_total_atomic,
						1);
					rte_atomic64_add(
						&dev_statistics[tdev->device_fh].rx_atomic,
						ret);
					dev_statistics[tdev->device_fh].tx_total++;
					dev_statistics[tdev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}
/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX)
			&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {
			/*
			 * Drop the packet if the TX packet is
			 * destined for the TX device.
			 */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA,
					"(%"PRIu64") TX: Source and destination"
					" MAC addresses are the same. Dropping "
					"packet.\n",
					dev_ll->vdev->dev->device_fh);
				return -1;
			}

			/*
			 * HW vlan strip will reduce the packet length by the
			 * length of the vlan tag, so it needs to be restored
			 * here by adding the tag length back.
			 */
			*offset = VLAN_HLEN;
			*vlan_tag =
				vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

			LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: pkt to local VM device id:"
				"(%"PRIu64") vlan tag: %d.\n",
				dev->device_fh, dev_ll->vdev->dev->device_fh,
				(int)*vlan_tag);

			break;
		}
		dev_ll = dev_ll->next;
	}
	return 0;
}
/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;

	/* Check if destination is a local VM. */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

	/* Add packet to the port TX queue. */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	m->ol_flags = PKT_TX_VLAN_PKT;

	m->data_len += offset;
	m->pkt_len += offset;

	m->vlan_tci = vlan_tag;

	tx_q->m_table[len] = m;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}
/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
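	/*
	 * drain_tsc converts BURST_TX_DRAIN_US into TSC cycles; for example,
	 * a 2.4 GHz TSC gives ceil(2.4e9 / 1e6) * 100 = 240,000 cycles.
	 */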
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();

		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* Tx any packets in the queue. */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
					(struct rte_mbuf **)tx_q->m_table,
					(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;
		}

		rte_prefetch0(lcore_ll->ll_root_used);

		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/* Process devices. */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* Get the virtio device. */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX. */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
					 * Here MAX_PKT_BURST must be less than virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}
				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX. */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ. */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count)
							rte_pktmbuf_free(pkts_burst[--tx_count]);
					}
				}
				while (tx_count)
					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
			}

			/* Move to the next device in the list. */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}
/*
 * This function gets the available ring number for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}
/*
 * This function gets available ring indexes for zero copy RX;
 * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
		"avail idx: %d, "
		"res base idx:%d, free entries:%d\n",
		dev->device_fh, avail_idx, *res_base_idx,
		free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/* Check that we have enough buffers. */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}
/*
 * This function puts a descriptor back on the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
}
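/*
 * The used ring is a power-of-two circular buffer, so slots are selected by
 * masking rather than division: with vq->size == 256, index 260 lands in
 * slot 260 & 255 == 4.
 */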
/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, then attaches them together. It must
 * adjust the offsets of buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be placed at the wrong
 * location in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *mbuf = NULL;
	struct vpool *vpool;
	hpa_type addr_type;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vpool = &vpool_array[vdev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

	do {
		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
				1) != 1))
			return;
		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
					&addr_type);
		} else {
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(vdev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);
		}

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			return;
		}

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address crosses a "
				"sub-region when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			return;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}
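/*
 * The raw free above returns the attached mbuf to vpool->pool, the mempool
 * backing this device's NIC RX queue (see port_init()), so the next
 * rte_eth_rx_burst() can DMA straight into the guest-supplied frame buffer.
 */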
/*
 * Detach an attached packet mbuf:
 *  - restore the original mbuf address and length values;
 *  - reset pktmbuf data and data_len to their default values.
 * All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}
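/*
 * After detaching, the mbuf points back at its own embedded buffer (the
 * memory directly behind the rte_mbuf header) rather than the guest frame
 * it was attached to, and carries no data.
 */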
/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
 * It also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is: %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(RTE_MBUF_INDIRECT(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is: %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return 0;
}
/*
 * This function is called when a virtio device is destroyed.
 * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
		mbuf_count);
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(RTE_MBUF_INDIRECT(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		}
	}

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
}
/*
 * This function updates the used ring with the descriptor information of a
 * burst of received packets (zero copy RX) and kicks the guest as needed.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
	uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
		= {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t res_cur_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);

	if (count == 0)
		return 0;

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

	res_cur_idx = vq->last_used_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_cur_idx + count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success != count) {
		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
			"pkt[%d] descriptor idx: %d\n",
			dev->device_fh, packet_success,
			MBUF_HEADROOM_UINT32(buff));

		PRINT_PACKET(dev,
			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
			+ RTE_PKTMBUF_HEADROOM),
			rte_pktmbuf_data_len(buff), 0);

		/* Buffer address translation for virtio header. */
		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc->len = vq->vhost_hlen;
			desc = &vq->desc[desc->next];
			desc->len = rte_pktmbuf_data_len(buff);
		} else {
			desc->len = packet_len;
		}

		/* Update used ring with desc information */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id
			= head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len
			= packet_len;
		res_cur_idx++;
		packet_success++;

		/* A header is required per buffer. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (likely(packet_success < count)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}

	rte_compiler_barrier();

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: after update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return count;
}
/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t desc_idx, uint8_t need_copy)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf = NULL;
	unsigned len, ret, offset = 0;
	struct vpool *vpool;
	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

	/* Add packet to the port TX queue. */
	tx_q = &tx_queue_zcp[vmdq_rx_q];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
		RTE_LOG(ERR, VHOST_DATA,
			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/*
		 * Avoid using a VLAN tag from any VM for external packets,
		 * such as vlan_tags[dev->device_fh]; otherwise it conflicts
		 * during pool selection: the MAC address identifies it as an
		 * external packet that should go to the network, while the
		 * VLAN tag identifies it as a VM2VM packet that should be
		 * forwarded to another VM. The hardware cannot resolve such
		 * an ambiguity, so the packet would be lost.
		 */
		vlan_tag = external_pkt_default_vlan_tag;
		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
			__rte_mbuf_raw_free(mbuf);
			return;
		}
	}

	mbuf->nb_segs = m->nb_segs;
	mbuf->next = m->next;
	mbuf->data_len = m->data_len + offset;
	mbuf->pkt_len = mbuf->data_len;
	if (unlikely(need_copy)) {
		/* Copy the packet contents to the mbuf. */
		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
			rte_pktmbuf_mtod(m, void *),
			m->data_len);
	} else {
		mbuf->data_off = m->data_off;
		mbuf->buf_physaddr = m->buf_physaddr;
		mbuf->buf_addr = m->buf_addr;
	}
	mbuf->ol_flags = PKT_TX_VLAN_PKT;
	mbuf->vlan_tci = vlan_tag;
	mbuf->l2_len = sizeof(struct ether_hdr);
	mbuf->l3_len = sizeof(struct ipv4_hdr);
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	tx_q->m_table[len] = mbuf;
	len++;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
		dev->device_fh,
		mbuf->nb_segs,
		(mbuf->next == NULL) ? "null" : "non-null");

	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0],
			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);

		/*
		 * Free any buffers not handled by TX and update
		 * the port stats.
		 */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
		txmbuf_clean_zcp(dev, vpool);
	}

	tx_q->len = len;

	return;
}
/*
 * This function TXes all available packets in the virtio TX queue of one
 * virtio-net device. If it is the first packet, it learns the MAC address
 * and sets up the VMDQ queue.
 */
static inline void __attribute__((always_inline))
virtio_dev_tx_zcp(struct virtio_net *dev)
{
	struct rte_mbuf m;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	uint64_t buff_addr = 0, phys_addr;
	uint32_t head[MAX_PKT_BURST];
	uint32_t i;
	uint16_t free_entries, packet_success = 0;
	uint16_t avail_idx;
	uint8_t need_copy = 0;
	hpa_type addr_type;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vq = dev->virtqueue[VIRTIO_TXQ];
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);

	/* If there are no available buffers then return. */
	if (vq->last_used_idx_res == avail_idx)
		return;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);

	/* Prefetch available ring to retrieve head indexes. */
	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);

	/* Get the number of free entries in the ring */
	free_entries = (avail_idx - vq->last_used_idx_res);

	/* Limit to MAX_PKT_BURST. */
	free_entries
		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
		dev->device_fh, free_entries);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (i = 0; i < free_entries; i++)
		head[i]
			= vq->avail->ring[(vq->last_used_idx_res + i)
			& (vq->size - 1)];

	vq->last_used_idx_res += free_entries;

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);
	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

	while (packet_success < free_entries) {
		desc = &vq->desc[head[packet_success]];

		/* Discard first buffer as it is the virtio header */
		desc = &vq->desc[desc->next];

		/* Buffer address translation. */
		buff_addr = gpa_to_vva(dev, desc->addr);
		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);

		if (likely(packet_success < (free_entries - 1)))
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success + 1]]);

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Invalid frame buffer address found "
				"when TX packets!\n",
				dev->device_fh);
			packet_success++;
			continue;
		}

		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/*
		 * Setup dummy mbuf. This is copied to a real mbuf if
		 * transmitted out the physical port.
		 */
		m.data_len = desc->len;
		m.nb_segs = 1;
		m.next = NULL;
		m.data_off = 0;
		m.buf_addr = (void *)(uintptr_t)buff_addr;
		m.buf_physaddr = phys_addr;

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address crosses a "
				"sub-region when attaching TX frame "
				"buffer address!\n",
				dev->device_fh);
			need_copy = 1;
		} else {
			need_copy = 0;
		}

		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

		/*
		 * If this is the first received packet we need to learn
		 * the MAC and setup VMDQ.
		 */
		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
				/*
				 * Discard frame if device is scheduled for
				 * removal or a duplicate MAC address is found.
				 */
				packet_success += free_entries;
				vq->last_used_idx += packet_success;
				break;
			}
		}

		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
		packet_success++;
	}
}
1966 * This function is called by each data core. It handles all RX/TX registered
1967 * with the core. For TX the specific lcore linked list is used. For RX, MAC
1968 * addresses are compared with all devices in the main linked list.
1971 switch_worker_zcp(__attribute__((unused)) void *arg)
1973 struct virtio_net *dev = NULL;
1974 struct vhost_dev *vdev = NULL;
1975 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1976 struct virtio_net_data_ll *dev_ll;
1977 struct mbuf_table *tx_q;
1978 volatile struct lcore_ll_info *lcore_ll;
1979 const uint64_t drain_tsc
1980 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1981 * BURST_TX_DRAIN_US;
1982 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1984 const uint16_t lcore_id = rte_lcore_id();
1985 uint16_t count_in_ring, rx_count = 0;
1987 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1989 lcore_ll = lcore_info[lcore_id].lcore_ll;
1993 cur_tsc = rte_rdtsc();
1995 /* TX burst queue drain */
1996 diff_tsc = cur_tsc - prev_tsc;
1997 if (unlikely(diff_tsc > drain_tsc)) {
1999 * Get mbuf from vpool.pool and detach mbuf and
2000 * put back into vpool.ring.
2002 dev_ll = lcore_ll->ll_root_used;
2003 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2004 /* Get virtio device ID */
2005 vdev = dev_ll->vdev;
2008 if (likely(!vdev->remove)) {
2009 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2011 LOG_DEBUG(VHOST_DATA,
2012 "TX queue drained after timeout"
2013 " with burst size %u\n",
2017 * Tx any packets in the queue
2019 ret = rte_eth_tx_burst(
2021 (uint16_t)tx_q->txq_id,
2022 (struct rte_mbuf **)
2024 (uint16_t)tx_q->len);
2025 if (unlikely(ret < tx_q->len)) {
2028 tx_q->m_table[ret]);
2029 } while (++ret < tx_q->len);
2033 txmbuf_clean_zcp(dev,
2034 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2037 dev_ll = dev_ll->next;
2042 rte_prefetch0(lcore_ll->ll_root_used);
2045 * Inform the configuration core that we have exited the linked
2046 * list and that no devices are in use if requested.
2048 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2049 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2051 /* Process devices */
2052 dev_ll = lcore_ll->ll_root_used;
2054 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2055 vdev = dev_ll->vdev;
2057 if (unlikely(vdev->remove)) {
2058 dev_ll = dev_ll->next;
2060 vdev->ready = DEVICE_SAFE_REMOVE;
2064 if (likely(vdev->ready == DEVICE_RX)) {
2065 uint32_t index = vdev->vmdq_rx_q;
2068 = rte_ring_count(vpool_array[index].ring);
2069 uint16_t free_entries
2070 = (uint16_t)get_available_ring_num_zcp(dev);
2073 * Attach all mbufs in vpool.ring and put back
2077 i < RTE_MIN(free_entries,
2078 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2080 attach_rxmbuf_zcp(dev);
2082 /* Handle guest RX */
2083 rx_count = rte_eth_rx_burst(ports[0],
2084 vdev->vmdq_rx_q, pkts_burst,
2088 ret_count = virtio_dev_rx_zcp(dev,
2089 pkts_burst, rx_count);
2091 dev_statistics[dev->device_fh].rx_total
2092 += rx_count;
2093 dev_statistics[dev->device_fh].rx
2094 += ret_count;
2096 while (likely(rx_count)) {
2097 rx_count--;
2098 pktmbuf_detach_zcp(
2099 pkts_burst[rx_count]);
2100 rte_ring_sp_enqueue(
2101 vpool_array[index].ring,
2102 (void *)pkts_burst[rx_count]);
2107 if (likely(!vdev->remove))
2108 /* Handle guest TX */
2109 virtio_dev_tx_zcp(dev);
2111 /* Move to the next device in the list */
2112 dev_ll = dev_ll->next;
2121 * Add an entry to a used linked list. A free entry must first be found
2122 * in the free linked list using get_data_ll_free_entry().
2125 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2126 struct virtio_net_data_ll *ll_dev)
2128 struct virtio_net_data_ll *ll = *ll_root_addr;
2130 /* Set next as NULL and use a compiler barrier to avoid reordering. */
2131 ll_dev->next = NULL;
2132 rte_compiler_barrier();
2134 /* If ll == NULL then this is the first device. */
2135 if (ll) {
2136 /* Increment to the tail of the linked list. */
2137 while (ll->next != NULL)
2138 ll = ll->next;
2140 ll->next = ll_dev;
2141 } else {
2142 *ll_root_addr = ll_dev;
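/*
 * Usage sketch (illustrative, not part of the original source):
 * entries cycle between the free and used lists, so adding a device
 * never allocates. This is the pattern new_device() follows below:
 *
 *	struct virtio_net_data_ll *entry;
 *
 *	entry = get_data_ll_free_entry(&ll_root_free);
 *	if (entry == NULL)
 *		return -1;	-- device limit reached
 *	entry->vdev = vdev;
 *	add_data_ll_entry(&ll_root_used, entry);
 */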
2147 * Remove an entry from a used linked list. The entry must then be added to
2148 * the free linked list using put_data_ll_free_entry().
2151 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2152 struct virtio_net_data_ll *ll_dev,
2153 struct virtio_net_data_ll *ll_dev_last)
2155 struct virtio_net_data_ll *ll = *ll_root_addr;
2157 if (unlikely((ll == NULL) || (ll_dev == NULL)))
2158 return;
2160 if (ll_dev == ll)
2161 *ll_root_addr = ll_dev->next;
2162 else
2163 if (likely(ll_dev_last != NULL))
2164 ll_dev_last->next = ll_dev->next;
2165 else
2166 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2170 * Find and return an entry from the free linked list.
2172 static struct virtio_net_data_ll *
2173 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2175 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2176 struct virtio_net_data_ll *ll_dev;
2178 if (ll_free == NULL)
2179 return NULL;
2181 ll_dev = ll_free;
2182 *ll_root_addr = ll_free->next;
2184 return ll_dev;
2188 * Place an entry back on to the free linked list.
2191 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2192 struct virtio_net_data_ll *ll_dev)
2194 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2196 if (ll_dev == NULL)
2197 return;
2199 ll_dev->next = ll_free;
2200 *ll_root_addr = ll_dev;
2204 * Creates a linked list of a given size.
2206 static struct virtio_net_data_ll *
2207 alloc_data_ll(uint32_t size)
2209 struct virtio_net_data_ll *ll_new;
2212 /* Malloc and then chain the linked list. */
2213 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2214 if (ll_new == NULL) {
2215 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2219 for (i = 0; i < size - 1; i++) {
2220 ll_new[i].vdev = NULL;
2221 ll_new[i].next = &ll_new[i+1];
2223 ll_new[i].next = NULL;
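/*
 * Resulting layout (illustrative): a single malloc'd array chained
 * into a singly linked list, so entries can later migrate between the
 * free and used lists without any further allocation:
 *
 *	ll_new[0] -> ll_new[1] -> ... -> ll_new[size-1] -> NULL
 */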
2229 * Create the main linked list along with each individual core's linked list. A used and a free list
2230 * are created to manage entries.
2237 RTE_LCORE_FOREACH_SLAVE(lcore) {
2238 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2239 if (lcore_info[lcore].lcore_ll == NULL) {
2240 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2244 lcore_info[lcore].lcore_ll->device_num = 0;
2245 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2246 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2247 if (num_devices % num_switching_cores)
2248 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2249 else
2250 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2253 /* Allocate devices up to a maximum of MAX_DEVICES. */
2254 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2260 * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2261 * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2262 * of dev->remove=1, which could cause an infinite loop in the rte_pause loop.
2265 destroy_device (volatile struct virtio_net *dev)
2267 struct virtio_net_data_ll *ll_lcore_dev_cur;
2268 struct virtio_net_data_ll *ll_main_dev_cur;
2269 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2270 struct virtio_net_data_ll *ll_main_dev_last = NULL;
2271 struct vhost_dev *vdev;
2274 dev->flags &= ~VIRTIO_DEV_RUNNING;
2276 vdev = (struct vhost_dev *)dev->priv;
2277 /* Set the remove flag. */
2278 vdev->remove = 1;
2279 while (vdev->ready != DEVICE_SAFE_REMOVE) {
2280 rte_pause();
2283 /* Search for entry to be removed from lcore ll */
2284 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2285 while (ll_lcore_dev_cur != NULL) {
2286 if (ll_lcore_dev_cur->vdev == vdev) {
2287 break;
2288 } else {
2289 ll_lcore_dev_last = ll_lcore_dev_cur;
2290 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2294 if (ll_lcore_dev_cur == NULL) {
2295 RTE_LOG(ERR, VHOST_CONFIG,
2296 "(%"PRIu64") Failed to find the dev to be destroy.\n",
2301 /* Search for entry to be removed from main ll */
2302 ll_main_dev_cur = ll_root_used;
2303 ll_main_dev_last = NULL;
2304 while (ll_main_dev_cur != NULL) {
2305 if (ll_main_dev_cur->vdev == vdev) {
2306 break;
2307 } else {
2308 ll_main_dev_last = ll_main_dev_cur;
2309 ll_main_dev_cur = ll_main_dev_cur->next;
2313 /* Remove entries from the lcore and main ll. */
2314 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2315 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2317 /* Set the dev_removal_flag on each lcore. */
2318 RTE_LCORE_FOREACH_SLAVE(lcore) {
2319 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2323 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2324 * the cores can no longer access the device removed from the linked lists and that
2325 * the device is no longer in use.
2327 RTE_LCORE_FOREACH_SLAVE(lcore) {
2328 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2329 rte_pause();
2333 /* Add the entries back to the lcore and main free ll.*/
2334 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2335 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2337 /* Decrement number of device on the lcore. */
2338 lcore_info[vdev->coreid].lcore_ll->device_num--;
2340 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2343 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2345 /* Stop the RX queue. */
2346 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2347 LOG_DEBUG(VHOST_CONFIG,
2348 "(%"PRIu64") In destroy_device: Failed to stop "
2354 LOG_DEBUG(VHOST_CONFIG,
2355 "(%"PRIu64") in destroy_device: Start put mbuf in "
2356 "mempool back to ring for RX queue: %d\n",
2357 dev->device_fh, vdev->vmdq_rx_q);
2359 mbuf_destroy_zcp(vpool);
2361 /* Stop the TX queue. */
2362 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2363 LOG_DEBUG(VHOST_CONFIG,
2364 "(%"PRIu64") In destroy_device: Failed to "
2365 "stop tx queue:%d\n",
2366 dev->device_fh, vdev->vmdq_rx_q);
2369 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2371 LOG_DEBUG(VHOST_CONFIG,
2372 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2373 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2374 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2377 mbuf_destroy_zcp(vpool);
2378 rte_free(vdev->regions_hpa);
2385 * Calculate the number of physically contiguous sub-regions within one
2386 * particular region whose vhost virtual address range is contiguous.
2387 * The region starts at vva_start and spans 'size' bytes.
2390 check_hpa_regions(uint64_t vva_start, uint64_t size)
2392 uint32_t i, nregions = 0, page_size = getpagesize();
2393 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2394 if (vva_start % page_size) {
2395 LOG_DEBUG(VHOST_CONFIG,
2396 "in check_countinous: vva start(%p) mod page_size(%d) "
2398 (void *)(uintptr_t)vva_start, page_size);
2401 if (size % page_size) {
2402 LOG_DEBUG(VHOST_CONFIG,
2403 "in check_countinous: "
2404 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2408 for (i = 0; i < size - page_size; i = i + page_size) {
2410 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2411 next_phys_addr = rte_mem_virt2phy(
2412 (void *)(uintptr_t)(vva_start + i + page_size));
2413 if ((cur_phys_addr + page_size) != next_phys_addr) {
2415 LOG_DEBUG(VHOST_CONFIG,
2416 "in check_continuous: hva addr:(%p) is not "
2417 "continuous with hva addr:(%p), diff:%d\n",
2418 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2419 (void *)(uintptr_t)(vva_start + (uint64_t)i
2420 + page_size), page_size);
2421 LOG_DEBUG(VHOST_CONFIG,
2422 "in check_continuous: hpa addr:(%p) is not "
2423 "continuous with hpa addr:(%p), "
2424 "diff:(%"PRIu64")\n",
2425 (void *)(uintptr_t)cur_phys_addr,
2426 (void *)(uintptr_t)next_phys_addr,
2427 (next_phys_addr-cur_phys_addr));
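/*
 * Worked example (illustrative): with 4 KiB pages, a virtually
 * contiguous range backed by physical pages {P, P + 4K, Q, Q + 4K}
 * where Q != P + 8K breaks exactly once, at the P + 4K / Q boundary,
 * so one additional sub-region is counted for that range.
 */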
2434 * Divide each region whose vhost virtual address range is contiguous into
2435 * sub-regions, making sure the physical addresses within each sub-region
2436 * are contiguous, and fill the offset (to GPA), size and other information
2437 * of each sub-region into regions_hpa.
2440 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2442 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2443 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2445 if (mem_region_hpa == NULL)
2446 return 0;
2448 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2449 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2450 virtio_memory->regions[regionidx].address_offset;
2451 mem_region_hpa[regionidx_hpa].guest_phys_address
2452 = virtio_memory->regions[regionidx].guest_phys_address;
2453 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2454 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2455 mem_region_hpa[regionidx_hpa].guest_phys_address;
2456 LOG_DEBUG(VHOST_CONFIG,
2457 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2460 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2461 LOG_DEBUG(VHOST_CONFIG,
2462 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2465 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2467 i < virtio_memory->regions[regionidx].memory_size -
2470 cur_phys_addr = rte_mem_virt2phy(
2471 (void *)(uintptr_t)(vva_start + i));
2472 next_phys_addr = rte_mem_virt2phy(
2473 (void *)(uintptr_t)(vva_start +
2475 if ((cur_phys_addr + page_size) != next_phys_addr) {
2476 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2477 mem_region_hpa[regionidx_hpa].guest_phys_address +
2478 k + page_size;
2479 mem_region_hpa[regionidx_hpa].memory_size
2480 = k + page_size;
2481 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2482 "phys addr end [%d]:(%p)\n",
2485 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2486 LOG_DEBUG(VHOST_CONFIG,
2487 "in fill_hpa_regions: guest phys addr "
2491 (mem_region_hpa[regionidx_hpa].memory_size));
2492 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2493 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2495 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2496 next_phys_addr -
2497 mem_region_hpa[regionidx_hpa].guest_phys_address;
2498 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2499 " phys addr start[%d]:(%p)\n",
2502 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2503 LOG_DEBUG(VHOST_CONFIG,
2504 "in fill_hpa_regions: host phys addr "
2508 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2514 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2515 = mem_region_hpa[regionidx_hpa].guest_phys_address
2516 + k + page_size;
2517 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2518 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2519 "[%d]:(%p)\n", regionidx_hpa,
2521 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2522 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2523 "[%d]:(%p)\n", regionidx_hpa,
2525 (mem_region_hpa[regionidx_hpa].memory_size));
2528 return regionidx_hpa;
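/*
 * Sketch (illustrative, not part of the original source): once
 * regions_hpa is filled, translating a guest physical address to a
 * host physical address is a linear scan plus an offset add. The
 * field names follow the structure used above; the function itself
 * is hypothetical.
 */
static uint64_t
gpa_to_hpa_sketch(struct virtio_memory_regions_hpa *regions,
	uint32_t nregions, uint64_t guest_pa)
{
	uint32_t i;

	for (i = 0; i < nregions; i++) {
		if ((guest_pa >= regions[i].guest_phys_address) &&
			(guest_pa < regions[i].guest_phys_address_end))
			return guest_pa
				+ regions[i].host_phys_addr_offset;
	}

	/* No sub-region covers this guest physical address. */
	return 0;
}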
2532 * A new device is added to a data core. First the device is added to the main linked list
2533 * and then allocated to a specific data core.
2536 new_device (struct virtio_net *dev)
2538 struct virtio_net_data_ll *ll_dev;
2539 int lcore, core_add = 0;
2540 uint32_t device_num_min = num_devices;
2541 struct vhost_dev *vdev;
2544 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2546 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2554 vdev->nregions_hpa = dev->mem->nregions;
2555 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2557 += check_hpa_regions(
2558 dev->mem->regions[regionidx].guest_phys_address
2559 + dev->mem->regions[regionidx].address_offset,
2560 dev->mem->regions[regionidx].memory_size);
2564 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2565 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2567 if (vdev->regions_hpa == NULL) {
2568 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2574 if (fill_hpa_memory_regions(
2575 vdev->regions_hpa, dev->mem
2576 ) != vdev->nregions_hpa) {
2578 RTE_LOG(ERR, VHOST_CONFIG,
2579 "hpa memory regions number mismatch: "
2580 "[%d]\n", vdev->nregions_hpa);
2581 rte_free(vdev->regions_hpa);
2588 /* Add device to main ll */
2589 ll_dev = get_data_ll_free_entry(&ll_root_free);
2590 if (ll_dev == NULL) {
2591 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2592 "of %d devices per core has been reached\n",
2593 dev->device_fh, num_devices);
2594 if (vdev->regions_hpa)
2595 rte_free(vdev->regions_hpa);
2599 ll_dev->vdev = vdev;
2600 add_data_ll_entry(&ll_root_used, ll_dev);
2601 vdev->vmdq_rx_q
2602 = dev->device_fh * (num_queues / num_devices);
2605 uint32_t index = vdev->vmdq_rx_q;
2606 uint32_t count_in_ring, i;
2607 struct mbuf_table *tx_q;
2609 count_in_ring = rte_ring_count(vpool_array[index].ring);
2611 LOG_DEBUG(VHOST_CONFIG,
2612 "(%"PRIu64") in new_device: mbuf count in mempool "
2613 "before attach is: %d\n",
2615 rte_mempool_count(vpool_array[index].pool));
2616 LOG_DEBUG(VHOST_CONFIG,
2617 "(%"PRIu64") in new_device: mbuf count in ring "
2618 "before attach is : %d\n",
2619 dev->device_fh, count_in_ring);
2622 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2624 for (i = 0; i < count_in_ring; i++)
2625 attach_rxmbuf_zcp(dev);
2627 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2628 "mempool after attach is: %d\n",
2630 rte_mempool_count(vpool_array[index].pool));
2631 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2632 "ring after attach is : %d\n",
2634 rte_ring_count(vpool_array[index].ring));
2636 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2637 tx_q->txq_id = vdev->vmdq_rx_q;
2639 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2640 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2642 LOG_DEBUG(VHOST_CONFIG,
2643 "(%"PRIu64") In new_device: Failed to start "
2645 dev->device_fh, vdev->vmdq_rx_q);
2647 mbuf_destroy_zcp(vpool);
2648 rte_free(vdev->regions_hpa);
2653 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2654 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2656 LOG_DEBUG(VHOST_CONFIG,
2657 "(%"PRIu64") In new_device: Failed to start "
2659 dev->device_fh, vdev->vmdq_rx_q);
2661 /* Stop the TX queue. */
2662 if (rte_eth_dev_tx_queue_stop(ports[0],
2663 vdev->vmdq_rx_q) != 0) {
2664 LOG_DEBUG(VHOST_CONFIG,
2665 "(%"PRIu64") In new_device: Failed to "
2666 "stop tx queue:%d\n",
2667 dev->device_fh, vdev->vmdq_rx_q);
2670 mbuf_destroy_zcp(vpool);
2671 rte_free(vdev->regions_hpa);
2678 /* Reset the ready flag. */
2679 vdev->ready = DEVICE_MAC_LEARNING;
2682 /* Find a suitable lcore to add the device. */
2683 RTE_LCORE_FOREACH_SLAVE(lcore) {
2684 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2685 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2686 core_add = lcore;
2689 /* Add device to lcore ll */
2690 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2691 if (ll_dev == NULL) {
2692 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2693 vdev->ready = DEVICE_SAFE_REMOVE;
2694 destroy_device(dev);
2695 if (vdev->regions_hpa)
2696 rte_free(vdev->regions_hpa);
2700 ll_dev->vdev = vdev;
2701 vdev->coreid = core_add;
2703 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2705 /* Initialize device stats */
2706 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2708 /* Disable notifications. */
2709 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2710 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2711 lcore_info[vdev->coreid].lcore_ll->device_num++;
2712 dev->flags |= VIRTIO_DEV_RUNNING;
2714 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2720 * These callbacks allow devices to be added to the data core when configuration
2721 * has fully completed.
2723 static const struct virtio_net_device_ops virtio_net_device_ops =
2725 .new_device = new_device,
2726 .destroy_device = destroy_device,
2730 * This is a thread that wakes up periodically to print stats if the user has
2731 * enabled them.
2736 struct virtio_net_data_ll *dev_ll;
2737 uint64_t tx_dropped, rx_dropped;
2738 uint64_t tx, tx_total, rx, rx_total;
2740 const char clr[] = { 27, '[', '2', 'J', '\0' };
2741 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
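/*
 * Illustrative note: these are ANSI escape sequences. clr expands to
 * "\033[2J" (clear the screen) and top_left to "\033[1;1H" (move the
 * cursor to row 1, column 1), so each loop iteration repaints the
 * terminal in place.
 */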
2744 sleep(enable_stats);
2746 /* Clear screen and move to top left */
2747 printf("%s%s", clr, top_left);
2749 printf("\nDevice statistics ====================================");
2751 dev_ll = ll_root_used;
2752 while (dev_ll != NULL) {
2753 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2754 tx_total = dev_statistics[device_fh].tx_total;
2755 tx = dev_statistics[device_fh].tx;
2756 tx_dropped = tx_total - tx;
2757 if (zero_copy == 0) {
2758 rx_total = rte_atomic64_read(
2759 &dev_statistics[device_fh].rx_total_atomic);
2760 rx = rte_atomic64_read(
2761 &dev_statistics[device_fh].rx_atomic);
2763 rx_total = dev_statistics[device_fh].rx_total;
2764 rx = dev_statistics[device_fh].rx;
2766 rx_dropped = rx_total - rx;
2768 printf("\nStatistics for device %"PRIu32" ------------------------------"
2769 "\nTX total: %"PRIu64""
2770 "\nTX dropped: %"PRIu64""
2771 "\nTX successful: %"PRIu64""
2772 "\nRX total: %"PRIu64""
2773 "\nRX dropped: %"PRIu64""
2774 "\nRX successful: %"PRIu64"",
2783 dev_ll = dev_ll->next;
2785 printf("\n======================================================\n");
2790 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2791 char *ring_name, uint32_t nb_mbuf)
2793 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2794 vpool_array[index].pool
2795 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2796 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2797 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2798 rte_pktmbuf_init, NULL, socket, 0);
2799 if (vpool_array[index].pool != NULL) {
2800 vpool_array[index].ring
2801 = rte_ring_create(ring_name,
2802 rte_align32pow2(nb_mbuf + 1),
2803 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2804 if (likely(vpool_array[index].ring != NULL)) {
2805 LOG_DEBUG(VHOST_CONFIG,
2806 "in setup_mempool_tbl: mbuf count in "
2808 rte_mempool_count(vpool_array[index].pool));
2809 LOG_DEBUG(VHOST_CONFIG,
2810 "in setup_mempool_tbl: mbuf count in "
2812 rte_ring_count(vpool_array[index].ring));
2814 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2818 /* Need to consider headroom. */
2819 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2821 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
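/*
 * Illustrative notes on the sizing above (not in the original
 * source): buf_size works out to VIRTIO_DESCRIPTOR_LEN_ZCP (1518
 * bytes), since roomsize adds RTE_PKTMBUF_HEADROOM and the headroom
 * is then subtracted back out. The ring is sized
 * rte_align32pow2(nb_mbuf + 1) because an rte_ring of size N holds
 * at most N - 1 entries, so the +1 guarantees space for all nb_mbuf
 * mbufs. MAIN below pairs slot 'queue_id' of vpool_array (RX) with
 * slot 'queue_id + MAX_QUEUES' (TX).
 */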
2827 * Main function, does initialisation and calls the per-lcore functions. The CUSE
2828 * device is also registered here to handle the IOCTLs.
2831 MAIN(int argc, char *argv[])
2833 struct rte_mempool *mbuf_pool = NULL;
2834 unsigned lcore_id, core_id = 0;
2835 unsigned nb_ports, valid_num_ports;
2837 uint8_t portid, queue_id = 0;
2838 static pthread_t tid;
2841 ret = rte_eal_init(argc, argv);
2843 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2847 /* parse app arguments */
2848 ret = us_vhost_parse_args(argc, argv);
2850 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2852 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2853 if (rte_lcore_is_enabled(lcore_id))
2854 lcore_ids[core_id++] = lcore_id;
2856 if (rte_lcore_count() > RTE_MAX_LCORE)
2857 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2859 /* Set the number of switching cores available. */
2860 num_switching_cores = rte_lcore_count() - 1;
2862 /* Get the number of physical ports. */
2863 nb_ports = rte_eth_dev_count();
2864 if (nb_ports > RTE_MAX_ETHPORTS)
2865 nb_ports = RTE_MAX_ETHPORTS;
2868 * Update the global variable num_ports and the global array ports,
2869 * and derive valid_num_ports from the number of ports in the system.
2871 valid_num_ports = check_ports_num(nb_ports);
2873 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2874 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2875 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2879 if (zero_copy == 0) {
2880 /* Create the mbuf pool. */
2881 mbuf_pool = rte_mempool_create(
2885 MBUF_SIZE, MBUF_CACHE_SIZE,
2886 sizeof(struct rte_pktmbuf_pool_private),
2887 rte_pktmbuf_pool_init, NULL,
2888 rte_pktmbuf_init, NULL,
2889 rte_socket_id(), 0);
2890 if (mbuf_pool == NULL)
2891 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2893 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2894 vpool_array[queue_id].pool = mbuf_pool;
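/*
 * Illustrative note: in non-zero-copy mode every vpool_array slot
 * aliases the single shared mempool created above; the per-queue
 * pool/ring pairs below are only built when zero_copy is enabled.
 */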
2896 if (vm2vm_mode == VM2VM_HARDWARE) {
2897 /* Enable VT loopback so the hardware L2 switch can forward VM-to-VM traffic. */
2898 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2899 LOG_DEBUG(VHOST_CONFIG,
2900 "Enable loop back for L2 switch in vmdq.\n");
2904 char pool_name[RTE_MEMPOOL_NAMESIZE];
2905 char ring_name[RTE_MEMPOOL_NAMESIZE];
2908 * Zero copy defers queue RX/TX start to the time when the guest
2909 * finishes its startup and packet buffers from that guest are
2910 * available.
2912 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2913 rx_conf_default.rx_drop_en = 0;
2914 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2915 nb_mbuf = num_rx_descriptor
2916 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2917 + num_switching_cores * MAX_PKT_BURST;
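/*
 * Worked example (illustrative, assuming MAX_PKT_BURST is 32 and a
 * hypothetical num_rx_descriptor of 128): with three switching cores
 * and MBUF_CACHE_SIZE_ZCP = 0, nb_mbuf = 128 + 3*0 + 3*32 = 224
 * mbufs per RX queue pool.
 */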
2919 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2920 snprintf(pool_name, sizeof(pool_name),
2921 "rxmbuf_pool_%u", queue_id);
2922 snprintf(ring_name, sizeof(ring_name),
2923 "rxmbuf_ring_%u", queue_id);
2924 setup_mempool_tbl(rte_socket_id(), queue_id,
2925 pool_name, ring_name, nb_mbuf);
2928 nb_mbuf = num_tx_descriptor
2929 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2930 + num_switching_cores * MAX_PKT_BURST;
2932 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2933 snprintf(pool_name, sizeof(pool_name),
2934 "txmbuf_pool_%u", queue_id);
2935 snprintf(ring_name, sizeof(ring_name),
2936 "txmbuf_ring_%u", queue_id);
2937 setup_mempool_tbl(rte_socket_id(),
2938 (queue_id + MAX_QUEUES),
2939 pool_name, ring_name, nb_mbuf);
2942 if (vm2vm_mode == VM2VM_HARDWARE) {
2943 /* Enable VT loopback so the hardware L2 switch can forward VM-to-VM traffic. */
2944 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2945 LOG_DEBUG(VHOST_CONFIG,
2946 "Enable loop back for L2 switch in vmdq.\n");
2949 /* Set log level. */
2950 rte_set_log_level(LOG_LEVEL);
2952 /* initialize all ports */
2953 for (portid = 0; portid < nb_ports; portid++) {
2954 /* skip ports that are not enabled */
2955 if ((enabled_port_mask & (1 << portid)) == 0) {
2956 RTE_LOG(INFO, VHOST_PORT,
2957 "Skipping disabled port %d\n", portid);
2960 if (port_init(portid) != 0)
2961 rte_exit(EXIT_FAILURE,
2962 "Cannot initialize network ports\n");
2965 /* Initialise all linked lists. */
2966 if (init_data_ll() == -1)
2967 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2969 /* Initialize device stats */
2970 memset(&dev_statistics, 0, sizeof(dev_statistics));
2972 /* Enable stats if the user option is set. */
2973 if (enable_stats)
2974 pthread_create(&tid, NULL, (void *)print_stats, NULL);
2976 /* Launch all data cores. */
2977 if (zero_copy == 0) {
2978 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2979 rte_eal_remote_launch(switch_worker,
2980 mbuf_pool, lcore_id);
2983 uint32_t count_in_mempool, index, i;
2984 for (index = 0; index < 2*MAX_QUEUES; index++) {
2985 /* For all RX and TX queues. */
2987 = rte_mempool_count(vpool_array[index].pool);
2990 * Transfer all un-attached mbufs from vpool.pool
2991 * to vpool.ring.
2993 for (i = 0; i < count_in_mempool; i++) {
2994 struct rte_mbuf *mbuf
2995 = __rte_mbuf_raw_alloc(
2996 vpool_array[index].pool);
2997 rte_ring_sp_enqueue(vpool_array[index].ring,
2998 (void *)mbuf);
3001 LOG_DEBUG(VHOST_CONFIG,
3002 "in MAIN: mbuf count in mempool at initial "
3003 "is: %d\n", count_in_mempool);
3004 LOG_DEBUG(VHOST_CONFIG,
3005 "in MAIN: mbuf count in ring at initial is :"
3007 rte_ring_count(vpool_array[index].ring));
3010 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3011 rte_eal_remote_launch(switch_worker_zcp, NULL,
3016 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3018 /* Register CUSE device to handle IOCTLs. */
3019 ret = rte_vhost_driver_register((char *)&dev_basename);
3021 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3023 rte_vhost_driver_callback_register(&virtio_net_device_ops);
3025 /* Start CUSE session. */
3026 rte_vhost_driver_session_start();