4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <arpa/inet.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
56 #define MAX_QUEUES 256
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
62 * Calculate the number of buffers needed per port
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
65 (num_switching_cores*MAX_PKT_BURST) + \
66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 (num_switching_cores*MBUF_CACHE_SIZE))
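/*
 * Worked example (illustrative only; values other than the macros above are
 * assumed): with MAX_QUEUES = 256, RTE_TEST_RX_DESC_DEFAULT = 1024,
 * MAX_PKT_BURST = 32, RTE_TEST_TX_DESC_DEFAULT = 512, MBUF_CACHE_SIZE = 128
 * and num_switching_cores = 2, this expands to
 * (256 * 1024) + (2 * 32) + (2 * 512) + (2 * 128)
 * = 262144 + 64 + 1024 + 256 = 263488 mbufs per port: one mbuf per RX
 * descriptor plus per-core headroom for bursts, TX descriptors and caches.
 */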
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
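/*
 * Sizing note (a sketch, assuming the DPDK default RTE_PKTMBUF_HEADROOM of
 * 128 bytes): each pool element holds 2048 bytes of frame data plus the
 * struct rte_mbuf metadata and the headroom, which is enough for a standard
 * (non-jumbo) Ethernet frame.
 */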
73 * No frame data buffers allocated from the host are required for the zero copy
74 * implementation; the guest allocates the frame data buffers, and vhost
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 + RTE_PKTMBUF_HEADROOM)
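/*
 * In the zero-copy case the guest supplies the frame data buffers, so a
 * host-side mbuf only needs to describe one descriptor of
 * VIRTIO_DESCRIPTOR_LEN_ZCP (1518) bytes plus the rte_mbuf metadata and
 * headroom; attach_rxmbuf_zcp() later points the mbuf at the guest buffer
 * instead of the pool element's own data area.
 */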
80 #define MBUF_CACHE_SIZE_ZCP 0
83 * RX and TX Prefetch, Host, and Write-back threshold values should be
84 * carefully set for optimal performance. Consult the network
85 * controller's datasheet and supporting DPDK documentation for guidance
86 * on how these parameters should be set.
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
93 * These default values are optimized for use with the Intel(R) 82599 10 GbE
94 * Controller and the DPDK ixgbe PMD. Consider using other values for other
95 * network controllers and/or network drivers.
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0 /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */
101 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
104 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
107 #define JUMBO_FRAME_MAX_SIZE 0x2600
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
112 #define DEVICE_SAFE_REMOVE 2
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
123 * These two macros need refining for the legacy and DPDK-based front ends:
124 * take the max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
125 * then adjust to a power of 2.
128 * For the legacy front end there are 128 descriptors:
129 * half for the virtio header, the other half for the mbuf data.
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136 + sizeof(struct rte_mbuf)))
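/*
 * Usage sketch (the two statements below mirror code later in this file):
 * the zero-copy path stashes the vring descriptor index in the first four
 * bytes of the mbuf headroom so it can be recovered once the mbuf comes
 * back from the NIC, e.g.
 *
 *   MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;          // attach_rxmbuf_zcp()
 *   vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf); // txmbuf_clean_zcp()
 */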
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
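/*
 * Worked example: POWEROF2(64) -> (63 & 64) == 0 -> true, while
 * POWEROF2(48) -> (47 & 48) == 32 -> false. Note that 0 also passes this
 * test, so a value of 0 must be rejected separately where it is not valid.
 */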
141 #define INVALID_PORT_ID 0xFF
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
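/*
 * 0xFFFFFFFFFFFF keeps only the low 48 bits of a 64-bit load; on a
 * little-endian host those are the six MAC address bytes, so
 * ether_addr_cmp() below can compare two addresses with a single
 * XOR-and-mask instead of a byte-by-byte memcmp, masking off the two bytes
 * read past the end of the 6-byte address.
 */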
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
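/*
 * Worked example: struct vring_desc is 16 bytes (u64 addr, u32 len,
 * u16 flags, u16 next), so with a 64-byte cache line
 * DESC_PER_CACHELINE = 64 / 16 = 4 descriptors per cache line.
 */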
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
164 /* Promiscuous mode */
165 static uint32_t promiscuous;
167 /*Number of switching cores enabled*/
168 static uint32_t num_switching_cores = 0;
170 /* number of devices/queues to support*/
171 static uint32_t num_queues = 0;
172 static uint32_t num_devices;
175 * Enable zero copy: packet data is DMA'd directly to/from the guest buffers
176 * via the HW descriptors; disabled by default.
178 static uint32_t zero_copy;
179 static int mergeable;
181 /* number of RX/TX ring descriptors to use */
182 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
183 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
185 /* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
186 #define MAX_RING_DESC 4096
189 struct rte_mempool *pool;
190 struct rte_ring *ring;
192 } vpool_array[MAX_QUEUES+MAX_QUEUES];
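/*
 * Layout note (derived from the usage later in this file): entries
 * [0 .. MAX_QUEUES-1] hold the RX zero-copy pools indexed by vmdq_rx_q
 * (see attach_rxmbuf_zcp()), while entries [MAX_QUEUES .. 2*MAX_QUEUES-1]
 * hold the matching TX pools (see virtio_tx_route_zcp()).
 */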
194 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
201 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
203 /* The type of host physical address translated from guest physical address. */
205 PHYS_ADDR_CONTINUOUS = 0,
206 PHYS_ADDR_CROSS_SUBREG = 1,
207 PHYS_ADDR_INVALID = 2,
212 static uint32_t enable_stats = 0;
213 /* Enable retries on RX. */
214 static uint32_t enable_retry = 1;
215 /* Specify timeout (in microseconds) between retries on RX. */
216 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
217 /* Specify the number of retries on RX. */
218 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
220 /* Character device basename. Can be set by user. */
221 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
224 /* Default configuration for rx and tx thresholds etc. */
225 static struct rte_eth_rxconf rx_conf_default = {
227 .pthresh = RX_PTHRESH,
228 .hthresh = RX_HTHRESH,
229 .wthresh = RX_WTHRESH,
235 * These default values are optimized for use with the Intel(R) 82599 10 GbE
236 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
237 * network controllers and/or network drivers.
239 static struct rte_eth_txconf tx_conf_default = {
241 .pthresh = TX_PTHRESH,
242 .hthresh = TX_HTHRESH,
243 .wthresh = TX_WTHRESH,
245 .tx_free_thresh = 0, /* Use PMD default values */
246 .tx_rs_thresh = 0, /* Use PMD default values */
249 /* Empty VMDq configuration structure. Filled in programmatically. */
250 static struct rte_eth_conf vmdq_conf_default = {
252 .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
254 .header_split = 0, /**< Header Split disabled */
255 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
256 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
258 * VLAN strip is necessary for 1G NICs such as the I350;
259 * it fixes a bug where IPv4 forwarding in the guest cannot
260 * forward packets from one virtio dev to another virtio dev.
262 .hw_vlan_strip = 1, /**< VLAN strip enabled. */
263 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */
264 .hw_strip_crc = 0, /**< CRC stripped by hardware */
268 .mq_mode = ETH_MQ_TX_NONE,
272 * should be overridden separately in code with
276 .nb_queue_pools = ETH_8_POOLS,
277 .enable_default_pool = 0,
280 .pool_map = {{0, 0},},
285 static unsigned lcore_ids[RTE_MAX_LCORE];
286 static uint8_t ports[RTE_MAX_ETHPORTS];
287 static unsigned num_ports = 0; /**< The number of ports specified in command line */
288 static uint16_t num_pf_queues, num_vmdq_queues;
289 static uint16_t vmdq_pool_base, vmdq_queue_base;
290 static uint16_t queues_per_pool;
292 static const uint16_t external_pkt_default_vlan_tag = 2000;
293 const uint16_t vlan_tags[] = {
294 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
295 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
296 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
297 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
298 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
299 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
300 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
301 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
304 /* ethernet addresses of ports */
305 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
307 /* heads for the main used and free linked lists for the data path. */
308 static struct virtio_net_data_ll *ll_root_used = NULL;
309 static struct virtio_net_data_ll *ll_root_free = NULL;
311 /* Array of data core structures containing information on individual core linked lists. */
312 static struct lcore_info lcore_info[RTE_MAX_LCORE];
314 /* Used for queueing bursts of TX packets. */
318 struct rte_mbuf *m_table[MAX_PKT_BURST];
321 /* TX queue for each data core. */
322 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
324 /* TX queue for each virtio device for zero copy. */
325 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
327 /* VLAN header struct used to insert VLAN tags on TX. */
329 unsigned char h_dest[ETH_ALEN];
330 unsigned char h_source[ETH_ALEN];
333 __be16 h_vlan_encapsulated_proto;
338 uint8_t version_ihl; /**< version and header length */
339 uint8_t type_of_service; /**< type of service */
340 uint16_t total_length; /**< length of packet */
341 uint16_t packet_id; /**< packet ID */
342 uint16_t fragment_offset; /**< fragmentation offset */
343 uint8_t time_to_live; /**< time to live */
344 uint8_t next_proto_id; /**< protocol ID */
345 uint16_t hdr_checksum; /**< header checksum */
346 uint32_t src_addr; /**< source address */
347 uint32_t dst_addr; /**< destination address */
348 } __attribute__((__packed__));
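/*
 * Size check: the fields above add up to 1+1+2+2+2+1+1+2+4+4 = 20 bytes,
 * an IPv4 header without options, which is the value assigned to
 * mbuf->l3_len in virtio_tx_route_zcp().
 */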
350 /* Header lengths. */
352 #define VLAN_ETH_HLEN 18
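/* 18 = 14-byte Ethernet header (dst MAC + src MAC + ethertype) + 4-byte 802.1Q tag. */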
354 /* Per-device statistics struct */
355 struct device_statistics {
357 rte_atomic64_t rx_total_atomic;
360 rte_atomic64_t rx_atomic;
362 } __rte_cache_aligned;
363 struct device_statistics dev_statistics[MAX_DEVICES];
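/*
 * Reading the counters (a sketch, assuming stats are enabled via --stats;
 * "device_fh" is a placeholder device handle used only for illustration):
 * the RX counters are rte_atomic64_t because the data cores update them
 * while another core prints them, e.g.
 *
 *   uint64_t rx = rte_atomic64_read(&dev_statistics[device_fh].rx_atomic);
 *   uint64_t rx_total =
 *       rte_atomic64_read(&dev_statistics[device_fh].rx_total_atomic);
 */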
366 * Builds up the correct configuration for VMDQ VLAN pool map
367 * according to the pool & queue limits.
370 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
372 struct rte_eth_vmdq_rx_conf conf;
373 struct rte_eth_vmdq_rx_conf *def_conf =
374 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
377 memset(&conf, 0, sizeof(conf));
378 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
379 conf.nb_pool_maps = num_devices;
380 conf.enable_loop_back = def_conf->enable_loop_back;
381 conf.rx_mode = def_conf->rx_mode;
383 for (i = 0; i < conf.nb_pool_maps; i++) {
384 conf.pool_map[i].vlan_id = vlan_tags[ i ];
385 conf.pool_map[i].pools = (1UL << i);
388 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
389 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf,
390 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
395 * Validate the device number against the max pool number obtained from
396 * dev_info. If the device number is invalid, print an error message and
397 * return -1. Each device must have its own pool.
400 validate_num_devices(uint32_t max_nb_devices)
402 if (num_devices > max_nb_devices) {
403 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
410 * Initialises a given port using global settings, with the RX buffers
411 * coming from the mbuf_pool passed as a parameter.
414 port_init(uint8_t port)
416 struct rte_eth_dev_info dev_info;
417 struct rte_eth_conf port_conf;
418 uint16_t rx_rings, tx_rings;
419 uint16_t rx_ring_size, tx_ring_size;
423 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
424 rte_eth_dev_info_get (port, &dev_info);
426 /*configure the number of supported virtio devices based on VMDQ limits */
427 num_devices = dev_info.max_vmdq_pools;
430 rx_ring_size = num_rx_descriptor;
431 tx_ring_size = num_tx_descriptor;
432 tx_rings = dev_info.max_tx_queues;
434 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
435 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
436 tx_rings = (uint16_t)rte_lcore_count();
439 retval = validate_num_devices(MAX_DEVICES);
443 /* Get port configuration. */
444 retval = get_eth_conf(&port_conf, num_devices);
447 /* NIC queues are divided into pf queues and vmdq queues. */
448 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
449 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
450 num_vmdq_queues = num_devices * queues_per_pool;
451 num_queues = num_pf_queues + num_vmdq_queues;
452 vmdq_queue_base = dev_info.vmdq_queue_base;
453 vmdq_pool_base = dev_info.vmdq_pool_base;
454 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
455 num_pf_queues, num_devices, queues_per_pool);
457 if (port >= rte_eth_dev_count()) return -1;
459 rx_rings = (uint16_t)dev_info.max_rx_queues;
460 /* Configure ethernet device. */
461 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
465 /* Setup the queues. */
466 for (q = 0; q < rx_rings; q ++) {
467 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
468 rte_eth_dev_socket_id(port), &rx_conf_default,
469 vpool_array[q].pool);
473 for (q = 0; q < tx_rings; q ++) {
474 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
475 rte_eth_dev_socket_id(port), &tx_conf_default);
480 /* Start the device. */
481 retval = rte_eth_dev_start(port);
483 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
488 rte_eth_promiscuous_enable(port);
490 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
491 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
492 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
493 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
495 vmdq_ports_eth_addr[port].addr_bytes[0],
496 vmdq_ports_eth_addr[port].addr_bytes[1],
497 vmdq_ports_eth_addr[port].addr_bytes[2],
498 vmdq_ports_eth_addr[port].addr_bytes[3],
499 vmdq_ports_eth_addr[port].addr_bytes[4],
500 vmdq_ports_eth_addr[port].addr_bytes[5]);
506 * Set character device basename.
509 us_vhost_parse_basename(const char *q_arg)
511 /* parse the basename string */
513 if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
516 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
522 * Parse the portmask provided at run time.
525 parse_portmask(const char *portmask)
532 /* parse hexadecimal string */
533 pm = strtoul(portmask, &end, 16);
534 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545 * Parse num options at run time.
548 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
555 /* parse unsigned int string */
556 num = strtoul(q_arg, &end, 10);
557 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
560 if (num > max_valid_value)
571 us_vhost_usage(const char *prgname)
573 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
575 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
576 " --dev-basename <name>\n"
578 " -p PORTMASK: Set mask for ports to be used by application\n"
579 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
580 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
581 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
582 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
583 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
584 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
585 " --dev-basename: The basename to be used for the character device.\n"
586 " --zero-copy [0|1]: disable(default)/enable rx/tx "
588 " --rx-desc-num [0-N]: the number of descriptors on rx, "
589 "used only when zero copy is enabled.\n"
590 " --tx-desc-num [0-N]: the number of descriptors on tx, "
591 "used only when zero copy is enabled.\n",
596 * Parse the arguments given in the command line of the application.
599 us_vhost_parse_args(int argc, char **argv)
604 const char *prgname = argv[0];
605 static struct option long_option[] = {
606 {"vm2vm", required_argument, NULL, 0},
607 {"rx-retry", required_argument, NULL, 0},
608 {"rx-retry-delay", required_argument, NULL, 0},
609 {"rx-retry-num", required_argument, NULL, 0},
610 {"mergeable", required_argument, NULL, 0},
611 {"stats", required_argument, NULL, 0},
612 {"dev-basename", required_argument, NULL, 0},
613 {"zero-copy", required_argument, NULL, 0},
614 {"rx-desc-num", required_argument, NULL, 0},
615 {"tx-desc-num", required_argument, NULL, 0},
619 /* Parse command line */
620 while ((opt = getopt_long(argc, argv, "p:P",
621 long_option, &option_index)) != EOF) {
625 enabled_port_mask = parse_portmask(optarg);
626 if (enabled_port_mask == 0) {
627 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
628 us_vhost_usage(prgname);
635 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
636 ETH_VMDQ_ACCEPT_BROADCAST |
637 ETH_VMDQ_ACCEPT_MULTICAST;
638 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
643 /* Enable/disable vm2vm comms. */
644 if (!strncmp(long_option[option_index].name, "vm2vm",
646 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
648 RTE_LOG(INFO, VHOST_CONFIG,
649 "Invalid argument for "
651 us_vhost_usage(prgname);
654 vm2vm_mode = (vm2vm_type)ret;
658 /* Enable/disable retries on RX. */
659 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
660 ret = parse_num_opt(optarg, 1);
662 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
663 us_vhost_usage(prgname);
670 /* Specify the retry delay time (in microseconds) on RX. */
671 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
672 ret = parse_num_opt(optarg, INT32_MAX);
674 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
675 us_vhost_usage(prgname);
678 burst_rx_delay_time = ret;
682 /* Specify the number of retries on RX. */
683 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
684 ret = parse_num_opt(optarg, INT32_MAX);
686 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
687 us_vhost_usage(prgname);
690 burst_rx_retry_num = ret;
694 /* Enable/disable RX mergeable buffers. */
695 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
696 ret = parse_num_opt(optarg, 1);
698 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
699 us_vhost_usage(prgname);
704 vmdq_conf_default.rxmode.jumbo_frame = 1;
705 vmdq_conf_default.rxmode.max_rx_pkt_len
706 = JUMBO_FRAME_MAX_SIZE;
711 /* Enable/disable stats. */
712 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
713 ret = parse_num_opt(optarg, INT32_MAX);
715 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
716 us_vhost_usage(prgname);
723 /* Set character device basename. */
724 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
725 if (us_vhost_parse_basename(optarg) == -1) {
726 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
727 us_vhost_usage(prgname);
732 /* Enable/disable rx/tx zero copy. */
733 if (!strncmp(long_option[option_index].name,
734 "zero-copy", MAX_LONG_OPT_SZ)) {
735 ret = parse_num_opt(optarg, 1);
737 RTE_LOG(INFO, VHOST_CONFIG,
739 " for zero-copy [0|1]\n");
740 us_vhost_usage(prgname);
746 #ifdef RTE_MBUF_REFCNT
747 RTE_LOG(ERR, VHOST_CONFIG, "Before running "
748 "zero copy vhost APP, please "
749 "disable RTE_MBUF_REFCNT\n"
750 "in config file and then rebuild DPDK "
752 "Otherwise please disable zero copy "
753 "flag in command line!\n");
759 /* Specify the number of RX descriptors. */
760 if (!strncmp(long_option[option_index].name,
761 "rx-desc-num", MAX_LONG_OPT_SZ)) {
762 ret = parse_num_opt(optarg, MAX_RING_DESC);
763 if ((ret == -1) || (!POWEROF2(ret))) {
764 RTE_LOG(INFO, VHOST_CONFIG,
765 "Invalid argument for rx-desc-num[0-N],"
766 "power of 2 required.\n");
767 us_vhost_usage(prgname);
770 num_rx_descriptor = ret;
774 /* Specify the number of TX descriptors. */
775 if (!strncmp(long_option[option_index].name,
776 "tx-desc-num", MAX_LONG_OPT_SZ)) {
777 ret = parse_num_opt(optarg, MAX_RING_DESC);
778 if ((ret == -1) || (!POWEROF2(ret))) {
779 RTE_LOG(INFO, VHOST_CONFIG,
780 "Invalid argument for tx-desc-num [0-N],"
781 "power of 2 required.\n");
782 us_vhost_usage(prgname);
785 num_tx_descriptor = ret;
791 /* Invalid option - print options. */
793 us_vhost_usage(prgname);
798 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
799 if (enabled_port_mask & (1 << i))
800 ports[num_ports++] = (uint8_t)i;
803 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
804 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
805 "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
809 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
810 RTE_LOG(INFO, VHOST_PORT,
811 "Vhost zero copy doesn't support software vm2vm,"
812 "please specify 'vm2vm 2' to use hardware vm2vm.\n");
816 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
817 RTE_LOG(INFO, VHOST_PORT,
818 "Vhost zero copy doesn't support jumbo frame,"
819 "please specify '--mergeable 0' to disable the "
820 "mergeable feature.\n");
828 * Update the global variable num_ports and the ports array according to the number of
829 * ports in the system, and return the number of valid ports.
831 static unsigned check_ports_num(unsigned nb_ports)
833 unsigned valid_num_ports = num_ports;
836 if (num_ports > nb_ports) {
837 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
838 num_ports, nb_ports);
839 num_ports = nb_ports;
842 for (portid = 0; portid < num_ports; portid ++) {
843 if (ports[portid] >= nb_ports) {
844 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
845 ports[portid], (nb_ports - 1));
846 ports[portid] = INVALID_PORT_ID;
850 return valid_num_ports;
854 * Macro to print out packet contents. Wrapped in debug define so that the
855 * data path is not affected when debug is disabled.
858 #define PRINT_PACKET(device, addr, size, header) do { \
859 char *pkt_addr = (char*)(addr); \
860 unsigned int index; \
861 char packet[MAX_PRINT_BUFF]; \
864 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
866 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
867 for (index = 0; index < (size); index++) { \
868 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
869 "%02hhx ", pkt_addr[index]); \
871 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
873 LOG_DEBUG(VHOST_DATA, "%s", packet); \
876 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
880 * Function to convert guest physical addresses to vhost physical addresses.
881 * This is used to convert virtio buffer addresses.
883 static inline uint64_t __attribute__((always_inline))
884 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
885 uint32_t buf_len, hpa_type *addr_type)
887 struct virtio_memory_regions_hpa *region;
889 uint64_t vhost_pa = 0;
891 *addr_type = PHYS_ADDR_INVALID;
893 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
894 region = &vdev->regions_hpa[regionidx];
895 if ((guest_pa >= region->guest_phys_address) &&
896 (guest_pa <= region->guest_phys_address_end)) {
897 vhost_pa = region->host_phys_addr_offset + guest_pa;
898 if (likely((guest_pa + buf_len - 1)
899 <= region->guest_phys_address_end))
900 *addr_type = PHYS_ADDR_CONTINUOUS;
902 *addr_type = PHYS_ADDR_CROSS_SUBREG;
907 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
908 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
909 (void *)(uintptr_t)vhost_pa);
915 * Compares a packet destination MAC address to a device MAC address.
917 static inline int __attribute__((always_inline))
918 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
920 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
924 * This function learns the MAC address of the device and registers this along with a
925 * vlan tag to a VMDQ.
928 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
930 struct ether_hdr *pkt_hdr;
931 struct virtio_net_data_ll *dev_ll;
932 struct virtio_net *dev = vdev->dev;
935 /* Learn MAC address of guest device from packet */
936 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
938 dev_ll = ll_root_used;
940 while (dev_ll != NULL) {
941 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
942 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
945 dev_ll = dev_ll->next;
948 for (i = 0; i < ETHER_ADDR_LEN; i++)
949 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
951 /* vlan_tag currently uses the device_id. */
952 vdev->vlan_tag = vlan_tags[dev->device_fh];
954 /* Print out VMDQ registration info. */
955 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
957 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
958 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
959 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
962 /* Register the MAC address. */
963 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
964 (uint32_t)dev->device_fh + vmdq_pool_base);
966 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
969 /* Enable stripping of the vlan tag as we handle routing. */
970 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
972 /* Set device as ready for RX. */
973 vdev->ready = DEVICE_RX;
979 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
980 * queue before disabling RX on the device.
983 unlink_vmdq(struct vhost_dev *vdev)
987 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
989 if (vdev->ready == DEVICE_RX) {
990 /*clear MAC and VLAN settings*/
991 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
992 for (i = 0; i < 6; i++)
993 vdev->mac_address.addr_bytes[i] = 0;
997 /*Clear out the receive buffers*/
998 rx_count = rte_eth_rx_burst(ports[0],
999 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1002 for (i = 0; i < rx_count; i++)
1003 rte_pktmbuf_free(pkts_burst[i]);
1005 rx_count = rte_eth_rx_burst(ports[0],
1006 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1009 vdev->ready = DEVICE_MAC_LEARNING;
1014 * Check if the packet destination MAC address is for a local device. If so then put
1015 * the packet on that device's RX queue. If not then return.
1017 static inline int __attribute__((always_inline))
1018 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1020 struct virtio_net_data_ll *dev_ll;
1021 struct ether_hdr *pkt_hdr;
1023 struct virtio_net *dev = vdev->dev;
1024 struct virtio_net *tdev; /* destination virtio device */
1026 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1028 /*get the used devices list*/
1029 dev_ll = ll_root_used;
1031 while (dev_ll != NULL) {
1032 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1033 &dev_ll->vdev->mac_address)) {
1035 /* Drop the packet if the TX packet is destined for the TX device. */
1036 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1037 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1041 tdev = dev_ll->vdev->dev;
1044 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1046 if (unlikely(dev_ll->vdev->remove)) {
1047 /*drop the packet if the device is marked for removal*/
1048 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1050 /*send the packet to the local virtio device*/
1051 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1054 &dev_statistics[tdev->device_fh].rx_total_atomic,
1057 &dev_statistics[tdev->device_fh].rx_atomic,
1059 dev_statistics[tdev->device_fh].tx_total++;
1060 dev_statistics[tdev->device_fh].tx += ret;
1066 dev_ll = dev_ll->next;
1073 * Check if the destination MAC of a packet belongs to a local VM,
1074 * and if so get its VLAN tag and length offset.
1076 static inline int __attribute__((always_inline))
1077 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1078 uint32_t *offset, uint16_t *vlan_tag)
1080 struct virtio_net_data_ll *dev_ll = ll_root_used;
1081 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1083 while (dev_ll != NULL) {
1084 if ((dev_ll->vdev->ready == DEVICE_RX)
1085 && ether_addr_cmp(&(pkt_hdr->d_addr),
1086 &dev_ll->vdev->mac_address)) {
1088 * Drop the packet if the TX packet is
1089 * destined for the TX device.
1091 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1092 LOG_DEBUG(VHOST_DATA,
1093 "(%"PRIu64") TX: Source and destination"
1094 " MAC addresses are the same. Dropping "
1096 dev_ll->vdev->dev->device_fh);
1101 * HW VLAN strip will reduce the packet length
1102 * by the length of the VLAN tag, so restore
1103 * the packet length by adding it back.
1105 *offset = VLAN_HLEN;
1108 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1110 LOG_DEBUG(VHOST_DATA,
1111 "(%"PRIu64") TX: pkt to local VM device id:"
1112 "(%"PRIu64") vlan tag: %d.\n",
1113 dev->device_fh, dev_ll->vdev->dev->device_fh,
1118 dev_ll = dev_ll->next;
1124 * This function routes the TX packet to the correct interface. This may be a local device
1125 * or the physical port.
1127 static inline void __attribute__((always_inline))
1128 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1130 struct mbuf_table *tx_q;
1131 struct rte_mbuf **m_table;
1132 unsigned len, ret, offset = 0;
1133 const uint16_t lcore_id = rte_lcore_id();
1134 struct virtio_net *dev = vdev->dev;
1136 /*check if destination is local VM*/
1137 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1138 rte_pktmbuf_free(m);
1142 if (vm2vm_mode == VM2VM_HARDWARE) {
1143 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
1144 offset > rte_pktmbuf_tailroom(m)) {
1145 rte_pktmbuf_free(m);
1150 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1152 /*Add packet to the port tx queue*/
1153 tx_q = &lcore_tx_queue[lcore_id];
1156 m->ol_flags = PKT_TX_VLAN_PKT;
1158 m->data_len += offset;
1159 m->pkt_len += offset;
1161 m->vlan_tci = vlan_tag;
1163 tx_q->m_table[len] = m;
1166 dev_statistics[dev->device_fh].tx_total++;
1167 dev_statistics[dev->device_fh].tx++;
1170 if (unlikely(len == MAX_PKT_BURST)) {
1171 m_table = (struct rte_mbuf **)tx_q->m_table;
1172 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1173 /* Free any buffers not handled by TX and update the port stats. */
1174 if (unlikely(ret < len)) {
1176 rte_pktmbuf_free(m_table[ret]);
1177 } while (++ret < len);
1187 * This function is called by each data core. It handles all RX/TX registered with the
1188 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1189 * with all devices in the main linked list.
1192 switch_worker(__attribute__((unused)) void *arg)
1194 struct rte_mempool *mbuf_pool = arg;
1195 struct virtio_net *dev = NULL;
1196 struct vhost_dev *vdev = NULL;
1197 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1198 struct virtio_net_data_ll *dev_ll;
1199 struct mbuf_table *tx_q;
1200 volatile struct lcore_ll_info *lcore_ll;
1201 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1202 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1204 const uint16_t lcore_id = rte_lcore_id();
1205 const uint16_t num_cores = (uint16_t)rte_lcore_count();
1206 uint16_t rx_count = 0;
1210 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1211 lcore_ll = lcore_info[lcore_id].lcore_ll;
1214 tx_q = &lcore_tx_queue[lcore_id];
1215 for (i = 0; i < num_cores; i ++) {
1216 if (lcore_ids[i] == lcore_id) {
1223 cur_tsc = rte_rdtsc();
1225 * TX burst queue drain
1227 diff_tsc = cur_tsc - prev_tsc;
1228 if (unlikely(diff_tsc > drain_tsc)) {
1231 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1233 /*Tx any packets in the queue*/
1234 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1235 (struct rte_mbuf **)tx_q->m_table,
1236 (uint16_t)tx_q->len);
1237 if (unlikely(ret < tx_q->len)) {
1239 rte_pktmbuf_free(tx_q->m_table[ret]);
1240 } while (++ret < tx_q->len);
1250 rte_prefetch0(lcore_ll->ll_root_used);
1252 * Inform the configuration core that we have exited the linked list and that no devices are
1253 * in use if requested.
1255 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1256 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1261 dev_ll = lcore_ll->ll_root_used;
1263 while (dev_ll != NULL) {
1264 /*get virtio device ID*/
1265 vdev = dev_ll->vdev;
1268 if (unlikely(vdev->remove)) {
1269 dev_ll = dev_ll->next;
1271 vdev->ready = DEVICE_SAFE_REMOVE;
1274 if (likely(vdev->ready == DEVICE_RX)) {
1276 rx_count = rte_eth_rx_burst(ports[0],
1277 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1281 * If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1282 * Here MAX_PKT_BURST must be less than the virtio queue size.
1284 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1285 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1286 rte_delay_us(burst_rx_delay_time);
1287 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1291 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1294 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1297 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1299 while (likely(rx_count)) {
1301 rte_pktmbuf_free(pkts_burst[rx_count]);
1307 if (likely(!vdev->remove)) {
1308 /* Handle guest TX*/
1309 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1310 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1311 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1312 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1314 rte_pktmbuf_free(pkts_burst[tx_count]);
1318 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1321 /*move to the next device in the list*/
1322 dev_ll = dev_ll->next;
1330 * This function gets the number of available ring entries for zero copy RX.
1331 * Only one thread will call this function for a particular virtio device,
1332 * so it is designed as a non-thread-safe function.
1334 static inline uint32_t __attribute__((always_inline))
1335 get_available_ring_num_zcp(struct virtio_net *dev)
1337 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1340 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1341 return (uint32_t)(avail_idx - vq->last_used_idx_res);
1345 * This function gets the available ring index for zero copy RX;
1346 * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1347 * Only one thread will call this function for a particular virtio device,
1348 * so it is designed as a non-thread-safe function.
1350 static inline uint32_t __attribute__((always_inline))
1351 get_available_ring_index_zcp(struct virtio_net *dev,
1352 uint16_t *res_base_idx, uint32_t count)
1354 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1357 uint16_t free_entries;
1359 *res_base_idx = vq->last_used_idx_res;
1360 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1361 free_entries = (avail_idx - *res_base_idx);
1363 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1365 "res base idx:%d, free entries:%d\n",
1366 dev->device_fh, avail_idx, *res_base_idx,
1370 * If retry is enabled and the queue is full then we wait
1371 * and retry to avoid packet loss.
1373 if (enable_retry && unlikely(count > free_entries)) {
1374 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1375 rte_delay_us(burst_rx_delay_time);
1376 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1377 free_entries = (avail_idx - *res_base_idx);
1378 if (count <= free_entries)
1383 /*check that we have enough buffers*/
1384 if (unlikely(count > free_entries))
1385 count = free_entries;
1387 if (unlikely(count == 0)) {
1388 LOG_DEBUG(VHOST_DATA,
1389 "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1390 "avail idx: %d, res base idx:%d, free entries:%d\n",
1391 dev->device_fh, avail_idx,
1392 *res_base_idx, free_entries);
1396 vq->last_used_idx_res = *res_base_idx + count;
1402 * This function puts a descriptor back on the used list.
1404 static inline void __attribute__((always_inline))
1405 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1407 uint16_t res_cur_idx = vq->last_used_idx;
1408 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1409 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1410 rte_compiler_barrier();
1411 *(volatile uint16_t *)&vq->used->idx += 1;
1412 vq->last_used_idx += 1;
1414 /* Kick the guest if necessary. */
1415 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1416 eventfd_write((int)vq->kickfd, 1);
1420 * This function gets an available descriptor from the virtio vring and an un-attached
1421 * mbuf from vpool->ring, and then attaches them together. It needs to adjust the offset
1422 * for buff_addr and phys_addr according to the PMD implementation, otherwise the
1423 * frame data may be put at the wrong location in the mbuf.
1425 static inline void __attribute__((always_inline))
1426 attach_rxmbuf_zcp(struct virtio_net *dev)
1428 uint16_t res_base_idx, desc_idx;
1429 uint64_t buff_addr, phys_addr;
1430 struct vhost_virtqueue *vq;
1431 struct vring_desc *desc;
1432 struct rte_mbuf *mbuf = NULL;
1433 struct vpool *vpool;
1435 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1437 vpool = &vpool_array[vdev->vmdq_rx_q];
1438 vq = dev->virtqueue[VIRTIO_RXQ];
1441 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1444 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1446 desc = &vq->desc[desc_idx];
1447 if (desc->flags & VRING_DESC_F_NEXT) {
1448 desc = &vq->desc[desc->next];
1449 buff_addr = gpa_to_vva(dev, desc->addr);
1450 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1453 buff_addr = gpa_to_vva(dev,
1454 desc->addr + vq->vhost_hlen);
1455 phys_addr = gpa_to_hpa(vdev,
1456 desc->addr + vq->vhost_hlen,
1457 desc->len, &addr_type);
1460 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1461 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1462 " address found when attaching RX frame buffer"
1463 " address!\n", dev->device_fh);
1464 put_desc_to_used_list_zcp(vq, desc_idx);
1469 * Check if the frame buffer address from guest crosses
1470 * sub-region or not.
1472 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1473 RTE_LOG(ERR, VHOST_DATA,
1474 "(%"PRIu64") Frame buffer address cross "
1475 "sub-regioin found when attaching RX frame "
1476 "buffer address!\n",
1478 put_desc_to_used_list_zcp(vq, desc_idx);
1481 } while (unlikely(phys_addr == 0));
1483 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1484 if (unlikely(mbuf == NULL)) {
1485 LOG_DEBUG(VHOST_DATA,
1486 "(%"PRIu64") in attach_rxmbuf_zcp: "
1487 "ring_sc_dequeue fail.\n",
1489 put_desc_to_used_list_zcp(vq, desc_idx);
1493 if (unlikely(vpool->buf_size > desc->len)) {
1494 LOG_DEBUG(VHOST_DATA,
1495 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1496 "length(%d) of descriptor idx: %d less than room "
1497 "size required: %d\n",
1498 dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1499 put_desc_to_used_list_zcp(vq, desc_idx);
1500 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1504 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1505 mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1506 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1507 mbuf->data_len = desc->len;
1508 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1510 LOG_DEBUG(VHOST_DATA,
1511 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1512 "descriptor idx:%d\n",
1513 dev->device_fh, res_base_idx, desc_idx);
1515 __rte_mbuf_raw_free(mbuf);
1521 * Detach an attached packet mbuf -
1522 * - restore original mbuf address and length values.
1523 * - reset pktmbuf data and data_len to their default values.
1524 * All other fields of the given packet mbuf will be left intact.
1527 * The attached packet mbuf.
1529 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1531 const struct rte_mempool *mp = m->pool;
1532 void *buf = RTE_MBUF_TO_BADDR(m);
1534 uint32_t buf_len = mp->elt_size - sizeof(*m);
1535 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1538 m->buf_len = (uint16_t)buf_len;
1540 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1541 RTE_PKTMBUF_HEADROOM : m->buf_len;
1542 m->data_off = buf_ofs;
1548 * This function is called after packets have been transmitted. It fetches mbufs
1549 * from vpool->pool, detaches them and puts them into vpool->ring. It also updates the
1550 * used index and kicks the guest if necessary.
1552 static inline uint32_t __attribute__((always_inline))
1553 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1555 struct rte_mbuf *mbuf;
1556 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1557 uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1559 uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1561 LOG_DEBUG(VHOST_DATA,
1562 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1564 dev->device_fh, mbuf_count);
1565 LOG_DEBUG(VHOST_DATA,
1566 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1568 dev->device_fh, rte_ring_count(vpool->ring));
1570 for (index = 0; index < mbuf_count; index++) {
1571 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1572 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1573 pktmbuf_detach_zcp(mbuf);
1574 rte_ring_sp_enqueue(vpool->ring, mbuf);
1576 /* Update used index buffer information. */
1577 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1578 vq->used->ring[used_idx].len = 0;
1580 used_idx = (used_idx + 1) & (vq->size - 1);
1583 LOG_DEBUG(VHOST_DATA,
1584 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1586 dev->device_fh, rte_mempool_count(vpool->pool));
1587 LOG_DEBUG(VHOST_DATA,
1588 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1590 dev->device_fh, rte_ring_count(vpool->ring));
1591 LOG_DEBUG(VHOST_DATA,
1592 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1593 "vq->last_used_idx:%d\n",
1594 dev->device_fh, vq->last_used_idx);
1596 vq->last_used_idx += mbuf_count;
1598 LOG_DEBUG(VHOST_DATA,
1599 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1600 "vq->last_used_idx:%d\n",
1601 dev->device_fh, vq->last_used_idx);
1603 rte_compiler_barrier();
1605 *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1607 /* Kick guest if required. */
1608 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1609 eventfd_write((int)vq->kickfd, 1);
1615 * This function is called when a virtio device is destroyed.
1616 * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1618 static void mbuf_destroy_zcp(struct vpool *vpool)
1620 struct rte_mbuf *mbuf = NULL;
1621 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1623 LOG_DEBUG(VHOST_CONFIG,
1624 "in mbuf_destroy_zcp: mbuf count in mempool before "
1625 "mbuf_destroy_zcp is: %d\n",
1627 LOG_DEBUG(VHOST_CONFIG,
1628 "in mbuf_destroy_zcp: mbuf count in ring before "
1629 "mbuf_destroy_zcp is : %d\n",
1630 rte_ring_count(vpool->ring));
1632 for (index = 0; index < mbuf_count; index++) {
1633 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1634 if (likely(mbuf != NULL)) {
1635 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1636 pktmbuf_detach_zcp(mbuf);
1637 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1641 LOG_DEBUG(VHOST_CONFIG,
1642 "in mbuf_destroy_zcp: mbuf count in mempool after "
1643 "mbuf_destroy_zcp is: %d\n",
1644 rte_mempool_count(vpool->pool));
1645 LOG_DEBUG(VHOST_CONFIG,
1646 "in mbuf_destroy_zcp: mbuf count in ring after "
1647 "mbuf_destroy_zcp is : %d\n",
1648 rte_ring_count(vpool->ring));
1652 * This function updates the used ring entries and counters (zero copy RX path).
1654 static inline uint32_t __attribute__((always_inline))
1655 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1658 struct vhost_virtqueue *vq;
1659 struct vring_desc *desc;
1660 struct rte_mbuf *buff;
1661 /* The virtio_hdr is initialised to 0. */
1662 struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1663 = {{0, 0, 0, 0, 0, 0}, 0};
1664 uint64_t buff_hdr_addr = 0;
1665 uint32_t head[MAX_PKT_BURST], packet_len = 0;
1666 uint32_t head_idx, packet_success = 0;
1667 uint16_t res_cur_idx;
1669 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1674 vq = dev->virtqueue[VIRTIO_RXQ];
1675 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1677 res_cur_idx = vq->last_used_idx;
1678 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1679 dev->device_fh, res_cur_idx, res_cur_idx + count);
1681 /* Retrieve all of the head indexes first to avoid caching issues. */
1682 for (head_idx = 0; head_idx < count; head_idx++)
1683 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1685 /*Prefetch descriptor index. */
1686 rte_prefetch0(&vq->desc[head[packet_success]]);
1688 while (packet_success != count) {
1689 /* Get descriptor from available ring */
1690 desc = &vq->desc[head[packet_success]];
1692 buff = pkts[packet_success];
1693 LOG_DEBUG(VHOST_DATA,
1694 "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1695 "pkt[%d] descriptor idx: %d\n",
1696 dev->device_fh, packet_success,
1697 MBUF_HEADROOM_UINT32(buff));
1700 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1701 + RTE_PKTMBUF_HEADROOM),
1702 rte_pktmbuf_data_len(buff), 0);
1704 /* Buffer address translation for virtio header. */
1705 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1706 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1709 * If the descriptors are chained the header and data are
1710 * placed in separate buffers.
1712 if (desc->flags & VRING_DESC_F_NEXT) {
1713 desc->len = vq->vhost_hlen;
1714 desc = &vq->desc[desc->next];
1715 desc->len = rte_pktmbuf_data_len(buff);
1717 desc->len = packet_len;
1720 /* Update used ring with desc information */
1721 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1722 = head[packet_success];
1723 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1728 /* A header is required per buffer. */
1729 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1730 (const void *)&virtio_hdr, vq->vhost_hlen);
1732 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1734 if (likely(packet_success < count)) {
1735 /* Prefetch descriptor index. */
1736 rte_prefetch0(&vq->desc[head[packet_success]]);
1740 rte_compiler_barrier();
1742 LOG_DEBUG(VHOST_DATA,
1743 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1744 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1745 dev->device_fh, vq->last_used_idx, vq->used->idx);
1747 *(volatile uint16_t *)&vq->used->idx += count;
1748 vq->last_used_idx += count;
1750 LOG_DEBUG(VHOST_DATA,
1751 "(%"PRIu64") in dev_rx_zcp: after update used idx: "
1752 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1753 dev->device_fh, vq->last_used_idx, vq->used->idx);
1755 /* Kick the guest if necessary. */
1756 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1757 eventfd_write((int)vq->kickfd, 1);
1763 * This function routes the TX packet to the correct interface.
1764 * This may be a local device or the physical port.
1766 static inline void __attribute__((always_inline))
1767 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1768 uint32_t desc_idx, uint8_t need_copy)
1770 struct mbuf_table *tx_q;
1771 struct rte_mbuf **m_table;
1772 struct rte_mbuf *mbuf = NULL;
1773 unsigned len, ret, offset = 0;
1774 struct vpool *vpool;
1775 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1776 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1778 /*Add packet to the port tx queue*/
1779 tx_q = &tx_queue_zcp[vmdq_rx_q];
1782 /* Allocate an mbuf and populate the structure. */
1783 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1784 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1785 if (unlikely(mbuf == NULL)) {
1786 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1787 RTE_LOG(ERR, VHOST_DATA,
1788 "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1790 put_desc_to_used_list_zcp(vq, desc_idx);
1794 if (vm2vm_mode == VM2VM_HARDWARE) {
1795 /* Avoid using a VLAN tag from any VM for an external packet, such as
1796 * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1797 * selection: the MAC address marks it as an external packet
1798 * which should go out to the network, while the VLAN tag marks it as
1799 * a vm2vm packet that should be forwarded to another VM. The hardware
1800 * cannot resolve such an ambiguous situation, so the packet would be lost.
1802 vlan_tag = external_pkt_default_vlan_tag;
1803 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1804 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1805 __rte_mbuf_raw_free(mbuf);
1810 mbuf->nb_segs = m->nb_segs;
1811 mbuf->next = m->next;
1812 mbuf->data_len = m->data_len + offset;
1813 mbuf->pkt_len = mbuf->data_len;
1814 if (unlikely(need_copy)) {
1815 /* Copy the packet contents to the mbuf. */
1816 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1817 rte_pktmbuf_mtod(m, void *),
1820 mbuf->data_off = m->data_off;
1821 mbuf->buf_physaddr = m->buf_physaddr;
1822 mbuf->buf_addr = m->buf_addr;
1824 mbuf->ol_flags = PKT_TX_VLAN_PKT;
1825 mbuf->vlan_tci = vlan_tag;
1826 mbuf->l2_len = sizeof(struct ether_hdr);
1827 mbuf->l3_len = sizeof(struct ipv4_hdr);
1828 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1830 tx_q->m_table[len] = mbuf;
1833 LOG_DEBUG(VHOST_DATA,
1834 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1837 (mbuf->next == NULL) ? "null" : "non-null");
1840 dev_statistics[dev->device_fh].tx_total++;
1841 dev_statistics[dev->device_fh].tx++;
1844 if (unlikely(len == MAX_PKT_BURST)) {
1845 m_table = (struct rte_mbuf **)tx_q->m_table;
1846 ret = rte_eth_tx_burst(ports[0],
1847 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1850 * Free any buffers not handled by TX and update
1853 if (unlikely(ret < len)) {
1855 rte_pktmbuf_free(m_table[ret]);
1856 } while (++ret < len);
1860 txmbuf_clean_zcp(dev, vpool);
1869 * This function TXes all available packets in the virtio TX queue for one
1870 * virtio-net device. If it is the first packet, it learns the MAC address and
1873 static inline void __attribute__((always_inline))
1874 virtio_dev_tx_zcp(struct virtio_net *dev)
1877 struct vhost_virtqueue *vq;
1878 struct vring_desc *desc;
1879 uint64_t buff_addr = 0, phys_addr;
1880 uint32_t head[MAX_PKT_BURST];
1882 uint16_t free_entries, packet_success = 0;
1884 uint8_t need_copy = 0;
1886 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1888 vq = dev->virtqueue[VIRTIO_TXQ];
1889 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1891 /* If there are no available buffers then return. */
1892 if (vq->last_used_idx_res == avail_idx)
1895 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1897 /* Prefetch available ring to retrieve head indexes. */
1898 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1900 /* Get the number of free entries in the ring */
1901 free_entries = (avail_idx - vq->last_used_idx_res);
1903 /* Limit to MAX_PKT_BURST. */
1905 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1907 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1908 dev->device_fh, free_entries);
1910 /* Retrieve all of the head indexes first to avoid caching issues. */
1911 for (i = 0; i < free_entries; i++)
1913 = vq->avail->ring[(vq->last_used_idx_res + i)
1916 vq->last_used_idx_res += free_entries;
1918 /* Prefetch descriptor index. */
1919 rte_prefetch0(&vq->desc[head[packet_success]]);
1920 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1922 while (packet_success < free_entries) {
1923 desc = &vq->desc[head[packet_success]];
1925 /* Discard first buffer as it is the virtio header */
1926 desc = &vq->desc[desc->next];
1928 /* Buffer address translation. */
1929 buff_addr = gpa_to_vva(dev, desc->addr);
1930 /* Need to check an extra VLAN_HLEN bytes for inserting the VLAN tag */
1931 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1934 if (likely(packet_success < (free_entries - 1)))
1935 /* Prefetch descriptor index. */
1936 rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1938 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1939 RTE_LOG(ERR, VHOST_DATA,
1940 "(%"PRIu64") Invalid frame buffer address found"
1941 "when TX packets!\n",
1947 /* Prefetch buffer address. */
1948 rte_prefetch0((void *)(uintptr_t)buff_addr);
1951 * Setup dummy mbuf. This is copied to a real mbuf if
1952 * transmitted out the physical port.
1954 m.data_len = desc->len;
1958 m.buf_addr = (void *)(uintptr_t)buff_addr;
1959 m.buf_physaddr = phys_addr;
1962 * Check if the frame buffer address from guest crosses
1963 * sub-region or not.
1965 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1966 RTE_LOG(ERR, VHOST_DATA,
1967 "(%"PRIu64") Frame buffer address cross "
1968 "sub-regioin found when attaching TX frame "
1969 "buffer address!\n",
1975 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1978 * If this is the first received packet we need to learn
1979 * the MAC and setup VMDQ
1981 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1982 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1984 * Discard frame if device is scheduled for
1985 * removal or a duplicate MAC address is found.
1987 packet_success += free_entries;
1988 vq->last_used_idx += packet_success;
1993 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1999 * This function is called by each data core. It handles all RX/TX registered
2000 * with the core. For TX the specific lcore linked list is used. For RX, MAC
2001 * addresses are compared with all devices in the main linked list.
2004 switch_worker_zcp(__attribute__((unused)) void *arg)
2006 struct virtio_net *dev = NULL;
2007 struct vhost_dev *vdev = NULL;
2008 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2009 struct virtio_net_data_ll *dev_ll;
2010 struct mbuf_table *tx_q;
2011 volatile struct lcore_ll_info *lcore_ll;
2012 const uint64_t drain_tsc
2013 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2014 * BURST_TX_DRAIN_US;
2015 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2017 const uint16_t lcore_id = rte_lcore_id();
2018 uint16_t count_in_ring, rx_count = 0;
2020 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2022 lcore_ll = lcore_info[lcore_id].lcore_ll;
2026 cur_tsc = rte_rdtsc();
2028 /* TX burst queue drain */
2029 diff_tsc = cur_tsc - prev_tsc;
2030 if (unlikely(diff_tsc > drain_tsc)) {
2032 * Get mbufs from vpool.pool, detach them and
2033 * put them back into vpool.ring.
2035 dev_ll = lcore_ll->ll_root_used;
2036 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2037 /* Get virtio device ID */
2038 vdev = dev_ll->vdev;
2041 if (likely(!vdev->remove)) {
2042 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2044 LOG_DEBUG(VHOST_DATA,
2045 "TX queue drained after timeout"
2046 " with burst size %u\n",
2050 * Tx any packets in the queue
2052 ret = rte_eth_tx_burst(
2054 (uint16_t)tx_q->txq_id,
2055 (struct rte_mbuf **)
2057 (uint16_t)tx_q->len);
2058 if (unlikely(ret < tx_q->len)) {
2061 tx_q->m_table[ret]);
2062 } while (++ret < tx_q->len);
2066 txmbuf_clean_zcp(dev,
2067 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2070 dev_ll = dev_ll->next;
2075 rte_prefetch0(lcore_ll->ll_root_used);
2078 * Inform the configuration core that we have exited the linked
2079 * list and that no devices are in use if requested.
2081 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2082 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
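/*
 * destroy_device() sets REQUEST_DEV_REMOVAL and then waits until every
 * data core has answered with ACK_DEV_REMOVAL, so acknowledging here
 * guarantees this core no longer references the entry being removed.
 */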
2084 /* Process devices */
2085 dev_ll = lcore_ll->ll_root_used;
2087 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2088 vdev = dev_ll->vdev;
2090 if (unlikely(vdev->remove)) {
2091 dev_ll = dev_ll->next;
2093 vdev->ready = DEVICE_SAFE_REMOVE;
2097 if (likely(vdev->ready == DEVICE_RX)) {
2098 uint32_t index = vdev->vmdq_rx_q;
2101 = rte_ring_count(vpool_array[index].ring);
2102 uint16_t free_entries
2103 = (uint16_t)get_available_ring_num_zcp(dev);
2106 * Attach all mbufs in vpool.ring and put back
2110 i < RTE_MIN(free_entries,
2111 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2113 attach_rxmbuf_zcp(dev);
2115 /* Handle guest RX */
2116 rx_count = rte_eth_rx_burst(ports[0],
2117 vdev->vmdq_rx_q, pkts_burst,
2121 ret_count = virtio_dev_rx_zcp(dev,
2122 pkts_burst, rx_count);
2124 dev_statistics[dev->device_fh].rx_total
2126 dev_statistics[dev->device_fh].rx
2129 while (likely(rx_count)) {
2132 pkts_burst[rx_count]);
2133 rte_ring_sp_enqueue(
2134 vpool_array[index].ring,
2135 (void *)pkts_burst[rx_count]);
2140 if (likely(!vdev->remove))
2141 /* Handle guest TX */
2142 virtio_dev_tx_zcp(dev);
2144 /* Move to the next device in the list */
2145 dev_ll = dev_ll->next;
2154 * Add an entry to a used linked list. A free entry must first be found
2155 * in the free linked list using get_data_ll_free_entry();
2158 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2159 struct virtio_net_data_ll *ll_dev)
2161 struct virtio_net_data_ll *ll = *ll_root_addr;
2163 /* Set next as NULL and use a compiler barrier to avoid reordering. */
2164 ll_dev->next = NULL;
2165 rte_compiler_barrier();
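/*
 * The barrier ensures ll_dev->next is cleared before the entry becomes
 * visible in the list, since data cores traverse these lists without locks.
 */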
2167 /* If ll == NULL then this is the first device. */
2169 /* Increment to the tail of the linked list. */
2170 while (ll->next != NULL)
2175 *ll_root_addr = ll_dev;
2180 * Remove an entry from a used linked list. The entry must then be added to
2181 * the free linked list using put_data_ll_free_entry().
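*
* The list is singly linked, so the caller passes ll_dev_last, the entry that
* precedes ll_dev; when ll_dev is the head, ll_dev_last is NULL and the root
* pointer is advanced instead.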
2184 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2185 struct virtio_net_data_ll *ll_dev,
2186 struct virtio_net_data_ll *ll_dev_last)
2188 struct virtio_net_data_ll *ll = *ll_root_addr;
2190 if (unlikely((ll == NULL) || (ll_dev == NULL)))
2194 *ll_root_addr = ll_dev->next;
2196 if (likely(ll_dev_last != NULL))
2197 ll_dev_last->next = ll_dev->next;
2199 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2203 * Find and return an entry from the free linked list.
2205 static struct virtio_net_data_ll *
2206 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2208 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2209 struct virtio_net_data_ll *ll_dev;
2211 if (ll_free == NULL)
2215 *ll_root_addr = ll_free->next;
2221 * Place an entry back onto the free linked list.
2224 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2225 struct virtio_net_data_ll *ll_dev)
2227 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2232 ll_dev->next = ll_free;
2233 *ll_root_addr = ll_dev;
2237 * Creates a linked list of a given size.
2239 static struct virtio_net_data_ll *
2240 alloc_data_ll(uint32_t size)
2242 struct virtio_net_data_ll *ll_new;
2245 /* Malloc and then chain the linked list. */
2246 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2247 if (ll_new == NULL) {
2248 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2252 for (i = 0; i < size - 1; i++) {
2253 ll_new[i].vdev = NULL;
2254 ll_new[i].next = &ll_new[i+1];
2256 ll_new[i].next = NULL;
2262 * Create the main linked list along with each individual core's linked list. A used and a free list
2263 * are created to manage entries.
2270 RTE_LCORE_FOREACH_SLAVE(lcore) {
2271 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2272 if (lcore_info[lcore].lcore_ll == NULL) {
2273 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2277 lcore_info[lcore].lcore_ll->device_num = 0;
2278 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2279 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2280 if (num_devices % num_switching_cores)
2281 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2283 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
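/*
 * Each switching core gets a free list sized for its share of the devices:
 * num_devices / num_switching_cores entries, rounded up when the division
 * leaves a remainder.
 */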
2286 /* Allocate devices up to a maximum of MAX_DEVICES. */
2287 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2293 * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2294 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2295 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
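*
* Removal sequence: set the remove flag, wait for the data core to move the
* device to DEVICE_SAFE_REMOVE, unlink the entry from the per-core and main
* lists, then wait for every core to acknowledge dev_removal_flag before the
* entries are returned to the free lists.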
2298 destroy_device (volatile struct virtio_net *dev)
2300 struct virtio_net_data_ll *ll_lcore_dev_cur;
2301 struct virtio_net_data_ll *ll_main_dev_cur;
2302 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2303 struct virtio_net_data_ll *ll_main_dev_last = NULL;
2304 struct vhost_dev *vdev;
2307 dev->flags &= ~VIRTIO_DEV_RUNNING;
2309 vdev = (struct vhost_dev *)dev->priv;
2310 /* Set the remove flag. */
2312 while(vdev->ready != DEVICE_SAFE_REMOVE) {
2316 /* Search for entry to be removed from lcore ll */
2317 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2318 while (ll_lcore_dev_cur != NULL) {
2319 if (ll_lcore_dev_cur->vdev == vdev) {
2322 ll_lcore_dev_last = ll_lcore_dev_cur;
2323 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2327 if (ll_lcore_dev_cur == NULL) {
2328 RTE_LOG(ERR, VHOST_CONFIG,
2329 "(%"PRIu64") Failed to find the dev to be destroy.\n",
2334 /* Search for entry to be removed from main ll */
2335 ll_main_dev_cur = ll_root_used;
2336 ll_main_dev_last = NULL;
2337 while (ll_main_dev_cur != NULL) {
2338 if (ll_main_dev_cur->vdev == vdev) {
2341 ll_main_dev_last = ll_main_dev_cur;
2342 ll_main_dev_cur = ll_main_dev_cur->next;
2346 /* Remove entries from the lcore and main ll. */
2347 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2348 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2350 /* Set the dev_removal_flag on each lcore. */
2351 RTE_LCORE_FOREACH_SLAVE(lcore) {
2352 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2356 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2357 * they can no longer access the device removed from the linked lists and that the devices
2358 * are no longer in use.
2360 RTE_LCORE_FOREACH_SLAVE(lcore) {
2361 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2366 /* Add the entries back to the lcore and main free ll.*/
2367 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2368 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2370 /* Decrement the number of devices on the lcore. */
2371 lcore_info[vdev->coreid].lcore_ll->device_num--;
2373 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2376 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2378 /* Stop the RX queue. */
2379 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2380 LOG_DEBUG(VHOST_CONFIG,
2381 "(%"PRIu64") In destroy_device: Failed to stop "
2387 LOG_DEBUG(VHOST_CONFIG,
2388 "(%"PRIu64") in destroy_device: Start put mbuf in "
2389 "mempool back to ring for RX queue: %d\n",
2390 dev->device_fh, vdev->vmdq_rx_q);
2392 mbuf_destroy_zcp(vpool);
2394 /* Stop the TX queue. */
2395 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2396 LOG_DEBUG(VHOST_CONFIG,
2397 "(%"PRIu64") In destroy_device: Failed to "
2398 "stop tx queue:%d\n",
2399 dev->device_fh, vdev->vmdq_rx_q);
2402 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2404 LOG_DEBUG(VHOST_CONFIG,
2405 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2406 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2407 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2410 mbuf_destroy_zcp(vpool);
2411 rte_free(vdev->regions_hpa);
2418 * Calculate the number of physically contiguous sub-regions within one
2419 * particular region whose vhost virtual address range is contiguous. The
2420 * region starts at vva_start and is 'size' bytes long.
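*
* The range is walked one page at a time with rte_mem_virt2phy(); each time
* two neighbouring pages turn out not to be physically contiguous, one more
* region is counted.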
2423 check_hpa_regions(uint64_t vva_start, uint64_t size)
2425 uint32_t i, nregions = 0, page_size = getpagesize();
2426 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2427 if (vva_start % page_size) {
2428 LOG_DEBUG(VHOST_CONFIG,
2429 "in check_countinous: vva start(%p) mod page_size(%d) "
2431 (void *)(uintptr_t)vva_start, page_size);
2434 if (size % page_size) {
2435 LOG_DEBUG(VHOST_CONFIG,
2436 "in check_countinous: "
2437 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2441 for (i = 0; i < size - page_size; i = i + page_size) {
2443 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2444 next_phys_addr = rte_mem_virt2phy(
2445 (void *)(uintptr_t)(vva_start + i + page_size));
2446 if ((cur_phys_addr + page_size) != next_phys_addr) {
2448 LOG_DEBUG(VHOST_CONFIG,
2449 "in check_continuous: hva addr:(%p) is not "
2450 "continuous with hva addr:(%p), diff:%d\n",
2451 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2452 (void *)(uintptr_t)(vva_start + (uint64_t)i
2453 + page_size), page_size);
2454 LOG_DEBUG(VHOST_CONFIG,
2455 "in check_continuous: hpa addr:(%p) is not "
2456 "continuous with hpa addr:(%p), "
2457 "diff:(%"PRIu64")\n",
2458 (void *)(uintptr_t)cur_phys_addr,
2459 (void *)(uintptr_t)next_phys_addr,
2460 (next_phys_addr-cur_phys_addr));
2467 * Divide each region whose vhost virtual address range is contiguous into
2468 * sub-regions, making sure the physical addresses within each sub-region are
2469 * contiguous, and fill the offset (to GPA), size and other information of
2470 * each sub-region into regions_hpa.
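*
* Like check_hpa_regions(), this walks each guest region page by page with
* rte_mem_virt2phy(); whenever physical contiguity breaks, the current
* sub-region is closed off (end address and size recorded) and a new one is
* started at the next page. The return value is the number of sub-regions
* filled in.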
2473 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2475 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2476 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2478 if (mem_region_hpa == NULL)
2481 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2482 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2483 virtio_memory->regions[regionidx].address_offset;
2484 mem_region_hpa[regionidx_hpa].guest_phys_address
2485 = virtio_memory->regions[regionidx].guest_phys_address;
2486 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2487 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2488 mem_region_hpa[regionidx_hpa].guest_phys_address;
2489 LOG_DEBUG(VHOST_CONFIG,
2490 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2493 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2494 LOG_DEBUG(VHOST_CONFIG,
2495 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2498 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2500 i < virtio_memory->regions[regionidx].memory_size -
2503 cur_phys_addr = rte_mem_virt2phy(
2504 (void *)(uintptr_t)(vva_start + i));
2505 next_phys_addr = rte_mem_virt2phy(
2506 (void *)(uintptr_t)(vva_start +
2508 if ((cur_phys_addr + page_size) != next_phys_addr) {
2509 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2510 mem_region_hpa[regionidx_hpa].guest_phys_address +
2512 mem_region_hpa[regionidx_hpa].memory_size
2514 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2515 "phys addr end [%d]:(%p)\n",
2518 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2519 LOG_DEBUG(VHOST_CONFIG,
2520 "in fill_hpa_regions: guest phys addr "
2524 (mem_region_hpa[regionidx_hpa].memory_size));
2525 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2526 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2528 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2530 mem_region_hpa[regionidx_hpa].guest_phys_address;
2531 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2532 " phys addr start[%d]:(%p)\n",
2535 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2536 LOG_DEBUG(VHOST_CONFIG,
2537 "in fill_hpa_regions: host phys addr "
2541 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2547 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2548 = mem_region_hpa[regionidx_hpa].guest_phys_address
2550 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2551 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2552 "[%d]:(%p)\n", regionidx_hpa,
2554 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2555 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2556 "[%d]:(%p)\n", regionidx_hpa,
2558 (mem_region_hpa[regionidx_hpa].memory_size));
2561 return regionidx_hpa;
2565 * A new device is added to a data core. First the device is added to the main linked list
2566 * and then allocated to a specific data core.
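*
* new_device() also precomputes the guest-physical to host-physical mapping
* (regions_hpa), assigns the device a VMDQ RX queue, and in zero-copy mode
* attaches guest buffers and starts the deferred RX/TX queues before handing
* the device to the least-loaded data core.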
2569 new_device (struct virtio_net *dev)
2571 struct virtio_net_data_ll *ll_dev;
2572 int lcore, core_add = 0;
2573 uint32_t device_num_min = num_devices;
2574 struct vhost_dev *vdev;
2577 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2579 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2587 vdev->nregions_hpa = dev->mem->nregions;
2588 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2590 += check_hpa_regions(
2591 dev->mem->regions[regionidx].guest_phys_address
2592 + dev->mem->regions[regionidx].address_offset,
2593 dev->mem->regions[regionidx].memory_size);
2597 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2598 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2599 RTE_CACHE_LINE_SIZE);
2600 if (vdev->regions_hpa == NULL) {
2601 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2607 if (fill_hpa_memory_regions(
2608 vdev->regions_hpa, dev->mem
2609 ) != vdev->nregions_hpa) {
2611 RTE_LOG(ERR, VHOST_CONFIG,
2612 "hpa memory regions number mismatch: "
2613 "[%d]\n", vdev->nregions_hpa);
2614 rte_free(vdev->regions_hpa);
2621 /* Add device to main ll */
2622 ll_dev = get_data_ll_free_entry(&ll_root_free);
2623 if (ll_dev == NULL) {
2624 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2625 "of %d devices per core has been reached\n",
2626 dev->device_fh, num_devices);
2627 if (vdev->regions_hpa)
2628 rte_free(vdev->regions_hpa);
2632 ll_dev->vdev = vdev;
2633 add_data_ll_entry(&ll_root_used, ll_dev);
2635 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2638 uint32_t index = vdev->vmdq_rx_q;
2639 uint32_t count_in_ring, i;
2640 struct mbuf_table *tx_q;
2642 count_in_ring = rte_ring_count(vpool_array[index].ring);
2644 LOG_DEBUG(VHOST_CONFIG,
2645 "(%"PRIu64") in new_device: mbuf count in mempool "
2646 "before attach is: %d\n",
2648 rte_mempool_count(vpool_array[index].pool));
2649 LOG_DEBUG(VHOST_CONFIG,
2650 "(%"PRIu64") in new_device: mbuf count in ring "
2651 "before attach is : %d\n",
2652 dev->device_fh, count_in_ring);
2655 * Attach all mbufs in vpool.ring and put back into vpool.pool.
2657 for (i = 0; i < count_in_ring; i++)
2658 attach_rxmbuf_zcp(dev);
2660 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2661 "mempool after attach is: %d\n",
2663 rte_mempool_count(vpool_array[index].pool));
2664 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2665 "ring after attach is : %d\n",
2667 rte_ring_count(vpool_array[index].ring));
2669 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2670 tx_q->txq_id = vdev->vmdq_rx_q;
2672 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2673 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2675 LOG_DEBUG(VHOST_CONFIG,
2676 "(%"PRIu64") In new_device: Failed to start "
2678 dev->device_fh, vdev->vmdq_rx_q);
2680 mbuf_destroy_zcp(vpool);
2681 rte_free(vdev->regions_hpa);
2686 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2687 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2689 LOG_DEBUG(VHOST_CONFIG,
2690 "(%"PRIu64") In new_device: Failed to start "
2692 dev->device_fh, vdev->vmdq_rx_q);
2694 /* Stop the TX queue. */
2695 if (rte_eth_dev_tx_queue_stop(ports[0],
2696 vdev->vmdq_rx_q) != 0) {
2697 LOG_DEBUG(VHOST_CONFIG,
2698 "(%"PRIu64") In new_device: Failed to "
2699 "stop tx queue:%d\n",
2700 dev->device_fh, vdev->vmdq_rx_q);
2703 mbuf_destroy_zcp(vpool);
2704 rte_free(vdev->regions_hpa);
2711 /* Reset the ready flag. */
2712 vdev->ready = DEVICE_MAC_LEARNING;
2715 /* Find a suitable lcore to add the device. */
2716 RTE_LCORE_FOREACH_SLAVE(lcore) {
2717 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2718 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2722 /* Add device to lcore ll */
2723 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2724 if (ll_dev == NULL) {
2725 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2726 vdev->ready = DEVICE_SAFE_REMOVE;
2727 destroy_device(dev);
2728 if (vdev->regions_hpa)
2729 rte_free(vdev->regions_hpa);
2733 ll_dev->vdev = vdev;
2734 vdev->coreid = core_add;
2736 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2738 /* Initialize device stats */
2739 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2741 /* Disable notifications. */
2742 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2743 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2744 lcore_info[vdev->coreid].lcore_ll->device_num++;
2745 dev->flags |= VIRTIO_DEV_RUNNING;
2747 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2753 * These callbacks allow devices to be added to the data core when configuration
2754 * has been fully completed.
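*
* The structure is passed to rte_vhost_driver_callback_register() in main(),
* so the vhost library invokes new_device()/destroy_device() as guests
* connect and disconnect.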
2756 static const struct virtio_net_device_ops virtio_net_device_ops =
2758 .new_device = new_device,
2759 .destroy_device = destroy_device,
2763 * This is a thread that wakes up after a period to print stats if the user has
2769 struct virtio_net_data_ll *dev_ll;
2770 uint64_t tx_dropped, rx_dropped;
2771 uint64_t tx, tx_total, rx, rx_total;
2773 const char clr[] = { 27, '[', '2', 'J', '\0' };
2774 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2777 sleep(enable_stats);
2779 /* Clear screen and move to top left */
2780 printf("%s%s", clr, top_left);
2782 printf("\nDevice statistics ====================================");
2784 dev_ll = ll_root_used;
2785 while (dev_ll != NULL) {
2786 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2787 tx_total = dev_statistics[device_fh].tx_total;
2788 tx = dev_statistics[device_fh].tx;
2789 tx_dropped = tx_total - tx;
2790 if (zero_copy == 0) {
2791 rx_total = rte_atomic64_read(
2792 &dev_statistics[device_fh].rx_total_atomic);
2793 rx = rte_atomic64_read(
2794 &dev_statistics[device_fh].rx_atomic);
2796 rx_total = dev_statistics[device_fh].rx_total;
2797 rx = dev_statistics[device_fh].rx;
2799 rx_dropped = rx_total - rx;
2801 printf("\nStatistics for device %"PRIu32" ------------------------------"
2802 "\nTX total: %"PRIu64""
2803 "\nTX dropped: %"PRIu64""
2804 "\nTX successful: %"PRIu64""
2805 "\nRX total: %"PRIu64""
2806 "\nRX dropped: %"PRIu64""
2807 "\nRX successful: %"PRIu64"",
2816 dev_ll = dev_ll->next;
2818 printf("\n======================================================\n");
2823 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2824 char *ring_name, uint32_t nb_mbuf)
2826 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2827 vpool_array[index].pool
2828 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2829 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2830 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2831 rte_pktmbuf_init, NULL, socket, 0);
2832 if (vpool_array[index].pool != NULL) {
2833 vpool_array[index].ring
2834 = rte_ring_create(ring_name,
2835 rte_align32pow2(nb_mbuf + 1),
2836 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2837 if (likely(vpool_array[index].ring != NULL)) {
2838 LOG_DEBUG(VHOST_CONFIG,
2839 "in setup_mempool_tbl: mbuf count in "
2841 rte_mempool_count(vpool_array[index].pool));
2842 LOG_DEBUG(VHOST_CONFIG,
2843 "in setup_mempool_tbl: mbuf count in "
2845 rte_ring_count(vpool_array[index].ring));
2847 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2851 /* Need to take the headroom into account. */
2852 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
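/*
 * roomsize includes the mbuf headroom, so the usable data area per buffer
 * works out to exactly VIRTIO_DESCRIPTOR_LEN_ZCP bytes.
 */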
2854 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2860 * Main function, does initialisation and calls the per-lcore functions. The CUSE
2861 * device is also registered here to handle the IOCTLs.
2864 main(int argc, char *argv[])
2866 struct rte_mempool *mbuf_pool = NULL;
2867 unsigned lcore_id, core_id = 0;
2868 unsigned nb_ports, valid_num_ports;
2872 static pthread_t tid;
2875 ret = rte_eal_init(argc, argv);
2877 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2881 /* parse app arguments */
2882 ret = us_vhost_parse_args(argc, argv);
2884 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2886 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2887 if (rte_lcore_is_enabled(lcore_id))
2888 lcore_ids[core_id++] = lcore_id;
2890 if (rte_lcore_count() > RTE_MAX_LCORE)
2891 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2893 /* Set the number of switching cores available. */
2894 num_switching_cores = rte_lcore_count() - 1;
2896 /* Get the number of physical ports. */
2897 nb_ports = rte_eth_dev_count();
2898 if (nb_ports > RTE_MAX_ETHPORTS)
2899 nb_ports = RTE_MAX_ETHPORTS;
2902 * Update the global variable NUM_PORTS and the global array PORTS,
2903 * and set VALID_NUM_PORTS according to the number of ports in the system.
2905 valid_num_ports = check_ports_num(nb_ports);
2907 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2908 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2909 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2913 if (zero_copy == 0) {
2914 /* Create the mbuf pool. */
2915 mbuf_pool = rte_mempool_create(
2919 MBUF_SIZE, MBUF_CACHE_SIZE,
2920 sizeof(struct rte_pktmbuf_pool_private),
2921 rte_pktmbuf_pool_init, NULL,
2922 rte_pktmbuf_init, NULL,
2923 rte_socket_id(), 0);
2924 if (mbuf_pool == NULL)
2925 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2927 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2928 vpool_array[queue_id].pool = mbuf_pool;
2930 if (vm2vm_mode == VM2VM_HARDWARE) {
2931 /* Enable VT loopback so the hardware L2 switch forwards VM-to-VM traffic. */
2932 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2933 LOG_DEBUG(VHOST_CONFIG,
2934 "Enable loop back for L2 switch in vmdq.\n");
2938 char pool_name[RTE_MEMPOOL_NAMESIZE];
2939 char ring_name[RTE_MEMPOOL_NAMESIZE];
2942 * Zero copy defers queue RX/TX start to the time when guest
2943 * finishes its startup and packet buffers from that guest are
2946 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2947 rx_conf_default.rx_drop_en = 0;
2948 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2949 nb_mbuf = num_rx_descriptor
2950 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2951 + num_switching_cores * MAX_PKT_BURST;
2953 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2954 snprintf(pool_name, sizeof(pool_name),
2955 "rxmbuf_pool_%u", queue_id);
2956 snprintf(ring_name, sizeof(ring_name),
2957 "rxmbuf_ring_%u", queue_id);
2958 setup_mempool_tbl(rte_socket_id(), queue_id,
2959 pool_name, ring_name, nb_mbuf);
2962 nb_mbuf = num_tx_descriptor
2963 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2964 + num_switching_cores * MAX_PKT_BURST;
2966 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2967 snprintf(pool_name, sizeof(pool_name),
2968 "txmbuf_pool_%u", queue_id);
2969 snprintf(ring_name, sizeof(ring_name),
2970 "txmbuf_ring_%u", queue_id);
2971 setup_mempool_tbl(rte_socket_id(),
2972 (queue_id + MAX_QUEUES),
2973 pool_name, ring_name, nb_mbuf);
2976 if (vm2vm_mode == VM2VM_HARDWARE) {
2977 /* Enable VT loopback so the hardware L2 switch forwards VM-to-VM traffic. */
2978 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2979 LOG_DEBUG(VHOST_CONFIG,
2980 "Enable loop back for L2 switch in vmdq.\n");
2983 /* Set log level. */
2984 rte_set_log_level(LOG_LEVEL);
2986 /* initialize all ports */
2987 for (portid = 0; portid < nb_ports; portid++) {
2988 /* skip ports that are not enabled */
2989 if ((enabled_port_mask & (1 << portid)) == 0) {
2990 RTE_LOG(INFO, VHOST_PORT,
2991 "Skipping disabled port %d\n", portid);
2994 if (port_init(portid) != 0)
2995 rte_exit(EXIT_FAILURE,
2996 "Cannot initialize network ports\n");
2999 /* Initialise all linked lists. */
3000 if (init_data_ll() == -1)
3001 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3003 /* Initialize device stats */
3004 memset(&dev_statistics, 0, sizeof(dev_statistics));
3006 /* Enable stats if the user option is set. */
3008 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3010 /* Launch all data cores. */
3011 if (zero_copy == 0) {
3012 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3013 rte_eal_remote_launch(switch_worker,
3014 mbuf_pool, lcore_id);
3017 uint32_t count_in_mempool, index, i;
3018 for (index = 0; index < 2*MAX_QUEUES; index++) {
3019 /* For all RX and TX queues. */
3021 = rte_mempool_count(vpool_array[index].pool);
3024 * Transfer all un-attached mbufs from vpool.pool
3027 for (i = 0; i < count_in_mempool; i++) {
3028 struct rte_mbuf *mbuf
3029 = __rte_mbuf_raw_alloc(
3030 vpool_array[index].pool);
3031 rte_ring_sp_enqueue(vpool_array[index].ring,
3035 LOG_DEBUG(VHOST_CONFIG,
3036 "in main: mbuf count in mempool at initial "
3037 "is: %d\n", count_in_mempool);
3038 LOG_DEBUG(VHOST_CONFIG,
3039 "in main: mbuf count in ring at initial is :"
3041 rte_ring_count(vpool_array[index].ring));
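/*
 * At this point every free mbuf has been moved from the mempool into the
 * ring, from where the data cores will later attach them to guest buffers
 * via attach_rxmbuf_zcp().
 */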
3044 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3045 rte_eal_remote_launch(switch_worker_zcp, NULL,
3050 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
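/*
 * With VIRTIO_NET_F_MRG_RXBUF disabled the guest does not use mergeable RX
 * buffers, so each packet occupies a single descriptor chain on the paths
 * handled by this example.
 */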
3052 /* Register CUSE device to handle IOCTLs. */
3053 ret = rte_vhost_driver_register((char *)&dev_basename);
3055 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3057 rte_vhost_driver_callback_register(&virtio_net_device_ops);
3059 /* Start CUSE session. */
3060 rte_vhost_driver_session_start();