4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <arpa/inet.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
56 #define MAX_QUEUES 512
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
62 * Calculate the number of buffers needed per port
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
65 (num_switching_cores*MAX_PKT_BURST) + \
66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 (num_switching_cores*MBUF_CACHE_SIZE))
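/*
 * The per-port mbuf budget above covers: one mbuf per RX descriptor across all
 * queues, plus, per switching core, one burst of packets in flight, one TX
 * ring's worth of descriptors, and one mempool cache.
 */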
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
* No frame data buffers allocated from the host are required for the zero copy
* implementation; the guest allocates the frame data buffers, and vhost
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
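/*
 * Zero copy mbufs are recycled through the per-queue vpool rings (see
 * attach_rxmbuf_zcp()/txmbuf_clean_zcp()) rather than a mempool cache; a cache
 * size of 0 presumably also keeps rte_mempool_count(), which the cleanup paths
 * rely on, accurate.
 */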
82 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
85 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
88 #define JUMBO_FRAME_MAX_SIZE 0x2600
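/* 0x2600 is 9728 bytes, enough for a 9000-byte jumbo payload plus headers. */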
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
93 #define DEVICE_SAFE_REMOVE 2
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
* These two macros need refining for the legacy and DPDK-based front ends:
* take the max vring avail descriptors/entries from the guest, subtract
* MAX_PKT_BURST, and then adjust to a power of 2.
* For the legacy front end, 128 descriptors:
* half for virtio headers, the other half for mbufs.
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 + sizeof(struct rte_mbuf)))
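/*
 * In the zero copy path this 4-byte headroom slot stores the vring descriptor
 * index an mbuf was attached to, so virtio_dev_rx_zcp() and txmbuf_clean_zcp()
 * can recover the descriptor when the mbuf comes back.
 */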
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
122 #define INVALID_PORT_ID 0xFF
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
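/*
 * ether_addr_cmp() reads both MACs as 64-bit words and applies this mask, so
 * only the low 48 bits (the 6 address bytes on a little-endian host) are
 * compared.
 */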
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
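/* With a 16-byte vring_desc and the usual 64-byte cache line this is 4. */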
142 /* mask of enabled ports */
143 static uint32_t enabled_port_mask = 0;
145 /* Promiscuous mode */
146 static uint32_t promiscuous;
/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;
/* number of devices/queues to support */
152 static uint32_t num_queues = 0;
153 static uint32_t num_devices;
* Enable zero copy: packet buffers are DMAed directly to/from the HW descriptors;
* disabled by default.
159 static uint32_t zero_copy;
160 static int mergeable;
/* Do VLAN strip on host, enabled by default */
static uint32_t vlan_strip = 1;
/* number of descriptors to apply */
166 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
167 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
/* max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
170 #define MAX_RING_DESC 4096
struct vpool {
struct rte_mempool *pool;
struct rte_ring *ring;
uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
178 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
185 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
/* The type of host physical address translated from guest physical address. */
typedef enum {
PHYS_ADDR_CONTINUOUS = 0,
PHYS_ADDR_CROSS_SUBREG = 1,
PHYS_ADDR_INVALID = 2,
} hpa_type;
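/*
 * gpa_to_hpa() reports PHYS_ADDR_CONTINUOUS when the whole guest buffer fits
 * within one host-physically contiguous sub-region, PHYS_ADDR_CROSS_SUBREG
 * when the buffer starts in a sub-region but spills past its end, and
 * PHYS_ADDR_INVALID when no mapping is found.
 */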
196 static uint32_t enable_stats = 0;
197 /* Enable retries on RX. */
198 static uint32_t enable_retry = 1;
/* Specify the timeout (in microseconds) between retries on RX. */
200 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
201 /* Specify the number of retries on RX. */
202 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
204 /* Character device basename. Can be set by user. */
205 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
/* Empty VMDQ configuration structure. Filled in programmatically. */
208 static struct rte_eth_conf vmdq_conf_default = {
210 .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
212 .header_split = 0, /**< Header Split disabled */
213 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
214 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
* It is necessary for 1G NICs such as the I350;
* this fixes a bug where IPv4 forwarding in the guest could not
* forward packets from one virtio device to another.
220 .hw_vlan_strip = 1, /**< VLAN strip enabled. */
221 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */
222 .hw_strip_crc = 0, /**< CRC stripped by hardware */
226 .mq_mode = ETH_MQ_TX_NONE,
230 * should be overridden separately in code with
234 .nb_queue_pools = ETH_8_POOLS,
235 .enable_default_pool = 0,
238 .pool_map = {{0, 0},},
243 static unsigned lcore_ids[RTE_MAX_LCORE];
244 static uint8_t ports[RTE_MAX_ETHPORTS];
245 static unsigned num_ports = 0; /**< The number of ports specified in command line */
246 static uint16_t num_pf_queues, num_vmdq_queues;
247 static uint16_t vmdq_pool_base, vmdq_queue_base;
248 static uint16_t queues_per_pool;
250 static const uint16_t external_pkt_default_vlan_tag = 2000;
251 const uint16_t vlan_tags[] = {
252 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
253 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
254 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
255 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
256 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
257 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
258 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
259 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
262 /* ethernet addresses of ports */
263 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
265 /* heads for the main used and free linked lists for the data path. */
266 static struct virtio_net_data_ll *ll_root_used = NULL;
267 static struct virtio_net_data_ll *ll_root_free = NULL;
269 /* Array of data core structures containing information on individual core linked lists. */
270 static struct lcore_info lcore_info[RTE_MAX_LCORE];
272 /* Used for queueing bursts of TX packets. */
276 struct rte_mbuf *m_table[MAX_PKT_BURST];
279 /* TX queue for each data core. */
280 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
/* TX queue for each virtio device for zero copy. */
283 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
285 /* Vlan header struct used to insert vlan tags on TX. */
287 unsigned char h_dest[ETH_ALEN];
288 unsigned char h_source[ETH_ALEN];
291 __be16 h_vlan_encapsulated_proto;
296 uint8_t version_ihl; /**< version and header length */
297 uint8_t type_of_service; /**< type of service */
298 uint16_t total_length; /**< length of packet */
299 uint16_t packet_id; /**< packet ID */
300 uint16_t fragment_offset; /**< fragmentation offset */
301 uint8_t time_to_live; /**< time to live */
302 uint8_t next_proto_id; /**< protocol ID */
303 uint16_t hdr_checksum; /**< header checksum */
304 uint32_t src_addr; /**< source address */
305 uint32_t dst_addr; /**< destination address */
306 } __attribute__((__packed__));
/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18
312 /* Per-device statistics struct */
313 struct device_statistics {
315 rte_atomic64_t rx_total_atomic;
318 rte_atomic64_t rx_atomic;
320 } __rte_cache_aligned;
321 struct device_statistics dev_statistics[MAX_DEVICES];
324 * Builds up the correct configuration for VMDQ VLAN pool map
325 * according to the pool & queue limits.
328 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
330 struct rte_eth_vmdq_rx_conf conf;
331 struct rte_eth_vmdq_rx_conf *def_conf =
332 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
335 memset(&conf, 0, sizeof(conf));
336 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
337 conf.nb_pool_maps = num_devices;
338 conf.enable_loop_back = def_conf->enable_loop_back;
339 conf.rx_mode = def_conf->rx_mode;
341 for (i = 0; i < conf.nb_pool_maps; i++) {
342 conf.pool_map[i].vlan_id = vlan_tags[ i ];
343 conf.pool_map[i].pools = (1UL << i);
346 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
347 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf,
348 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
* Validate the device number against the max pool number obtained from
* dev_info. If the device number is invalid, print an error message and
* return -1. Each device must have its own pool.
358 validate_num_devices(uint32_t max_nb_devices)
360 if (num_devices > max_nb_devices) {
361 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
368 * Initialises a given port using global settings and with the rx buffers
369 * coming from the mbuf_pool passed as parameter
372 port_init(uint8_t port)
374 struct rte_eth_dev_info dev_info;
375 struct rte_eth_conf port_conf;
376 struct rte_eth_rxconf *rxconf;
377 struct rte_eth_txconf *txconf;
378 int16_t rx_rings, tx_rings;
379 uint16_t rx_ring_size, tx_ring_size;
383 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
384 rte_eth_dev_info_get (port, &dev_info);
386 if (dev_info.max_rx_queues > MAX_QUEUES) {
387 rte_exit(EXIT_FAILURE,
388 "please define MAX_QUEUES no less than %u in %s\n",
389 dev_info.max_rx_queues, __FILE__);
392 rxconf = &dev_info.default_rxconf;
393 txconf = &dev_info.default_txconf;
394 rxconf->rx_drop_en = 1;
396 /* Enable vlan offload */
397 txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
400 * Zero copy defers queue RX/TX start to the time when guest
401 * finishes its startup and packet buffers from that guest are
405 rxconf->rx_deferred_start = 1;
406 rxconf->rx_drop_en = 0;
407 txconf->tx_deferred_start = 1;
410 /*configure the number of supported virtio devices based on VMDQ limits */
411 num_devices = dev_info.max_vmdq_pools;
414 rx_ring_size = num_rx_descriptor;
415 tx_ring_size = num_tx_descriptor;
416 tx_rings = dev_info.max_tx_queues;
418 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
419 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
420 tx_rings = (uint16_t)rte_lcore_count();
423 retval = validate_num_devices(MAX_DEVICES);
427 /* Get port configuration. */
428 retval = get_eth_conf(&port_conf, num_devices);
431 /* NIC queues are divided into pf queues and vmdq queues. */
432 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
433 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
434 num_vmdq_queues = num_devices * queues_per_pool;
435 num_queues = num_pf_queues + num_vmdq_queues;
436 vmdq_queue_base = dev_info.vmdq_queue_base;
437 vmdq_pool_base = dev_info.vmdq_pool_base;
438 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
439 num_pf_queues, num_devices, queues_per_pool);
441 if (port >= rte_eth_dev_count()) return -1;
443 rx_rings = (uint16_t)dev_info.max_rx_queues;
444 /* Configure ethernet device. */
445 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
449 /* Setup the queues. */
450 for (q = 0; q < rx_rings; q ++) {
451 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452 rte_eth_dev_socket_id(port),
454 vpool_array[q].pool);
458 for (q = 0; q < tx_rings; q ++) {
459 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
460 rte_eth_dev_socket_id(port),
466 /* Start the device. */
467 retval = rte_eth_dev_start(port);
469 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
474 rte_eth_promiscuous_enable(port);
476 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
477 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
478 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
479 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481 vmdq_ports_eth_addr[port].addr_bytes[0],
482 vmdq_ports_eth_addr[port].addr_bytes[1],
483 vmdq_ports_eth_addr[port].addr_bytes[2],
484 vmdq_ports_eth_addr[port].addr_bytes[3],
485 vmdq_ports_eth_addr[port].addr_bytes[4],
486 vmdq_ports_eth_addr[port].addr_bytes[5]);
492 * Set character device basename.
495 us_vhost_parse_basename(const char *q_arg)
/* parse the basename string */
if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
return -1;
snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
508 * Parse the portmask provided at run time.
511 parse_portmask(const char *portmask)
518 /* parse hexadecimal string */
519 pm = strtoul(portmask, &end, 16);
520 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
531 * Parse num options at run time.
534 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
541 /* parse unsigned int string */
542 num = strtoul(q_arg, &end, 10);
543 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
546 if (num > max_valid_value)
557 us_vhost_usage(const char *prgname)
559 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
" --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
562 " --dev-basename <name>\n"
564 " -p PORTMASK: Set mask for ports to be used by application\n"
565 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
" --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
" --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if rx retries are enabled\n"
" --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if rx retries are enabled\n"
569 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
570 " --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
571 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
572 " --dev-basename: The basename to be used for the character device.\n"
573 " --zero-copy [0|1]: disable(default)/enable rx/tx "
575 " --rx-desc-num [0-N]: the number of descriptors on rx, "
576 "used only when zero copy is enabled.\n"
577 " --tx-desc-num [0-N]: the number of descriptors on tx, "
578 "used only when zero copy is enabled.\n",
583 * Parse the arguments given in the command line of the application.
586 us_vhost_parse_args(int argc, char **argv)
591 const char *prgname = argv[0];
592 static struct option long_option[] = {
593 {"vm2vm", required_argument, NULL, 0},
594 {"rx-retry", required_argument, NULL, 0},
595 {"rx-retry-delay", required_argument, NULL, 0},
596 {"rx-retry-num", required_argument, NULL, 0},
597 {"mergeable", required_argument, NULL, 0},
598 {"vlan-strip", required_argument, NULL, 0},
599 {"stats", required_argument, NULL, 0},
600 {"dev-basename", required_argument, NULL, 0},
601 {"zero-copy", required_argument, NULL, 0},
602 {"rx-desc-num", required_argument, NULL, 0},
603 {"tx-desc-num", required_argument, NULL, 0},
607 /* Parse command line */
608 while ((opt = getopt_long(argc, argv, "p:P",
609 long_option, &option_index)) != EOF) {
613 enabled_port_mask = parse_portmask(optarg);
614 if (enabled_port_mask == 0) {
615 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
616 us_vhost_usage(prgname);
623 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
624 ETH_VMDQ_ACCEPT_BROADCAST |
625 ETH_VMDQ_ACCEPT_MULTICAST;
626 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
631 /* Enable/disable vm2vm comms. */
632 if (!strncmp(long_option[option_index].name, "vm2vm",
634 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
636 RTE_LOG(INFO, VHOST_CONFIG,
637 "Invalid argument for "
639 us_vhost_usage(prgname);
642 vm2vm_mode = (vm2vm_type)ret;
646 /* Enable/disable retries on RX. */
647 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
648 ret = parse_num_opt(optarg, 1);
650 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
651 us_vhost_usage(prgname);
/* Specify the retry delay time (in microseconds) on RX. */
659 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
660 ret = parse_num_opt(optarg, INT32_MAX);
662 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
663 us_vhost_usage(prgname);
666 burst_rx_delay_time = ret;
670 /* Specify the retries number on RX. */
671 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
672 ret = parse_num_opt(optarg, INT32_MAX);
674 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
675 us_vhost_usage(prgname);
678 burst_rx_retry_num = ret;
682 /* Enable/disable RX mergeable buffers. */
683 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
684 ret = parse_num_opt(optarg, 1);
686 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
687 us_vhost_usage(prgname);
692 vmdq_conf_default.rxmode.jumbo_frame = 1;
693 vmdq_conf_default.rxmode.max_rx_pkt_len
694 = JUMBO_FRAME_MAX_SIZE;
699 /* Enable/disable RX VLAN strip on host. */
700 if (!strncmp(long_option[option_index].name,
701 "vlan-strip", MAX_LONG_OPT_SZ)) {
702 ret = parse_num_opt(optarg, 1);
704 RTE_LOG(INFO, VHOST_CONFIG,
705 "Invalid argument for VLAN strip [0|1]\n");
706 us_vhost_usage(prgname);
710 vmdq_conf_default.rxmode.hw_vlan_strip =
715 /* Enable/disable stats. */
716 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
717 ret = parse_num_opt(optarg, INT32_MAX);
719 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
720 us_vhost_usage(prgname);
727 /* Set character device basename. */
728 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
729 if (us_vhost_parse_basename(optarg) == -1) {
730 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
731 us_vhost_usage(prgname);
736 /* Enable/disable rx/tx zero copy. */
737 if (!strncmp(long_option[option_index].name,
738 "zero-copy", MAX_LONG_OPT_SZ)) {
739 ret = parse_num_opt(optarg, 1);
741 RTE_LOG(INFO, VHOST_CONFIG,
743 " for zero-copy [0|1]\n");
744 us_vhost_usage(prgname);
750 #ifdef RTE_MBUF_REFCNT
751 RTE_LOG(ERR, VHOST_CONFIG, "Before running "
752 "zero copy vhost APP, please "
753 "disable RTE_MBUF_REFCNT\n"
754 "in config file and then rebuild DPDK "
756 "Otherwise please disable zero copy "
757 "flag in command line!\n");
763 /* Specify the descriptor number on RX. */
764 if (!strncmp(long_option[option_index].name,
765 "rx-desc-num", MAX_LONG_OPT_SZ)) {
766 ret = parse_num_opt(optarg, MAX_RING_DESC);
767 if ((ret == -1) || (!POWEROF2(ret))) {
768 RTE_LOG(INFO, VHOST_CONFIG,
"Invalid argument for rx-desc-num [0-N], "
770 "power of 2 required.\n");
771 us_vhost_usage(prgname);
774 num_rx_descriptor = ret;
778 /* Specify the descriptor number on TX. */
779 if (!strncmp(long_option[option_index].name,
780 "tx-desc-num", MAX_LONG_OPT_SZ)) {
781 ret = parse_num_opt(optarg, MAX_RING_DESC);
782 if ((ret == -1) || (!POWEROF2(ret))) {
783 RTE_LOG(INFO, VHOST_CONFIG,
"Invalid argument for tx-desc-num [0-N], "
785 "power of 2 required.\n");
786 us_vhost_usage(prgname);
789 num_tx_descriptor = ret;
795 /* Invalid option - print options. */
797 us_vhost_usage(prgname);
802 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
803 if (enabled_port_mask & (1 << i))
804 ports[num_ports++] = (uint8_t)i;
807 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
813 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
814 RTE_LOG(INFO, VHOST_PORT,
"Vhost zero copy doesn't support software vm2vm, "
816 "please specify 'vm2vm 2' to use hardware vm2vm.\n");
820 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
821 RTE_LOG(INFO, VHOST_PORT,
"Vhost zero copy doesn't support jumbo frames, "
823 "please specify '--mergeable 0' to disable the "
824 "mergeable feature.\n");
* Update the global variable num_ports and the ports array according to the
* number of system ports, and return the number of valid ports.
835 static unsigned check_ports_num(unsigned nb_ports)
837 unsigned valid_num_ports = num_ports;
840 if (num_ports > nb_ports) {
841 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
842 num_ports, nb_ports);
843 num_ports = nb_ports;
846 for (portid = 0; portid < num_ports; portid ++) {
847 if (ports[portid] >= nb_ports) {
848 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
849 ports[portid], (nb_ports - 1));
850 ports[portid] = INVALID_PORT_ID;
854 return valid_num_ports;
* Macro to print out packet contents. Wrapped in a debug define so that the
* data path is not affected when debug is disabled.
862 #define PRINT_PACKET(device, addr, size, header) do { \
863 char *pkt_addr = (char*)(addr); \
864 unsigned int index; \
865 char packet[MAX_PRINT_BUFF]; \
868 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
870 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
871 for (index = 0; index < (size); index++) { \
872 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
873 "%02hhx ", pkt_addr[index]); \
875 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
877 LOG_DEBUG(VHOST_DATA, "%s", packet); \
880 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
884 * Function to convert guest physical addresses to vhost physical addresses.
885 * This is used to convert virtio buffer addresses.
887 static inline uint64_t __attribute__((always_inline))
888 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
889 uint32_t buf_len, hpa_type *addr_type)
891 struct virtio_memory_regions_hpa *region;
893 uint64_t vhost_pa = 0;
895 *addr_type = PHYS_ADDR_INVALID;
897 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
898 region = &vdev->regions_hpa[regionidx];
899 if ((guest_pa >= region->guest_phys_address) &&
900 (guest_pa <= region->guest_phys_address_end)) {
901 vhost_pa = region->host_phys_addr_offset + guest_pa;
902 if (likely((guest_pa + buf_len - 1)
903 <= region->guest_phys_address_end))
904 *addr_type = PHYS_ADDR_CONTINUOUS;
906 *addr_type = PHYS_ADDR_CROSS_SUBREG;
911 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
912 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
913 (void *)(uintptr_t)vhost_pa);
919 * Compares a packet destination MAC address to a device MAC address.
921 static inline int __attribute__((always_inline))
922 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
924 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
* This function learns the MAC address of the device and registers it, along with a
* VLAN tag, with a VMDQ pool.
932 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
934 struct ether_hdr *pkt_hdr;
935 struct virtio_net_data_ll *dev_ll;
936 struct virtio_net *dev = vdev->dev;
939 /* Learn MAC address of guest device from packet */
940 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
942 dev_ll = ll_root_used;
944 while (dev_ll != NULL) {
945 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
946 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
949 dev_ll = dev_ll->next;
952 for (i = 0; i < ETHER_ADDR_LEN; i++)
953 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
955 /* vlan_tag currently uses the device_id. */
956 vdev->vlan_tag = vlan_tags[dev->device_fh];
958 /* Print out VMDQ registration info. */
959 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
961 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
962 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
963 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
966 /* Register the MAC address. */
967 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
968 (uint32_t)dev->device_fh + vmdq_pool_base);
970 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
973 /* Enable stripping of the vlan tag as we handle routing. */
975 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
976 (uint16_t)vdev->vmdq_rx_q, 1);
978 /* Set device as ready for RX. */
979 vdev->ready = DEVICE_RX;
985 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
986 * queue before disabling RX on the device.
989 unlink_vmdq(struct vhost_dev *vdev)
993 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
995 if (vdev->ready == DEVICE_RX) {
996 /*clear MAC and VLAN settings*/
997 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
998 for (i = 0; i < 6; i++)
999 vdev->mac_address.addr_bytes[i] = 0;
1003 /*Clear out the receive buffers*/
1004 rx_count = rte_eth_rx_burst(ports[0],
1005 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1008 for (i = 0; i < rx_count; i++)
1009 rte_pktmbuf_free(pkts_burst[i]);
1011 rx_count = rte_eth_rx_burst(ports[0],
1012 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1015 vdev->ready = DEVICE_MAC_LEARNING;
* Check if the packet destination MAC address is for a local device. If so, put
* the packet on that device's RX queue. If not, return.
1023 static inline int __attribute__((always_inline))
1024 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1026 struct virtio_net_data_ll *dev_ll;
1027 struct ether_hdr *pkt_hdr;
1029 struct virtio_net *dev = vdev->dev;
struct virtio_net *tdev; /* destination virtio device */
1032 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1034 /*get the used devices list*/
1035 dev_ll = ll_root_used;
1037 while (dev_ll != NULL) {
1038 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1039 &dev_ll->vdev->mac_address)) {
1041 /* Drop the packet if the TX packet is destined for the TX device. */
1042 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1043 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1047 tdev = dev_ll->vdev->dev;
1050 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1052 if (unlikely(dev_ll->vdev->remove)) {
1053 /*drop the packet if the device is marked for removal*/
1054 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1056 /*send the packet to the local virtio device*/
1057 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1060 &dev_statistics[tdev->device_fh].rx_total_atomic,
1063 &dev_statistics[tdev->device_fh].rx_atomic,
1065 dev_statistics[tdev->device_fh].tx_total++;
1066 dev_statistics[tdev->device_fh].tx += ret;
1072 dev_ll = dev_ll->next;
* Check if the destination MAC of a packet belongs to a local VM, and if so
* get its VLAN tag and the length offset.
1082 static inline int __attribute__((always_inline))
1083 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1084 uint32_t *offset, uint16_t *vlan_tag)
1086 struct virtio_net_data_ll *dev_ll = ll_root_used;
1087 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1089 while (dev_ll != NULL) {
1090 if ((dev_ll->vdev->ready == DEVICE_RX)
1091 && ether_addr_cmp(&(pkt_hdr->d_addr),
1092 &dev_ll->vdev->mac_address)) {
1094 * Drop the packet if the TX packet is
1095 * destined for the TX device.
1097 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1098 LOG_DEBUG(VHOST_DATA,
1099 "(%"PRIu64") TX: Source and destination"
1100 " MAC addresses are the same. Dropping "
1102 dev_ll->vdev->dev->device_fh);
* HW VLAN strip will reduce the packet length by the length of the
* VLAN tag, so the packet length needs to be restored by adding
* it back.
1111 *offset = VLAN_HLEN;
1114 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1116 LOG_DEBUG(VHOST_DATA,
1117 "(%"PRIu64") TX: pkt to local VM device id:"
1118 "(%"PRIu64") vlan tag: %d.\n",
1119 dev->device_fh, dev_ll->vdev->dev->device_fh,
1124 dev_ll = dev_ll->next;
1130 * This function routes the TX packet to the correct interface. This may be a local device
1131 * or the physical port.
1133 static inline void __attribute__((always_inline))
1134 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1136 struct mbuf_table *tx_q;
1137 struct rte_mbuf **m_table;
1138 unsigned len, ret, offset = 0;
1139 const uint16_t lcore_id = rte_lcore_id();
1140 struct virtio_net *dev = vdev->dev;
1141 struct ether_hdr *nh;
1143 /*check if destination is local VM*/
1144 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1145 rte_pktmbuf_free(m);
1149 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1150 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1151 rte_pktmbuf_free(m);
1156 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1158 /*Add packet to the port tx queue*/
1159 tx_q = &lcore_tx_queue[lcore_id];
1162 nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1163 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1164 /* Guest has inserted the vlan tag. */
1165 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1166 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1167 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1168 (vh->vlan_tci != vlan_tag_be))
1169 vh->vlan_tci = vlan_tag_be;
1171 m->ol_flags = PKT_TX_VLAN_PKT;
1174 * Find the right seg to adjust the data len when offset is
1175 * bigger than tail room size.
1177 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1178 if (likely(offset <= rte_pktmbuf_tailroom(m)))
1179 m->data_len += offset;
1181 struct rte_mbuf *seg = m;
1183 while ((seg->next != NULL) &&
1184 (offset > rte_pktmbuf_tailroom(seg)))
1187 seg->data_len += offset;
1189 m->pkt_len += offset;
1192 m->vlan_tci = vlan_tag;
1195 tx_q->m_table[len] = m;
1198 dev_statistics[dev->device_fh].tx_total++;
1199 dev_statistics[dev->device_fh].tx++;
1202 if (unlikely(len == MAX_PKT_BURST)) {
1203 m_table = (struct rte_mbuf **)tx_q->m_table;
1204 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1205 /* Free any buffers not handled by TX and update the port stats. */
1206 if (unlikely(ret < len)) {
1208 rte_pktmbuf_free(m_table[ret]);
1209 } while (++ret < len);
1219 * This function is called by each data core. It handles all RX/TX registered with the
1220 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1221 * with all devices in the main linked list.
1224 switch_worker(__attribute__((unused)) void *arg)
1226 struct rte_mempool *mbuf_pool = arg;
1227 struct virtio_net *dev = NULL;
1228 struct vhost_dev *vdev = NULL;
1229 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1230 struct virtio_net_data_ll *dev_ll;
1231 struct mbuf_table *tx_q;
1232 volatile struct lcore_ll_info *lcore_ll;
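/* Convert the TX drain interval from microseconds to TSC cycles:
 * cycles per microsecond (rounded up) multiplied by BURST_TX_DRAIN_US. */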
1233 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1234 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1236 const uint16_t lcore_id = rte_lcore_id();
1237 const uint16_t num_cores = (uint16_t)rte_lcore_count();
1238 uint16_t rx_count = 0;
RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1243 lcore_ll = lcore_info[lcore_id].lcore_ll;
1246 tx_q = &lcore_tx_queue[lcore_id];
1247 for (i = 0; i < num_cores; i ++) {
1248 if (lcore_ids[i] == lcore_id) {
1255 cur_tsc = rte_rdtsc();
1257 * TX burst queue drain
1259 diff_tsc = cur_tsc - prev_tsc;
1260 if (unlikely(diff_tsc > drain_tsc)) {
1263 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1265 /*Tx any packets in the queue*/
1266 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1267 (struct rte_mbuf **)tx_q->m_table,
1268 (uint16_t)tx_q->len);
1269 if (unlikely(ret < tx_q->len)) {
1271 rte_pktmbuf_free(tx_q->m_table[ret]);
1272 } while (++ret < tx_q->len);
1282 rte_prefetch0(lcore_ll->ll_root_used);
1284 * Inform the configuration core that we have exited the linked list and that no devices are
1285 * in use if requested.
1287 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1288 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1293 dev_ll = lcore_ll->ll_root_used;
1295 while (dev_ll != NULL) {
1296 /*get virtio device ID*/
1297 vdev = dev_ll->vdev;
1300 if (unlikely(vdev->remove)) {
1301 dev_ll = dev_ll->next;
1303 vdev->ready = DEVICE_SAFE_REMOVE;
1306 if (likely(vdev->ready == DEVICE_RX)) {
1308 rx_count = rte_eth_rx_burst(ports[0],
1309 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
* If retry is enabled and the queue is full, then we wait and retry to avoid packet loss.
* Here MAX_PKT_BURST must be less than the virtio queue size.
1316 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1317 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1318 rte_delay_us(burst_rx_delay_time);
1319 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1323 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1326 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1329 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1331 while (likely(rx_count)) {
1333 rte_pktmbuf_free(pkts_burst[rx_count]);
1339 if (likely(!vdev->remove)) {
1340 /* Handle guest TX*/
1341 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1342 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1343 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1344 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1346 rte_pktmbuf_free(pkts_burst[--tx_count]);
1350 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1353 /*move to the next device in the list*/
1354 dev_ll = dev_ll->next;
1362 * This function gets available ring number for zero copy rx.
* Only one thread will call this function for a particular virtio device,
* so it is designed as a non-thread-safe function.
1366 static inline uint32_t __attribute__((always_inline))
1367 get_available_ring_num_zcp(struct virtio_net *dev)
1369 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1372 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
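/* 16-bit unsigned arithmetic keeps the subtraction below correct even after
 * avail->idx wraps past last_used_idx_res. */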
1373 return (uint32_t)(avail_idx - vq->last_used_idx_res);
* This function gets the available ring index for zero copy rx;
* it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
* Only one thread will call this function for a particular virtio device,
* so it is designed as a non-thread-safe function.
1382 static inline uint32_t __attribute__((always_inline))
1383 get_available_ring_index_zcp(struct virtio_net *dev,
1384 uint16_t *res_base_idx, uint32_t count)
1386 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1389 uint16_t free_entries;
1391 *res_base_idx = vq->last_used_idx_res;
1392 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1393 free_entries = (avail_idx - *res_base_idx);
1395 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1397 "res base idx:%d, free entries:%d\n",
1398 dev->device_fh, avail_idx, *res_base_idx,
1402 * If retry is enabled and the queue is full then we wait
1403 * and retry to avoid packet loss.
1405 if (enable_retry && unlikely(count > free_entries)) {
1406 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1407 rte_delay_us(burst_rx_delay_time);
1408 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1409 free_entries = (avail_idx - *res_base_idx);
1410 if (count <= free_entries)
1415 /*check that we have enough buffers*/
1416 if (unlikely(count > free_entries))
1417 count = free_entries;
1419 if (unlikely(count == 0)) {
1420 LOG_DEBUG(VHOST_DATA,
1421 "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1422 "avail idx: %d, res base idx:%d, free entries:%d\n",
1423 dev->device_fh, avail_idx,
1424 *res_base_idx, free_entries);
1428 vq->last_used_idx_res = *res_base_idx + count;
* This function puts a descriptor back on the used ring.
1436 static inline void __attribute__((always_inline))
1437 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1439 uint16_t res_cur_idx = vq->last_used_idx;
1440 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1441 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1442 rte_compiler_barrier();
1443 *(volatile uint16_t *)&vq->used->idx += 1;
1444 vq->last_used_idx += 1;
1446 /* Kick the guest if necessary. */
1447 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1448 eventfd_write((int)vq->kickfd, 1);
* This function gets an available descriptor from the virtio vring and an
* un-attached mbuf from vpool->ring, and then attaches them together. It needs
* to adjust the offset for buff_addr and phys_addr according to the PMD
* implementation, otherwise the frame data may be put in the wrong location in the mbuf.
1457 static inline void __attribute__((always_inline))
1458 attach_rxmbuf_zcp(struct virtio_net *dev)
1460 uint16_t res_base_idx, desc_idx;
1461 uint64_t buff_addr, phys_addr;
1462 struct vhost_virtqueue *vq;
1463 struct vring_desc *desc;
1464 struct rte_mbuf *mbuf = NULL;
1465 struct vpool *vpool;
1467 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1469 vpool = &vpool_array[vdev->vmdq_rx_q];
1470 vq = dev->virtqueue[VIRTIO_RXQ];
1473 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1476 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1478 desc = &vq->desc[desc_idx];
1479 if (desc->flags & VRING_DESC_F_NEXT) {
1480 desc = &vq->desc[desc->next];
1481 buff_addr = gpa_to_vva(dev, desc->addr);
1482 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1485 buff_addr = gpa_to_vva(dev,
1486 desc->addr + vq->vhost_hlen);
1487 phys_addr = gpa_to_hpa(vdev,
1488 desc->addr + vq->vhost_hlen,
1489 desc->len, &addr_type);
1492 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1493 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1494 " address found when attaching RX frame buffer"
1495 " address!\n", dev->device_fh);
1496 put_desc_to_used_list_zcp(vq, desc_idx);
1501 * Check if the frame buffer address from guest crosses
1502 * sub-region or not.
1504 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1505 RTE_LOG(ERR, VHOST_DATA,
"(%"PRIu64") Frame buffer address crossing "
"sub-region found when attaching RX frame "
1508 "buffer address!\n",
1510 put_desc_to_used_list_zcp(vq, desc_idx);
1513 } while (unlikely(phys_addr == 0));
1515 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1516 if (unlikely(mbuf == NULL)) {
1517 LOG_DEBUG(VHOST_DATA,
1518 "(%"PRIu64") in attach_rxmbuf_zcp: "
1519 "ring_sc_dequeue fail.\n",
1521 put_desc_to_used_list_zcp(vq, desc_idx);
1525 if (unlikely(vpool->buf_size > desc->len)) {
1526 LOG_DEBUG(VHOST_DATA,
1527 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1528 "length(%d) of descriptor idx: %d less than room "
1529 "size required: %d\n",
1530 dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1531 put_desc_to_used_list_zcp(vq, desc_idx);
1532 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1536 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1537 mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1538 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1539 mbuf->data_len = desc->len;
1540 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1542 LOG_DEBUG(VHOST_DATA,
1543 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1544 "descriptor idx:%d\n",
1545 dev->device_fh, res_base_idx, desc_idx);
1547 __rte_mbuf_raw_free(mbuf);
* Detach an attached packet mbuf -
1554 * - restore original mbuf address and length values.
1555 * - reset pktmbuf data and data_len to their default values.
1556 * All other fields of the given packet mbuf will be left intact.
1559 * The attached packet mbuf.
1561 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1563 const struct rte_mempool *mp = m->pool;
1564 void *buf = RTE_MBUF_TO_BADDR(m);
1566 uint32_t buf_len = mp->elt_size - sizeof(*m);
1567 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1570 m->buf_len = (uint16_t)buf_len;
1572 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1573 RTE_PKTMBUF_HEADROOM : m->buf_len;
1574 m->data_off = buf_ofs;
* This function is called after packets have been transmitted. It fetches mbufs
* from vpool->pool, detaches them and puts them back into vpool->ring. It also
* updates the used index and kicks the guest if necessary.
1584 static inline uint32_t __attribute__((always_inline))
1585 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1587 struct rte_mbuf *mbuf;
1588 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1589 uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1591 uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1593 LOG_DEBUG(VHOST_DATA,
1594 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1596 dev->device_fh, mbuf_count);
1597 LOG_DEBUG(VHOST_DATA,
1598 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1600 dev->device_fh, rte_ring_count(vpool->ring));
1602 for (index = 0; index < mbuf_count; index++) {
1603 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1604 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1605 pktmbuf_detach_zcp(mbuf);
1606 rte_ring_sp_enqueue(vpool->ring, mbuf);
1608 /* Update used index buffer information. */
1609 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1610 vq->used->ring[used_idx].len = 0;
1612 used_idx = (used_idx + 1) & (vq->size - 1);
1615 LOG_DEBUG(VHOST_DATA,
1616 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1618 dev->device_fh, rte_mempool_count(vpool->pool));
1619 LOG_DEBUG(VHOST_DATA,
1620 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1622 dev->device_fh, rte_ring_count(vpool->ring));
1623 LOG_DEBUG(VHOST_DATA,
1624 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1625 "vq->last_used_idx:%d\n",
1626 dev->device_fh, vq->last_used_idx);
1628 vq->last_used_idx += mbuf_count;
1630 LOG_DEBUG(VHOST_DATA,
1631 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1632 "vq->last_used_idx:%d\n",
1633 dev->device_fh, vq->last_used_idx);
1635 rte_compiler_barrier();
1637 *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1639 /* Kick guest if required. */
1640 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1641 eventfd_write((int)vq->kickfd, 1);
* This function is called when a virtio device is destroyed.
* It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1650 static void mbuf_destroy_zcp(struct vpool *vpool)
1652 struct rte_mbuf *mbuf = NULL;
1653 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1655 LOG_DEBUG(VHOST_CONFIG,
1656 "in mbuf_destroy_zcp: mbuf count in mempool before "
1657 "mbuf_destroy_zcp is: %d\n",
1659 LOG_DEBUG(VHOST_CONFIG,
1660 "in mbuf_destroy_zcp: mbuf count in ring before "
1661 "mbuf_destroy_zcp is : %d\n",
1662 rte_ring_count(vpool->ring));
1664 for (index = 0; index < mbuf_count; index++) {
1665 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1666 if (likely(mbuf != NULL)) {
1667 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1668 pktmbuf_detach_zcp(mbuf);
1669 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1673 LOG_DEBUG(VHOST_CONFIG,
1674 "in mbuf_destroy_zcp: mbuf count in mempool after "
1675 "mbuf_destroy_zcp is: %d\n",
1676 rte_mempool_count(vpool->pool));
1677 LOG_DEBUG(VHOST_CONFIG,
1678 "in mbuf_destroy_zcp: mbuf count in ring after "
1679 "mbuf_destroy_zcp is : %d\n",
1680 rte_ring_count(vpool->ring));
* This function updates the used ring and counters for zero copy RX.
1686 static inline uint32_t __attribute__((always_inline))
1687 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1690 struct vhost_virtqueue *vq;
1691 struct vring_desc *desc;
1692 struct rte_mbuf *buff;
1693 /* The virtio_hdr is initialised to 0. */
1694 struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1695 = {{0, 0, 0, 0, 0, 0}, 0};
1696 uint64_t buff_hdr_addr = 0;
1697 uint32_t head[MAX_PKT_BURST], packet_len = 0;
1698 uint32_t head_idx, packet_success = 0;
1699 uint16_t res_cur_idx;
LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1706 vq = dev->virtqueue[VIRTIO_RXQ];
1707 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1709 res_cur_idx = vq->last_used_idx;
1710 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1711 dev->device_fh, res_cur_idx, res_cur_idx + count);
1713 /* Retrieve all of the head indexes first to avoid caching issues. */
1714 for (head_idx = 0; head_idx < count; head_idx++)
1715 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1717 /*Prefetch descriptor index. */
1718 rte_prefetch0(&vq->desc[head[packet_success]]);
1720 while (packet_success != count) {
1721 /* Get descriptor from available ring */
1722 desc = &vq->desc[head[packet_success]];
1724 buff = pkts[packet_success];
1725 LOG_DEBUG(VHOST_DATA,
1726 "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1727 "pkt[%d] descriptor idx: %d\n",
1728 dev->device_fh, packet_success,
1729 MBUF_HEADROOM_UINT32(buff));
1732 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1733 + RTE_PKTMBUF_HEADROOM),
1734 rte_pktmbuf_data_len(buff), 0);
1736 /* Buffer address translation for virtio header. */
1737 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1738 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1741 * If the descriptors are chained the header and data are
1742 * placed in separate buffers.
1744 if (desc->flags & VRING_DESC_F_NEXT) {
1745 desc->len = vq->vhost_hlen;
1746 desc = &vq->desc[desc->next];
1747 desc->len = rte_pktmbuf_data_len(buff);
1749 desc->len = packet_len;
1752 /* Update used ring with desc information */
1753 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1754 = head[packet_success];
1755 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1760 /* A header is required per buffer. */
1761 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1762 (const void *)&virtio_hdr, vq->vhost_hlen);
1764 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1766 if (likely(packet_success < count)) {
1767 /* Prefetch descriptor index. */
1768 rte_prefetch0(&vq->desc[head[packet_success]]);
1772 rte_compiler_barrier();
1774 LOG_DEBUG(VHOST_DATA,
1775 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1776 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1777 dev->device_fh, vq->last_used_idx, vq->used->idx);
1779 *(volatile uint16_t *)&vq->used->idx += count;
1780 vq->last_used_idx += count;
1782 LOG_DEBUG(VHOST_DATA,
1783 "(%"PRIu64") in dev_rx_zcp: after update used idx: "
1784 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1785 dev->device_fh, vq->last_used_idx, vq->used->idx);
1787 /* Kick the guest if necessary. */
1788 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1789 eventfd_write((int)vq->kickfd, 1);
1795 * This function routes the TX packet to the correct interface.
1796 * This may be a local device or the physical port.
1798 static inline void __attribute__((always_inline))
1799 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1800 uint32_t desc_idx, uint8_t need_copy)
1802 struct mbuf_table *tx_q;
1803 struct rte_mbuf **m_table;
1804 struct rte_mbuf *mbuf = NULL;
1805 unsigned len, ret, offset = 0;
1806 struct vpool *vpool;
1807 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1808 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1810 /*Add packet to the port tx queue*/
1811 tx_q = &tx_queue_zcp[vmdq_rx_q];
1814 /* Allocate an mbuf and populate the structure. */
1815 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1816 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1817 if (unlikely(mbuf == NULL)) {
1818 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1819 RTE_LOG(ERR, VHOST_DATA,
1820 "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1822 put_desc_to_used_list_zcp(vq, desc_idx);
1826 if (vm2vm_mode == VM2VM_HARDWARE) {
/* Avoid using a VLAN tag from any VM for an external packet, such as
* vlan_tags[dev->device_fh]; otherwise it conflicts during pool
* selection: the MAC address marks it as an external packet that
* should go to the network, while the VLAN tag marks it as a vm2vm
* packet that should be forwarded to another VM. The hardware cannot
* resolve such an ambiguous situation, so the packet would be lost.
*/
1834 vlan_tag = external_pkt_default_vlan_tag;
1835 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1836 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1837 __rte_mbuf_raw_free(mbuf);
1842 mbuf->nb_segs = m->nb_segs;
1843 mbuf->next = m->next;
1844 mbuf->data_len = m->data_len + offset;
1845 mbuf->pkt_len = mbuf->data_len;
1846 if (unlikely(need_copy)) {
1847 /* Copy the packet contents to the mbuf. */
1848 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1849 rte_pktmbuf_mtod(m, void *),
1852 mbuf->data_off = m->data_off;
1853 mbuf->buf_physaddr = m->buf_physaddr;
1854 mbuf->buf_addr = m->buf_addr;
1856 mbuf->ol_flags = PKT_TX_VLAN_PKT;
1857 mbuf->vlan_tci = vlan_tag;
1858 mbuf->l2_len = sizeof(struct ether_hdr);
1859 mbuf->l3_len = sizeof(struct ipv4_hdr);
1860 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1862 tx_q->m_table[len] = mbuf;
1865 LOG_DEBUG(VHOST_DATA,
1866 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1869 (mbuf->next == NULL) ? "null" : "non-null");
1872 dev_statistics[dev->device_fh].tx_total++;
1873 dev_statistics[dev->device_fh].tx++;
1876 if (unlikely(len == MAX_PKT_BURST)) {
1877 m_table = (struct rte_mbuf **)tx_q->m_table;
1878 ret = rte_eth_tx_burst(ports[0],
1879 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1882 * Free any buffers not handled by TX and update
1885 if (unlikely(ret < len)) {
1887 rte_pktmbuf_free(m_table[ret]);
1888 } while (++ret < len);
1892 txmbuf_clean_zcp(dev, vpool);
* This function TXes all available packets in the virtio TX queue for one
* virtio-net device. If it is the first packet, it learns the MAC address and
1905 static inline void __attribute__((always_inline))
1906 virtio_dev_tx_zcp(struct virtio_net *dev)
1909 struct vhost_virtqueue *vq;
1910 struct vring_desc *desc;
1911 uint64_t buff_addr = 0, phys_addr;
1912 uint32_t head[MAX_PKT_BURST];
1914 uint16_t free_entries, packet_success = 0;
1916 uint8_t need_copy = 0;
1918 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1920 vq = dev->virtqueue[VIRTIO_TXQ];
1921 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1923 /* If there are no available buffers then return. */
1924 if (vq->last_used_idx_res == avail_idx)
LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1929 /* Prefetch available ring to retrieve head indexes. */
1930 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1932 /* Get the number of free entries in the ring */
1933 free_entries = (avail_idx - vq->last_used_idx_res);
1935 /* Limit to MAX_PKT_BURST. */
1937 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1939 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1940 dev->device_fh, free_entries);
1942 /* Retrieve all of the head indexes first to avoid caching issues. */
1943 for (i = 0; i < free_entries; i++)
1945 = vq->avail->ring[(vq->last_used_idx_res + i)
1948 vq->last_used_idx_res += free_entries;
1950 /* Prefetch descriptor index. */
1951 rte_prefetch0(&vq->desc[head[packet_success]]);
1952 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1954 while (packet_success < free_entries) {
1955 desc = &vq->desc[head[packet_success]];
1957 /* Discard first buffer as it is the virtio header */
1958 desc = &vq->desc[desc->next];
1960 /* Buffer address translation. */
1961 buff_addr = gpa_to_vva(dev, desc->addr);
1962 /* Need check extra VLAN_HLEN size for inserting VLAN tag */
1963 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1966 if (likely(packet_success < (free_entries - 1)))
1967 /* Prefetch descriptor index. */
1968 rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1970 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1971 RTE_LOG(ERR, VHOST_DATA,
"(%"PRIu64") Invalid frame buffer address found "
"when transmitting packets!\n",
1979 /* Prefetch buffer address. */
1980 rte_prefetch0((void *)(uintptr_t)buff_addr);
1983 * Setup dummy mbuf. This is copied to a real mbuf if
1984 * transmitted out the physical port.
1986 m.data_len = desc->len;
1990 m.buf_addr = (void *)(uintptr_t)buff_addr;
1991 m.buf_physaddr = phys_addr;
1994 * Check if the frame buffer address from guest crosses
1995 * sub-region or not.
1997 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1998 RTE_LOG(ERR, VHOST_DATA,
"(%"PRIu64") Frame buffer address crossing "
"sub-region found when attaching TX frame "
2001 "buffer address!\n",
2007 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2010 * If this is the first received packet we need to learn
2011 * the MAC and setup VMDQ
2013 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2014 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2016 * Discard frame if device is scheduled for
2017 * removal or a duplicate MAC address is found.
2019 packet_success += free_entries;
2020 vq->last_used_idx += packet_success;
2025 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2031 * This function is called by each data core. It handles all RX/TX registered
2032 * with the core. For TX the specific lcore linked list is used. For RX, MAC
2033 * addresses are compared with all devices in the main linked list.
2036 switch_worker_zcp(__attribute__((unused)) void *arg)
2038 struct virtio_net *dev = NULL;
2039 struct vhost_dev *vdev = NULL;
2040 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2041 struct virtio_net_data_ll *dev_ll;
2042 struct mbuf_table *tx_q;
2043 volatile struct lcore_ll_info *lcore_ll;
2044 const uint64_t drain_tsc
2045 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2046 * BURST_TX_DRAIN_US;
2047 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2049 const uint16_t lcore_id = rte_lcore_id();
2050 uint16_t count_in_ring, rx_count = 0;
RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2054 lcore_ll = lcore_info[lcore_id].lcore_ll;
2058 cur_tsc = rte_rdtsc();
2060 /* TX burst queue drain */
2061 diff_tsc = cur_tsc - prev_tsc;
2062 if (unlikely(diff_tsc > drain_tsc)) {
2064 * Get mbuf from vpool.pool and detach mbuf and
2065 * put back into vpool.ring.
2067 dev_ll = lcore_ll->ll_root_used;
2068 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2069 /* Get virtio device ID */
2070 vdev = dev_ll->vdev;
2073 if (likely(!vdev->remove)) {
2074 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2076 LOG_DEBUG(VHOST_DATA,
2077 "TX queue drained after timeout"
2078 " with burst size %u\n",
2082 * Tx any packets in the queue
2084 ret = rte_eth_tx_burst(
2086 (uint16_t)tx_q->txq_id,
2087 (struct rte_mbuf **)
2089 (uint16_t)tx_q->len);
2090 if (unlikely(ret < tx_q->len)) {
2093 tx_q->m_table[ret]);
2094 } while (++ret < tx_q->len);
2098 txmbuf_clean_zcp(dev,
2099 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2102 dev_ll = dev_ll->next;
2107 rte_prefetch0(lcore_ll->ll_root_used);
2110 * Inform the configuration core that we have exited the linked
2111 * list and that no devices are in use if requested.
2113 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2114 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
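/*
 * Removal handshake: destroy_device() (below) unlinks a device and then
 * sets dev_removal_flag to REQUEST_DEV_REMOVAL on every lcore; answering
 * with ACK_DEV_REMOVAL here, between two passes over the list, guarantees
 * this data core no longer holds a pointer into the removed entry.
 */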
2116 /* Process devices */
2117 dev_ll = lcore_ll->ll_root_used;
2119 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2120 vdev = dev_ll->vdev;
2122 if (unlikely(vdev->remove)) {
2123 dev_ll = dev_ll->next;
2125 vdev->ready = DEVICE_SAFE_REMOVE;
2129 if (likely(vdev->ready == DEVICE_RX)) {
2130 uint32_t index = vdev->vmdq_rx_q;
2133 = rte_ring_count(vpool_array[index].ring);
2134 uint16_t free_entries
2135 = (uint16_t)get_available_ring_num_zcp(dev);
2138 * Attach all mbufs in vpool.ring and put back
2142 i < RTE_MIN(free_entries,
2143 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2145 attach_rxmbuf_zcp(dev);
2147 /* Handle guest RX */
2148 rx_count = rte_eth_rx_burst(ports[0],
2149 vdev->vmdq_rx_q, pkts_burst,
2153 ret_count = virtio_dev_rx_zcp(dev,
2154 pkts_burst, rx_count);
2156 dev_statistics[dev->device_fh].rx_total
2158 dev_statistics[dev->device_fh].rx
2161 while (likely(rx_count)) {
2164 pkts_burst[rx_count]);
2165 rte_ring_sp_enqueue(
2166 vpool_array[index].ring,
2167 (void *)pkts_burst[rx_count]);
2172 if (likely(!vdev->remove))
2173 /* Handle guest TX */
2174 virtio_dev_tx_zcp(dev);
2176 /* Move to the next device in the list */
2177 dev_ll = dev_ll->next;
2186 * Add an entry to a used linked list. A free entry must first be found
2187 * in the free linked list using get_data_ll_free_entry();
2190 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2191 struct virtio_net_data_ll *ll_dev)
2193 struct virtio_net_data_ll *ll = *ll_root_addr;
2195 /* Set next as NULL and use a compiler barrier to avoid reordering. */
2196 ll_dev->next = NULL;
2197 rte_compiler_barrier();
2199 /* If ll == NULL then this is the first device. */
2201 /* Increment to the tail of the linked list. */
2202 while (ll->next != NULL)
2207 *ll_root_addr = ll_dev;
2212 * Remove an entry from a used linked list. The entry must then be added to
2213 * the free linked list using put_data_ll_free_entry().
2216 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2217 struct virtio_net_data_ll *ll_dev,
2218 struct virtio_net_data_ll *ll_dev_last)
2220 struct virtio_net_data_ll *ll = *ll_root_addr;
2222 if (unlikely((ll == NULL) || (ll_dev == NULL)))
2226 *ll_root_addr = ll_dev->next;
2228 if (likely(ll_dev_last != NULL))
2229 ll_dev_last->next = ll_dev->next;
2231 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2235 * Find and return an entry from the free linked list.
2237 static struct virtio_net_data_ll *
2238 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2240 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2241 struct virtio_net_data_ll *ll_dev;
2243 if (ll_free == NULL)
2247 *ll_root_addr = ll_free->next;
2253 * Place an entry back on to the free linked list.
2256 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2257 struct virtio_net_data_ll *ll_dev)
2259 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2264 ll_dev->next = ll_free;
2265 *ll_root_addr = ll_dev;
2269 * Creates a linked list of a given size.
2271 static struct virtio_net_data_ll *
2272 alloc_data_ll(uint32_t size)
2274 struct virtio_net_data_ll *ll_new;
2277 /* Malloc and then chain the linked list. */
2278 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2279 if (ll_new == NULL) {
2280 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2284 for (i = 0; i < size - 1; i++) {
2285 ll_new[i].vdev = NULL;
2286 ll_new[i].next = &ll_new[i+1];
2288 ll_new[i].next = NULL;
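/*
 * Illustrative sketch (not part of the original example; the name
 * example_ll_usage is hypothetical): how the linked-list helpers above are
 * meant to be combined. An entry is taken from a free list with
 * get_data_ll_free_entry(), published on a used list with
 * add_data_ll_entry(), and later returned via rm_data_ll_entry() and
 * put_data_ll_free_entry(), mirroring what new_device()/destroy_device()
 * below do with ll_root_free/ll_root_used and the per-lcore lists.
 */
#if 0
static void
example_ll_usage(void)
{
	struct virtio_net_data_ll *example_free = alloc_data_ll(4);
	struct virtio_net_data_ll *example_used = NULL;
	struct virtio_net_data_ll *entry;

	if (example_free == NULL)
		return;

	/* Move one entry from the free list to the used list. */
	entry = get_data_ll_free_entry(&example_free);
	if (entry == NULL)
		return;
	add_data_ll_entry(&example_used, entry);

	/* ... the entry's vdev field would be used here ... */

	/* Entry is the list head, so the previous-entry argument is NULL. */
	rm_data_ll_entry(&example_used, entry, NULL);
	put_data_ll_free_entry(&example_free, entry);
}
#endif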
2294 * Create the main linked list along with each individual core's linked list. A used and a free list
2295 * are created to manage entries.
2302 RTE_LCORE_FOREACH_SLAVE(lcore) {
2303 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2304 if (lcore_info[lcore].lcore_ll == NULL) {
2305 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2309 lcore_info[lcore].lcore_ll->device_num = 0;
2310 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2311 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2312 if (num_devices % num_switching_cores)
2313 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2315 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2318 /* Allocate devices up to a maximum of MAX_DEVICES. */
2319 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2325 * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2326 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2327 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2330 destroy_device (volatile struct virtio_net *dev)
2332 struct virtio_net_data_ll *ll_lcore_dev_cur;
2333 struct virtio_net_data_ll *ll_main_dev_cur;
2334 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2335 struct virtio_net_data_ll *ll_main_dev_last = NULL;
2336 struct vhost_dev *vdev;
2339 dev->flags &= ~VIRTIO_DEV_RUNNING;
2341 vdev = (struct vhost_dev *)dev->priv;
2342 /* Set the remove flag. */
2344 while(vdev->ready != DEVICE_SAFE_REMOVE) {
2348 /* Search for entry to be removed from lcore ll */
2349 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2350 while (ll_lcore_dev_cur != NULL) {
2351 if (ll_lcore_dev_cur->vdev == vdev) {
2354 ll_lcore_dev_last = ll_lcore_dev_cur;
2355 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2359 if (ll_lcore_dev_cur == NULL) {
2360 RTE_LOG(ERR, VHOST_CONFIG,
2361 "(%"PRIu64") Failed to find the dev to be destroy.\n",
2366 /* Search for entry to be removed from main ll */
2367 ll_main_dev_cur = ll_root_used;
2368 ll_main_dev_last = NULL;
2369 while (ll_main_dev_cur != NULL) {
2370 if (ll_main_dev_cur->vdev == vdev) {
2373 ll_main_dev_last = ll_main_dev_cur;
2374 ll_main_dev_cur = ll_main_dev_cur->next;
2378 /* Remove entries from the lcore and main ll. */
2379 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2380 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2382 /* Set the dev_removal_flag on each lcore. */
2383 RTE_LCORE_FOREACH_SLAVE(lcore) {
2384 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2388 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2389 * they can no longer access the device removed from the linked lists and that the device
2390 * is no longer in use.
2392 RTE_LCORE_FOREACH_SLAVE(lcore) {
2393 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2398 /* Add the entries back to the lcore and main free ll.*/
2399 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2400 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2402 /* Decrement the number of devices on the lcore. */
2403 lcore_info[vdev->coreid].lcore_ll->device_num--;
2405 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2408 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2410 /* Stop the RX queue. */
2411 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2412 LOG_DEBUG(VHOST_CONFIG,
2413 "(%"PRIu64") In destroy_device: Failed to stop "
2419 LOG_DEBUG(VHOST_CONFIG,
2420 "(%"PRIu64") in destroy_device: Start put mbuf in "
2421 "mempool back to ring for RX queue: %d\n",
2422 dev->device_fh, vdev->vmdq_rx_q);
2424 mbuf_destroy_zcp(vpool);
2426 /* Stop the TX queue. */
2427 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2428 LOG_DEBUG(VHOST_CONFIG,
2429 "(%"PRIu64") In destroy_device: Failed to "
2430 "stop tx queue:%d\n",
2431 dev->device_fh, vdev->vmdq_rx_q);
2434 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2436 LOG_DEBUG(VHOST_CONFIG,
2437 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2438 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2439 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2442 mbuf_destroy_zcp(vpool);
2443 rte_free(vdev->regions_hpa);
2450 * Calculate the number of physically contiguous sub-regions within one particular
2451 * region whose vhost virtual address range is contiguous. The particular region
2452 * starts at vva_start, with a size of 'size' bytes given in the argument.
2455 check_hpa_regions(uint64_t vva_start, uint64_t size)
2457 uint32_t i, nregions = 0, page_size = getpagesize();
2458 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2459 if (vva_start % page_size) {
2460 LOG_DEBUG(VHOST_CONFIG,
2461 "in check_countinous: vva start(%p) mod page_size(%d) "
2463 (void *)(uintptr_t)vva_start, page_size);
2466 if (size % page_size) {
2467 LOG_DEBUG(VHOST_CONFIG,
2468 "in check_countinous: "
2469 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2473 for (i = 0; i < size - page_size; i = i + page_size) {
2475 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2476 next_phys_addr = rte_mem_virt2phy(
2477 (void *)(uintptr_t)(vva_start + i + page_size));
2478 if ((cur_phys_addr + page_size) != next_phys_addr) {
2480 LOG_DEBUG(VHOST_CONFIG,
2481 "in check_continuous: hva addr:(%p) is not "
2482 "continuous with hva addr:(%p), diff:%d\n",
2483 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2484 (void *)(uintptr_t)(vva_start + (uint64_t)i
2485 + page_size), page_size);
2486 LOG_DEBUG(VHOST_CONFIG,
2487 "in check_continuous: hpa addr:(%p) is not "
2488 "continuous with hpa addr:(%p), "
2489 "diff:(%"PRIu64")\n",
2490 (void *)(uintptr_t)cur_phys_addr,
2491 (void *)(uintptr_t)next_phys_addr,
2492 (next_phys_addr-cur_phys_addr));
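/*
 * Example (illustrative): for a 16 KB virtually contiguous area backed by
 * 4 KB pages at physical addresses 0x1000, 0x2000, 0x5000 and 0x6000, the
 * comparison above detects one break (0x2000 + 0x1000 != 0x5000), so the
 * area spans two physically contiguous sub-regions.
 */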
2499 * Divide each region whose vhost virtual address is contiguous into a few
2500 * sub-regions, making sure the physical addresses within each sub-region are
2501 * contiguous, and fill the offset (to GPA), size and other information of each
2502 * sub-region into regions_hpa.
2505 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2507 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2508 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2510 if (mem_region_hpa == NULL)
2513 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2514 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2515 virtio_memory->regions[regionidx].address_offset;
2516 mem_region_hpa[regionidx_hpa].guest_phys_address
2517 = virtio_memory->regions[regionidx].guest_phys_address;
2518 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2519 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2520 mem_region_hpa[regionidx_hpa].guest_phys_address;
2521 LOG_DEBUG(VHOST_CONFIG,
2522 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2525 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2526 LOG_DEBUG(VHOST_CONFIG,
2527 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2530 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2532 i < virtio_memory->regions[regionidx].memory_size -
2535 cur_phys_addr = rte_mem_virt2phy(
2536 (void *)(uintptr_t)(vva_start + i));
2537 next_phys_addr = rte_mem_virt2phy(
2538 (void *)(uintptr_t)(vva_start +
2540 if ((cur_phys_addr + page_size) != next_phys_addr) {
2541 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2542 mem_region_hpa[regionidx_hpa].guest_phys_address +
2544 mem_region_hpa[regionidx_hpa].memory_size
2546 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2547 "phys addr end [%d]:(%p)\n",
2550 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2551 LOG_DEBUG(VHOST_CONFIG,
2552 "in fill_hpa_regions: guest phys addr "
2556 (mem_region_hpa[regionidx_hpa].memory_size));
2557 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2558 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2560 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2562 mem_region_hpa[regionidx_hpa].guest_phys_address;
2563 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2564 " phys addr start[%d]:(%p)\n",
2567 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2568 LOG_DEBUG(VHOST_CONFIG,
2569 "in fill_hpa_regions: host phys addr "
2573 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2579 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2580 = mem_region_hpa[regionidx_hpa].guest_phys_address
2582 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2583 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2584 "[%d]:(%p)\n", regionidx_hpa,
2586 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2587 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2588 "[%d]:(%p)\n", regionidx_hpa,
2590 (mem_region_hpa[regionidx_hpa].memory_size));
2593 return regionidx_hpa;
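/*
 * Illustrative sketch (not part of the original example; the name
 * example_gpa_to_hpa is hypothetical): a simplified guest-physical to
 * host-physical lookup over the regions_hpa table filled in above. The
 * gpa_to_hpa() actually used by the TX/RX paths in this file additionally
 * classifies the address (valid, invalid, or crossing a sub-region
 * boundary); this sketch only shows the table walk itself.
 */
#if 0
static uint64_t
example_gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa)
{
	uint32_t i;
	const struct virtio_memory_regions_hpa *region;

	for (i = 0; i < vdev->nregions_hpa; i++) {
		region = &vdev->regions_hpa[i];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa < region->guest_phys_address_end))
			return guest_pa + region->host_phys_addr_offset;
	}

	return 0; /* Not covered by any sub-region. */
}
#endif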
2597 * A new device is added to a data core. First the device is added to the main linked list
2598 * and then allocated to a specific data core.
2601 new_device (struct virtio_net *dev)
2603 struct virtio_net_data_ll *ll_dev;
2604 int lcore, core_add = 0;
2605 uint32_t device_num_min = num_devices;
2606 struct vhost_dev *vdev;
2609 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2611 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2619 vdev->nregions_hpa = dev->mem->nregions;
2620 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2622 += check_hpa_regions(
2623 dev->mem->regions[regionidx].guest_phys_address
2624 + dev->mem->regions[regionidx].address_offset,
2625 dev->mem->regions[regionidx].memory_size);
2629 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2630 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2631 RTE_CACHE_LINE_SIZE);
2632 if (vdev->regions_hpa == NULL) {
2633 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2639 if (fill_hpa_memory_regions(
2640 vdev->regions_hpa, dev->mem
2641 ) != vdev->nregions_hpa) {
2643 RTE_LOG(ERR, VHOST_CONFIG,
2644 "hpa memory regions number mismatch: "
2645 "[%d]\n", vdev->nregions_hpa);
2646 rte_free(vdev->regions_hpa);
2653 /* Add device to main ll */
2654 ll_dev = get_data_ll_free_entry(&ll_root_free);
2655 if (ll_dev == NULL) {
2656 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2657 "of %d devices per core has been reached\n",
2658 dev->device_fh, num_devices);
2659 if (vdev->regions_hpa)
2660 rte_free(vdev->regions_hpa);
2664 ll_dev->vdev = vdev;
2665 add_data_ll_entry(&ll_root_used, ll_dev);
2667 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2670 uint32_t index = vdev->vmdq_rx_q;
2671 uint32_t count_in_ring, i;
2672 struct mbuf_table *tx_q;
2674 count_in_ring = rte_ring_count(vpool_array[index].ring);
2676 LOG_DEBUG(VHOST_CONFIG,
2677 "(%"PRIu64") in new_device: mbuf count in mempool "
2678 "before attach is: %d\n",
2680 rte_mempool_count(vpool_array[index].pool));
2681 LOG_DEBUG(VHOST_CONFIG,
2682 "(%"PRIu64") in new_device: mbuf count in ring "
2683 "before attach is : %d\n",
2684 dev->device_fh, count_in_ring);
2687 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2689 for (i = 0; i < count_in_ring; i++)
2690 attach_rxmbuf_zcp(dev);
2692 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2693 "mempool after attach is: %d\n",
2695 rte_mempool_count(vpool_array[index].pool));
2696 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2697 "ring after attach is : %d\n",
2699 rte_ring_count(vpool_array[index].ring));
2701 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2702 tx_q->txq_id = vdev->vmdq_rx_q;
2704 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2705 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2707 LOG_DEBUG(VHOST_CONFIG,
2708 "(%"PRIu64") In new_device: Failed to start "
2710 dev->device_fh, vdev->vmdq_rx_q);
2712 mbuf_destroy_zcp(vpool);
2713 rte_free(vdev->regions_hpa);
2718 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2719 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2721 LOG_DEBUG(VHOST_CONFIG,
2722 "(%"PRIu64") In new_device: Failed to start "
2724 dev->device_fh, vdev->vmdq_rx_q);
2726 /* Stop the TX queue. */
2727 if (rte_eth_dev_tx_queue_stop(ports[0],
2728 vdev->vmdq_rx_q) != 0) {
2729 LOG_DEBUG(VHOST_CONFIG,
2730 "(%"PRIu64") In new_device: Failed to "
2731 "stop tx queue:%d\n",
2732 dev->device_fh, vdev->vmdq_rx_q);
2735 mbuf_destroy_zcp(vpool);
2736 rte_free(vdev->regions_hpa);
2743 /* Reset the ready flag. */
2744 vdev->ready = DEVICE_MAC_LEARNING;
2747 /* Find a suitable lcore to add the device. */
2748 RTE_LCORE_FOREACH_SLAVE(lcore) {
2749 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2750 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2754 /* Add device to lcore ll */
2755 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2756 if (ll_dev == NULL) {
2757 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2758 vdev->ready = DEVICE_SAFE_REMOVE;
2759 destroy_device(dev);
2760 if (vdev->regions_hpa)
2761 rte_free(vdev->regions_hpa);
2765 ll_dev->vdev = vdev;
2766 vdev->coreid = core_add;
2768 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2770 /* Initialize device stats */
2771 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2773 /* Disable notifications. */
2774 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2775 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2776 lcore_info[vdev->coreid].lcore_ll->device_num++;
2777 dev->flags |= VIRTIO_DEV_RUNNING;
2779 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2785 * These callbacks allow devices to be added to the data core when configuration
2786 * has been fully completed.
2788 static const struct virtio_net_device_ops virtio_net_device_ops =
2790 .new_device = new_device,
2791 .destroy_device = destroy_device,
2795 * This is a thread that will wake up after a period to print stats if the user has
2801 struct virtio_net_data_ll *dev_ll;
2802 uint64_t tx_dropped, rx_dropped;
2803 uint64_t tx, tx_total, rx, rx_total;
2805 const char clr[] = { 27, '[', '2', 'J', '\0' };
2806 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
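/*
 * clr is the ANSI escape sequence ESC[2J (clear screen) and top_left is
 * ESC[1;1H (cursor to row 1, column 1), so the statistics are redrawn in
 * place on every refresh interval.
 */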
2809 sleep(enable_stats);
2811 /* Clear screen and move to top left */
2812 printf("%s%s", clr, top_left);
2814 printf("\nDevice statistics ====================================");
2816 dev_ll = ll_root_used;
2817 while (dev_ll != NULL) {
2818 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2819 tx_total = dev_statistics[device_fh].tx_total;
2820 tx = dev_statistics[device_fh].tx;
2821 tx_dropped = tx_total - tx;
2822 if (zero_copy == 0) {
2823 rx_total = rte_atomic64_read(
2824 &dev_statistics[device_fh].rx_total_atomic);
2825 rx = rte_atomic64_read(
2826 &dev_statistics[device_fh].rx_atomic);
2828 rx_total = dev_statistics[device_fh].rx_total;
2829 rx = dev_statistics[device_fh].rx;
2831 rx_dropped = rx_total - rx;
2833 printf("\nStatistics for device %"PRIu32" ------------------------------"
2834 "\nTX total: %"PRIu64""
2835 "\nTX dropped: %"PRIu64""
2836 "\nTX successful: %"PRIu64""
2837 "\nRX total: %"PRIu64""
2838 "\nRX dropped: %"PRIu64""
2839 "\nRX successful: %"PRIu64"",
2848 dev_ll = dev_ll->next;
2850 printf("\n======================================================\n");
2855 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2856 char *ring_name, uint32_t nb_mbuf)
2858 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2859 vpool_array[index].pool
2860 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2861 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2862 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2863 rte_pktmbuf_init, NULL, socket, 0);
2864 if (vpool_array[index].pool != NULL) {
2865 vpool_array[index].ring
2866 = rte_ring_create(ring_name,
2867 rte_align32pow2(nb_mbuf + 1),
2868 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2869 if (likely(vpool_array[index].ring != NULL)) {
2870 LOG_DEBUG(VHOST_CONFIG,
2871 "in setup_mempool_tbl: mbuf count in "
2873 rte_mempool_count(vpool_array[index].pool));
2874 LOG_DEBUG(VHOST_CONFIG,
2875 "in setup_mempool_tbl: mbuf count in "
2877 rte_ring_count(vpool_array[index].ring));
2879 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2883 /* Need to consider headroom. */
2884 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2886 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2892 * Main function, does initialisation and calls the per-lcore functions. The CUSE
2893 * device is also registered here to handle the IOCTLs.
2896 main(int argc, char *argv[])
2898 struct rte_mempool *mbuf_pool = NULL;
2899 unsigned lcore_id, core_id = 0;
2900 unsigned nb_ports, valid_num_ports;
2904 static pthread_t tid;
2907 ret = rte_eal_init(argc, argv);
2909 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2913 /* parse app arguments */
2914 ret = us_vhost_parse_args(argc, argv);
2916 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2918 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2919 if (rte_lcore_is_enabled(lcore_id))
2920 lcore_ids[core_id ++] = lcore_id;
2922 if (rte_lcore_count() > RTE_MAX_LCORE)
2923 rte_exit(EXIT_FAILURE,"Not enough cores\n");
2925 /* Set the number of switching cores available. */
2926 num_switching_cores = rte_lcore_count()-1;
2928 /* Get the number of physical ports. */
2929 nb_ports = rte_eth_dev_count();
2930 if (nb_ports > RTE_MAX_ETHPORTS)
2931 nb_ports = RTE_MAX_ETHPORTS;
2934 * Update the global var NUM_PORTS and global array PORTS,
2935 * and get the value of var VALID_NUM_PORTS according to the number of system ports.
2937 valid_num_ports = check_ports_num(nb_ports);
2939 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2940 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2941 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2945 if (zero_copy == 0) {
2946 /* Create the mbuf pool. */
2947 mbuf_pool = rte_mempool_create(
2951 MBUF_SIZE, MBUF_CACHE_SIZE,
2952 sizeof(struct rte_pktmbuf_pool_private),
2953 rte_pktmbuf_pool_init, NULL,
2954 rte_pktmbuf_init, NULL,
2955 rte_socket_id(), 0);
2956 if (mbuf_pool == NULL)
2957 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2959 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2960 vpool_array[queue_id].pool = mbuf_pool;
2962 if (vm2vm_mode == VM2VM_HARDWARE) {
2963 /* Enable VT loopback to let the L2 switch do VM-to-VM forwarding. */
2964 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2965 LOG_DEBUG(VHOST_CONFIG,
2966 "Enable loop back for L2 switch in vmdq.\n");
2970 char pool_name[RTE_MEMPOOL_NAMESIZE];
2971 char ring_name[RTE_MEMPOOL_NAMESIZE];
2973 nb_mbuf = num_rx_descriptor
2974 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2975 + num_switching_cores * MAX_PKT_BURST;
2977 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2978 snprintf(pool_name, sizeof(pool_name),
2979 "rxmbuf_pool_%u", queue_id);
2980 snprintf(ring_name, sizeof(ring_name),
2981 "rxmbuf_ring_%u", queue_id);
2982 setup_mempool_tbl(rte_socket_id(), queue_id,
2983 pool_name, ring_name, nb_mbuf);
2986 nb_mbuf = num_tx_descriptor
2987 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2988 + num_switching_cores * MAX_PKT_BURST;
2990 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2991 snprintf(pool_name, sizeof(pool_name),
2992 "txmbuf_pool_%u", queue_id);
2993 snprintf(ring_name, sizeof(ring_name),
2994 "txmbuf_ring_%u", queue_id);
2995 setup_mempool_tbl(rte_socket_id(),
2996 (queue_id + MAX_QUEUES),
2997 pool_name, ring_name, nb_mbuf);
3000 if (vm2vm_mode == VM2VM_HARDWARE) {
3001 /* Enable VT loopback to let the L2 switch do VM-to-VM forwarding. */
3002 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3003 LOG_DEBUG(VHOST_CONFIG,
3004 "Enable loop back for L2 switch in vmdq.\n");
3007 /* Set log level. */
3008 rte_set_log_level(LOG_LEVEL);
3010 /* initialize all ports */
3011 for (portid = 0; portid < nb_ports; portid++) {
3012 /* skip ports that are not enabled */
3013 if ((enabled_port_mask & (1 << portid)) == 0) {
3014 RTE_LOG(INFO, VHOST_PORT,
3015 "Skipping disabled port %d\n", portid);
3018 if (port_init(portid) != 0)
3019 rte_exit(EXIT_FAILURE,
3020 "Cannot initialize network ports\n");
3023 /* Initialise all linked lists. */
3024 if (init_data_ll() == -1)
3025 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3027 /* Initialize device stats */
3028 memset(&dev_statistics, 0, sizeof(dev_statistics));
3030 /* Enable stats if the user option is set. */
3032 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3034 /* Launch all data cores. */
3035 if (zero_copy == 0) {
3036 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3037 rte_eal_remote_launch(switch_worker,
3038 mbuf_pool, lcore_id);
3041 uint32_t count_in_mempool, index, i;
3042 for (index = 0; index < 2*MAX_QUEUES; index++) {
3043 /* For all RX and TX queues. */
3045 = rte_mempool_count(vpool_array[index].pool);
3048 * Transfer all un-attached mbufs from vpool.pool
3051 for (i = 0; i < count_in_mempool; i++) {
3052 struct rte_mbuf *mbuf
3053 = __rte_mbuf_raw_alloc(
3054 vpool_array[index].pool);
3055 rte_ring_sp_enqueue(vpool_array[index].ring,
3059 LOG_DEBUG(VHOST_CONFIG,
3060 "in main: mbuf count in mempool at initial "
3061 "is: %d\n", count_in_mempool);
3062 LOG_DEBUG(VHOST_CONFIG,
3063 "in main: mbuf count in ring at initial is :"
3065 rte_ring_count(vpool_array[index].ring));
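/*
 * In zero-copy mode the mbufs are parked, unattached, in vpool.ring at
 * startup; attach_rxmbuf_zcp() later pulls them from the ring, attaches a
 * guest-supplied buffer and returns them to vpool.pool for the RX path
 * (see the "Attach all mbufs in vpool.ring" comments above).
 */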
3068 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3069 rte_eal_remote_launch(switch_worker_zcp, NULL,
3074 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3076 /* Register CUSE device to handle IOCTLs. */
3077 ret = rte_vhost_driver_register((char *)&dev_basename);
3079 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3081 rte_vhost_driver_callback_register(&virtio_net_device_ops);
3083 /* Start CUSE session. */
3084 rte_vhost_driver_session_start();
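/*
 * rte_vhost_driver_session_start() normally does not return: the master
 * lcore stays here servicing vhost CUSE requests, which in turn trigger the
 * new_device()/destroy_device() callbacks registered above, while the slave
 * lcores keep running switch_worker() or switch_worker_zcp().
 */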