4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <arpa/inet.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
56 #define MAX_QUEUES 512
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
62 * Calculate the number of buffers needed per port
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
65 (num_switching_cores*MAX_PKT_BURST) + \
66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 (num_switching_cores*MBUF_CACHE_SIZE))
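/*
 * For example, assuming two switching cores and the defaults below
 * (RTE_TEST_RX_DESC_DEFAULT 1024, RTE_TEST_TX_DESC_DEFAULT 512,
 * MAX_PKT_BURST 32, MBUF_CACHE_SIZE 128), this works out to roughly
 * (512 * 1024) + (2 * 32) + (2 * 512) + (2 * 128) = 525632 mbufs per port.
 */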
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
 * No frame data buffers allocated from the host are required for the zero copy
 * implementation; the guest allocates the frame data buffers and vhost uses
 * them directly.
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
82 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
85 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
88 #define JUMBO_FRAME_MAX_SIZE 0x2600
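/* 0x2600 is 9728 bytes, large enough for a ~9 KB jumbo frame plus headers. */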
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
93 #define DEVICE_SAFE_REMOVE 2
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
 * These two macros need to be refined for the legacy and DPDK based front
 * ends: take the maximum number of vring avail descriptors/entries from the
 * guest, subtract MAX_PKT_BURST, and then adjust to a power of 2.
 * For the legacy front end there are 128 descriptors:
 * half for the virtio headers, the other half for the mbufs.
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 + sizeof(struct rte_mbuf)))
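/*
 * These 4 bytes are used by the zero copy path to stash the vring descriptor
 * index in the mbuf: attach_rxmbuf_zcp() and virtio_tx_route_zcp() store
 * desc_idx here, and txmbuf_clean_zcp() reads it back when returning the
 * descriptor to the used ring.
 */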
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
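/*
 * For example, POWEROF2(64) is true since (63 & 64) == 0, while POWEROF2(96)
 * is false since (95 & 96) == 0x40. Note that the macro also reports true
 * for x == 0.
 */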
122 #define INVALID_PORT_ID 0xFF
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
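/*
 * With a typical 64-byte cache line and the 16-byte struct vring_desc
 * (8-byte addr, 4-byte len, 2-byte flags, 2-byte next), this evaluates
 * to 4 descriptors per cache line.
 */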
142 #define MBUF_EXT_MEM(mb) (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
144 /* mask of enabled ports */
145 static uint32_t enabled_port_mask = 0;
147 /* Promiscuous mode */
148 static uint32_t promiscuous;
/* Number of switching cores enabled. */
151 static uint32_t num_switching_cores = 0;
/* Number of devices/queues to support. */
154 static uint32_t num_queues = 0;
155 static uint32_t num_devices;
 * Enable zero copy: packet buffers are DMA'd directly to/from guest memory
 * via the HW descriptors. Disabled by default.
161 static uint32_t zero_copy;
162 static int mergeable;
/* Do VLAN strip on the host, enabled by default. */
165 static uint32_t vlan_strip = 1;
167 /* number of descriptors to apply*/
168 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
169 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
/* Max ring descriptors: ixgbe, i40e and e1000 all support up to 4096. */
172 #define MAX_RING_DESC 4096
static struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
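/*
 * The first MAX_QUEUES entries serve as RX pools (attach_rxmbuf_zcp() indexes
 * vpool_array[vdev->vmdq_rx_q]), while the second MAX_QUEUES entries serve as
 * TX pools (virtio_tx_route_zcp() indexes vpool_array[MAX_QUEUES + vmdq_rx_q]).
 */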
/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;
198 static uint32_t enable_stats = 0;
199 /* Enable retries on RX. */
200 static uint32_t enable_retry = 1;
/* Specify the timeout (in microseconds) between retries on RX. */
202 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
203 /* Specify the number of retries on RX. */
204 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
206 /* Character device basename. Can be set by user. */
207 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
/* Empty VMDQ configuration structure. Filled in programmatically. */
210 static struct rte_eth_conf vmdq_conf_default = {
212 .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
214 .header_split = 0, /**< Header Split disabled */
215 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
216 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
 * It is necessary for 1G NICs such as the I350;
 * this fixes a bug where IPv4 forwarding in the guest cannot
 * forward packets from one virtio device to another virtio device.
222 .hw_vlan_strip = 1, /**< VLAN strip enabled. */
223 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */
224 .hw_strip_crc = 0, /**< CRC stripped by hardware */
228 .mq_mode = ETH_MQ_TX_NONE,
232 * should be overridden separately in code with
236 .nb_queue_pools = ETH_8_POOLS,
237 .enable_default_pool = 0,
240 .pool_map = {{0, 0},},
245 static unsigned lcore_ids[RTE_MAX_LCORE];
246 static uint8_t ports[RTE_MAX_ETHPORTS];
247 static unsigned num_ports = 0; /**< The number of ports specified in command line */
248 static uint16_t num_pf_queues, num_vmdq_queues;
249 static uint16_t vmdq_pool_base, vmdq_queue_base;
250 static uint16_t queues_per_pool;
252 static const uint16_t external_pkt_default_vlan_tag = 2000;
253 const uint16_t vlan_tags[] = {
254 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
255 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
256 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
257 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
258 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
259 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
260 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
261 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
264 /* ethernet addresses of ports */
265 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
267 /* heads for the main used and free linked lists for the data path. */
268 static struct virtio_net_data_ll *ll_root_used = NULL;
269 static struct virtio_net_data_ll *ll_root_free = NULL;
271 /* Array of data core structures containing information on individual core linked lists. */
272 static struct lcore_info lcore_info[RTE_MAX_LCORE];
274 /* Used for queueing bursts of TX packets. */
278 struct rte_mbuf *m_table[MAX_PKT_BURST];
281 /* TX queue for each data core. */
282 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
/* TX queue for each virtio device for zero copy. */
285 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
287 /* Vlan header struct used to insert vlan tags on TX. */
289 unsigned char h_dest[ETH_ALEN];
290 unsigned char h_source[ETH_ALEN];
293 __be16 h_vlan_encapsulated_proto;
298 uint8_t version_ihl; /**< version and header length */
299 uint8_t type_of_service; /**< type of service */
300 uint16_t total_length; /**< length of packet */
301 uint16_t packet_id; /**< packet ID */
302 uint16_t fragment_offset; /**< fragmentation offset */
303 uint8_t time_to_live; /**< time to live */
304 uint8_t next_proto_id; /**< protocol ID */
305 uint16_t hdr_checksum; /**< header checksum */
306 uint32_t src_addr; /**< source address */
307 uint32_t dst_addr; /**< destination address */
308 } __attribute__((__packed__));
/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18
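/*
 * 18 = 6 (destination MAC) + 6 (source MAC) + 4 (802.1Q VLAN tag) + 2
 * (EtherType), i.e. the Ethernet header length including one VLAN tag.
 */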
314 /* Per-device statistics struct */
315 struct device_statistics {
317 rte_atomic64_t rx_total_atomic;
320 rte_atomic64_t rx_atomic;
322 } __rte_cache_aligned;
323 struct device_statistics dev_statistics[MAX_DEVICES];
326 * Builds up the correct configuration for VMDQ VLAN pool map
327 * according to the pool & queue limits.
330 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
332 struct rte_eth_vmdq_rx_conf conf;
333 struct rte_eth_vmdq_rx_conf *def_conf =
334 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
337 memset(&conf, 0, sizeof(conf));
338 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
339 conf.nb_pool_maps = num_devices;
340 conf.enable_loop_back = def_conf->enable_loop_back;
341 conf.rx_mode = def_conf->rx_mode;
343 for (i = 0; i < conf.nb_pool_maps; i++) {
344 conf.pool_map[i].vlan_id = vlan_tags[ i ];
345 conf.pool_map[i].pools = (1UL << i);
348 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
349 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf,
350 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
360 validate_num_devices(uint32_t max_nb_devices)
362 if (num_devices > max_nb_devices) {
363 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
370 * Initialises a given port using global settings and with the rx buffers
371 * coming from the mbuf_pool passed as parameter
374 port_init(uint8_t port)
376 struct rte_eth_dev_info dev_info;
377 struct rte_eth_conf port_conf;
378 struct rte_eth_rxconf *rxconf;
379 struct rte_eth_txconf *txconf;
380 int16_t rx_rings, tx_rings;
381 uint16_t rx_ring_size, tx_ring_size;
385 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
386 rte_eth_dev_info_get (port, &dev_info);
388 if (dev_info.max_rx_queues > MAX_QUEUES) {
389 rte_exit(EXIT_FAILURE,
390 "please define MAX_QUEUES no less than %u in %s\n",
391 dev_info.max_rx_queues, __FILE__);
394 rxconf = &dev_info.default_rxconf;
395 txconf = &dev_info.default_txconf;
396 rxconf->rx_drop_en = 1;
398 /* Enable vlan offload */
399 txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
 * Zero copy defers queue RX/TX start to the time when the guest
 * finishes its startup and packet buffers from that guest are available.
407 rxconf->rx_deferred_start = 1;
408 rxconf->rx_drop_en = 0;
409 txconf->tx_deferred_start = 1;
412 /*configure the number of supported virtio devices based on VMDQ limits */
413 num_devices = dev_info.max_vmdq_pools;
416 rx_ring_size = num_rx_descriptor;
417 tx_ring_size = num_tx_descriptor;
418 tx_rings = dev_info.max_tx_queues;
420 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
421 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
422 tx_rings = (uint16_t)rte_lcore_count();
425 retval = validate_num_devices(MAX_DEVICES);
429 /* Get port configuration. */
430 retval = get_eth_conf(&port_conf, num_devices);
433 /* NIC queues are divided into pf queues and vmdq queues. */
434 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
435 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
436 num_vmdq_queues = num_devices * queues_per_pool;
437 num_queues = num_pf_queues + num_vmdq_queues;
438 vmdq_queue_base = dev_info.vmdq_queue_base;
439 vmdq_pool_base = dev_info.vmdq_pool_base;
440 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
441 num_pf_queues, num_devices, queues_per_pool);
443 if (port >= rte_eth_dev_count()) return -1;
445 rx_rings = (uint16_t)dev_info.max_rx_queues;
446 /* Configure ethernet device. */
447 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
451 /* Setup the queues. */
452 for (q = 0; q < rx_rings; q ++) {
453 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
454 rte_eth_dev_socket_id(port),
456 vpool_array[q].pool);
460 for (q = 0; q < tx_rings; q ++) {
461 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
462 rte_eth_dev_socket_id(port),
468 /* Start the device. */
469 retval = rte_eth_dev_start(port);
471 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
476 rte_eth_promiscuous_enable(port);
478 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
479 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
480 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
481 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
483 vmdq_ports_eth_addr[port].addr_bytes[0],
484 vmdq_ports_eth_addr[port].addr_bytes[1],
485 vmdq_ports_eth_addr[port].addr_bytes[2],
486 vmdq_ports_eth_addr[port].addr_bytes[3],
487 vmdq_ports_eth_addr[port].addr_bytes[4],
488 vmdq_ports_eth_addr[port].addr_bytes[5]);
494 * Set character device basename.
497 us_vhost_parse_basename(const char *q_arg)
/* Parse the basename string. */
if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
	return -1;

snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
510 * Parse the portmask provided at run time.
513 parse_portmask(const char *portmask)
520 /* parse hexadecimal string */
521 pm = strtoul(portmask, &end, 16);
522 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
 * Parse numeric options at run time.
536 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
543 /* parse unsigned int string */
544 num = strtoul(q_arg, &end, 10);
545 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
548 if (num > max_valid_value)
559 us_vhost_usage(const char *prgname)
561 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
563 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
564 " --dev-basename <name>\n"
566 " -p PORTMASK: Set mask for ports to be used by application\n"
567 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
568 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
569 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
570 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
571 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
572 " --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
573 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
574 " --dev-basename: The basename to be used for the character device.\n"
575 " --zero-copy [0|1]: disable(default)/enable rx/tx "
577 " --rx-desc-num [0-N]: the number of descriptors on rx, "
578 "used only when zero copy is enabled.\n"
579 " --tx-desc-num [0-N]: the number of descriptors on tx, "
580 "used only when zero copy is enabled.\n",
585 * Parse the arguments given in the command line of the application.
588 us_vhost_parse_args(int argc, char **argv)
593 const char *prgname = argv[0];
594 static struct option long_option[] = {
595 {"vm2vm", required_argument, NULL, 0},
596 {"rx-retry", required_argument, NULL, 0},
597 {"rx-retry-delay", required_argument, NULL, 0},
598 {"rx-retry-num", required_argument, NULL, 0},
599 {"mergeable", required_argument, NULL, 0},
600 {"vlan-strip", required_argument, NULL, 0},
601 {"stats", required_argument, NULL, 0},
602 {"dev-basename", required_argument, NULL, 0},
603 {"zero-copy", required_argument, NULL, 0},
604 {"rx-desc-num", required_argument, NULL, 0},
605 {"tx-desc-num", required_argument, NULL, 0},
609 /* Parse command line */
610 while ((opt = getopt_long(argc, argv, "p:P",
611 long_option, &option_index)) != EOF) {
615 enabled_port_mask = parse_portmask(optarg);
616 if (enabled_port_mask == 0) {
617 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
618 us_vhost_usage(prgname);
625 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
626 ETH_VMDQ_ACCEPT_BROADCAST |
627 ETH_VMDQ_ACCEPT_MULTICAST;
628 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
633 /* Enable/disable vm2vm comms. */
634 if (!strncmp(long_option[option_index].name, "vm2vm",
636 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
638 RTE_LOG(INFO, VHOST_CONFIG,
639 "Invalid argument for "
641 us_vhost_usage(prgname);
644 vm2vm_mode = (vm2vm_type)ret;
648 /* Enable/disable retries on RX. */
649 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
650 ret = parse_num_opt(optarg, 1);
652 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
653 us_vhost_usage(prgname);
/* Specify the retry delay time (in microseconds) on RX. */
661 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
662 ret = parse_num_opt(optarg, INT32_MAX);
664 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
665 us_vhost_usage(prgname);
668 burst_rx_delay_time = ret;
/* Specify the number of retries on RX. */
673 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
674 ret = parse_num_opt(optarg, INT32_MAX);
676 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
677 us_vhost_usage(prgname);
680 burst_rx_retry_num = ret;
684 /* Enable/disable RX mergeable buffers. */
685 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
686 ret = parse_num_opt(optarg, 1);
688 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
689 us_vhost_usage(prgname);
694 vmdq_conf_default.rxmode.jumbo_frame = 1;
695 vmdq_conf_default.rxmode.max_rx_pkt_len
696 = JUMBO_FRAME_MAX_SIZE;
701 /* Enable/disable RX VLAN strip on host. */
702 if (!strncmp(long_option[option_index].name,
703 "vlan-strip", MAX_LONG_OPT_SZ)) {
704 ret = parse_num_opt(optarg, 1);
706 RTE_LOG(INFO, VHOST_CONFIG,
707 "Invalid argument for VLAN strip [0|1]\n");
708 us_vhost_usage(prgname);
712 vmdq_conf_default.rxmode.hw_vlan_strip =
717 /* Enable/disable stats. */
718 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
719 ret = parse_num_opt(optarg, INT32_MAX);
721 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
722 us_vhost_usage(prgname);
729 /* Set character device basename. */
730 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
731 if (us_vhost_parse_basename(optarg) == -1) {
732 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
733 us_vhost_usage(prgname);
738 /* Enable/disable rx/tx zero copy. */
739 if (!strncmp(long_option[option_index].name,
740 "zero-copy", MAX_LONG_OPT_SZ)) {
741 ret = parse_num_opt(optarg, 1);
743 RTE_LOG(INFO, VHOST_CONFIG,
745 " for zero-copy [0|1]\n");
746 us_vhost_usage(prgname);
752 /* Specify the descriptor number on RX. */
753 if (!strncmp(long_option[option_index].name,
754 "rx-desc-num", MAX_LONG_OPT_SZ)) {
755 ret = parse_num_opt(optarg, MAX_RING_DESC);
756 if ((ret == -1) || (!POWEROF2(ret))) {
757 RTE_LOG(INFO, VHOST_CONFIG,
758 "Invalid argument for rx-desc-num[0-N],"
759 "power of 2 required.\n");
760 us_vhost_usage(prgname);
763 num_rx_descriptor = ret;
767 /* Specify the descriptor number on TX. */
768 if (!strncmp(long_option[option_index].name,
769 "tx-desc-num", MAX_LONG_OPT_SZ)) {
770 ret = parse_num_opt(optarg, MAX_RING_DESC);
771 if ((ret == -1) || (!POWEROF2(ret))) {
772 RTE_LOG(INFO, VHOST_CONFIG,
773 "Invalid argument for tx-desc-num [0-N],"
774 "power of 2 required.\n");
775 us_vhost_usage(prgname);
778 num_tx_descriptor = ret;
784 /* Invalid option - print options. */
786 us_vhost_usage(prgname);
791 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
792 if (enabled_port_mask & (1 << i))
793 ports[num_ports++] = (uint8_t)i;
796 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
	"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
802 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
803 RTE_LOG(INFO, VHOST_PORT,
804 "Vhost zero copy doesn't support software vm2vm,"
805 "please specify 'vm2vm 2' to use hardware vm2vm.\n");
809 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
810 RTE_LOG(INFO, VHOST_PORT,
811 "Vhost zero copy doesn't support jumbo frame,"
812 "please specify '--mergeable 0' to disable the "
813 "mergeable feature.\n");
 * Update the global variable num_ports and the ports array according to the
 * number of ports in the system, and return the number of valid ports.
824 static unsigned check_ports_num(unsigned nb_ports)
826 unsigned valid_num_ports = num_ports;
829 if (num_ports > nb_ports) {
830 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
831 num_ports, nb_ports);
832 num_ports = nb_ports;
835 for (portid = 0; portid < num_ports; portid ++) {
836 if (ports[portid] >= nb_ports) {
837 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
838 ports[portid], (nb_ports - 1));
839 ports[portid] = INVALID_PORT_ID;
843 return valid_num_ports;
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
851 #define PRINT_PACKET(device, addr, size, header) do { \
852 char *pkt_addr = (char*)(addr); \
853 unsigned int index; \
854 char packet[MAX_PRINT_BUFF]; \
857 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
859 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
860 for (index = 0; index < (size); index++) { \
861 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
862 "%02hhx ", pkt_addr[index]); \
864 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
866 LOG_DEBUG(VHOST_DATA, "%s", packet); \
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
873 * Function to convert guest physical addresses to vhost physical addresses.
874 * This is used to convert virtio buffer addresses.
876 static inline uint64_t __attribute__((always_inline))
877 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
878 uint32_t buf_len, hpa_type *addr_type)
880 struct virtio_memory_regions_hpa *region;
882 uint64_t vhost_pa = 0;
884 *addr_type = PHYS_ADDR_INVALID;
886 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
887 region = &vdev->regions_hpa[regionidx];
888 if ((guest_pa >= region->guest_phys_address) &&
889 (guest_pa <= region->guest_phys_address_end)) {
890 vhost_pa = region->host_phys_addr_offset + guest_pa;
891 if (likely((guest_pa + buf_len - 1)
892 <= region->guest_phys_address_end))
893 *addr_type = PHYS_ADDR_CONTINUOUS;
895 *addr_type = PHYS_ADDR_CROSS_SUBREG;
900 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
901 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
902 (void *)(uintptr_t)vhost_pa);
908 * Compares a packet destination MAC address to a device MAC address.
910 static inline int __attribute__((always_inline))
911 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
913 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
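/*
 * Each 6-byte MAC address is read with a 64-bit load; MAC_ADDR_CMP
 * (0xFFFFFFFFFFFF) masks off the two trailing bytes (assuming a
 * little-endian CPU), so only the 48 MAC bits are compared.
 */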
 * This function learns the MAC address of the device and registers it along
 * with a VLAN tag in a VMDQ pool.
921 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
923 struct ether_hdr *pkt_hdr;
924 struct virtio_net_data_ll *dev_ll;
925 struct virtio_net *dev = vdev->dev;
928 /* Learn MAC address of guest device from packet */
929 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
931 dev_ll = ll_root_used;
933 while (dev_ll != NULL) {
934 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
935 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
938 dev_ll = dev_ll->next;
941 for (i = 0; i < ETHER_ADDR_LEN; i++)
942 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
944 /* vlan_tag currently uses the device_id. */
945 vdev->vlan_tag = vlan_tags[dev->device_fh];
947 /* Print out VMDQ registration info. */
948 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
950 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
951 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
952 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
955 /* Register the MAC address. */
956 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
957 (uint32_t)dev->device_fh + vmdq_pool_base);
959 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
962 /* Enable stripping of the vlan tag as we handle routing. */
964 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
965 (uint16_t)vdev->vmdq_rx_q, 1);
967 /* Set device as ready for RX. */
968 vdev->ready = DEVICE_RX;
974 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
975 * queue before disabling RX on the device.
978 unlink_vmdq(struct vhost_dev *vdev)
982 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
984 if (vdev->ready == DEVICE_RX) {
985 /*clear MAC and VLAN settings*/
986 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
987 for (i = 0; i < 6; i++)
988 vdev->mac_address.addr_bytes[i] = 0;
992 /*Clear out the receive buffers*/
993 rx_count = rte_eth_rx_burst(ports[0],
994 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
997 for (i = 0; i < rx_count; i++)
998 rte_pktmbuf_free(pkts_burst[i]);
1000 rx_count = rte_eth_rx_burst(ports[0],
1001 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1004 vdev->ready = DEVICE_MAC_LEARNING;
 * Check if the packet destination MAC address is for a local device. If so,
 * put the packet on that device's RX queue. If not, return.
1012 static inline int __attribute__((always_inline))
1013 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1015 struct virtio_net_data_ll *dev_ll;
1016 struct ether_hdr *pkt_hdr;
1018 struct virtio_net *dev = vdev->dev;
struct virtio_net *tdev; /* destination virtio device */
1021 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1023 /*get the used devices list*/
1024 dev_ll = ll_root_used;
1026 while (dev_ll != NULL) {
1027 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1028 &dev_ll->vdev->mac_address)) {
1030 /* Drop the packet if the TX packet is destined for the TX device. */
1031 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1032 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1036 tdev = dev_ll->vdev->dev;
1039 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1041 if (unlikely(dev_ll->vdev->remove)) {
1042 /*drop the packet if the device is marked for removal*/
1043 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1045 /*send the packet to the local virtio device*/
1046 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1049 &dev_statistics[tdev->device_fh].rx_total_atomic,
1052 &dev_statistics[tdev->device_fh].rx_atomic,
1054 dev_statistics[tdev->device_fh].tx_total++;
1055 dev_statistics[tdev->device_fh].tx += ret;
1061 dev_ll = dev_ll->next;
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * get its VLAN tag and the length offset.
1071 static inline int __attribute__((always_inline))
1072 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1073 uint32_t *offset, uint16_t *vlan_tag)
1075 struct virtio_net_data_ll *dev_ll = ll_root_used;
1076 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1078 while (dev_ll != NULL) {
1079 if ((dev_ll->vdev->ready == DEVICE_RX)
1080 && ether_addr_cmp(&(pkt_hdr->d_addr),
1081 &dev_ll->vdev->mac_address)) {
1083 * Drop the packet if the TX packet is
1084 * destined for the TX device.
1086 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1087 LOG_DEBUG(VHOST_DATA,
1088 "(%"PRIu64") TX: Source and destination"
1089 " MAC addresses are the same. Dropping "
1091 dev_ll->vdev->dev->device_fh);
 * HW VLAN strip will reduce the packet length by the length of the
 * VLAN tag, so we need to restore the packet length by adding it
 * back.
1100 *offset = VLAN_HLEN;
1103 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1105 LOG_DEBUG(VHOST_DATA,
1106 "(%"PRIu64") TX: pkt to local VM device id:"
1107 "(%"PRIu64") vlan tag: %d.\n",
1108 dev->device_fh, dev_ll->vdev->dev->device_fh,
1113 dev_ll = dev_ll->next;
1119 * This function routes the TX packet to the correct interface. This may be a local device
1120 * or the physical port.
1122 static inline void __attribute__((always_inline))
1123 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1125 struct mbuf_table *tx_q;
1126 struct rte_mbuf **m_table;
1127 unsigned len, ret, offset = 0;
1128 const uint16_t lcore_id = rte_lcore_id();
1129 struct virtio_net *dev = vdev->dev;
1130 struct ether_hdr *nh;
1132 /*check if destination is local VM*/
1133 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1134 rte_pktmbuf_free(m);
1138 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1139 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1140 rte_pktmbuf_free(m);
1145 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1147 /*Add packet to the port tx queue*/
1148 tx_q = &lcore_tx_queue[lcore_id];
1151 nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1152 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1153 /* Guest has inserted the vlan tag. */
1154 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1155 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1156 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1157 (vh->vlan_tci != vlan_tag_be))
1158 vh->vlan_tci = vlan_tag_be;
1160 m->ol_flags = PKT_TX_VLAN_PKT;
 * Find the right segment in which to adjust data_len when the offset is
 * bigger than the tailroom size.
1166 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1167 if (likely(offset <= rte_pktmbuf_tailroom(m)))
1168 m->data_len += offset;
1170 struct rte_mbuf *seg = m;
1172 while ((seg->next != NULL) &&
1173 (offset > rte_pktmbuf_tailroom(seg)))
1176 seg->data_len += offset;
1178 m->pkt_len += offset;
1181 m->vlan_tci = vlan_tag;
1184 tx_q->m_table[len] = m;
1187 dev_statistics[dev->device_fh].tx_total++;
1188 dev_statistics[dev->device_fh].tx++;
1191 if (unlikely(len == MAX_PKT_BURST)) {
1192 m_table = (struct rte_mbuf **)tx_q->m_table;
1193 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1194 /* Free any buffers not handled by TX and update the port stats. */
1195 if (unlikely(ret < len)) {
1197 rte_pktmbuf_free(m_table[ret]);
1198 } while (++ret < len);
1208 * This function is called by each data core. It handles all RX/TX registered with the
1209 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1210 * with all devices in the main linked list.
1213 switch_worker(__attribute__((unused)) void *arg)
1215 struct rte_mempool *mbuf_pool = arg;
1216 struct virtio_net *dev = NULL;
1217 struct vhost_dev *vdev = NULL;
1218 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1219 struct virtio_net_data_ll *dev_ll;
1220 struct mbuf_table *tx_q;
1221 volatile struct lcore_ll_info *lcore_ll;
1222 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1223 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1225 const uint16_t lcore_id = rte_lcore_id();
1226 const uint16_t num_cores = (uint16_t)rte_lcore_count();
1227 uint16_t rx_count = 0;
RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1232 lcore_ll = lcore_info[lcore_id].lcore_ll;
1235 tx_q = &lcore_tx_queue[lcore_id];
1236 for (i = 0; i < num_cores; i ++) {
1237 if (lcore_ids[i] == lcore_id) {
1244 cur_tsc = rte_rdtsc();
1246 * TX burst queue drain
1248 diff_tsc = cur_tsc - prev_tsc;
1249 if (unlikely(diff_tsc > drain_tsc)) {
1252 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1254 /*Tx any packets in the queue*/
1255 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1256 (struct rte_mbuf **)tx_q->m_table,
1257 (uint16_t)tx_q->len);
1258 if (unlikely(ret < tx_q->len)) {
1260 rte_pktmbuf_free(tx_q->m_table[ret]);
1261 } while (++ret < tx_q->len);
1271 rte_prefetch0(lcore_ll->ll_root_used);
1273 * Inform the configuration core that we have exited the linked list and that no devices are
1274 * in use if requested.
1276 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1277 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1282 dev_ll = lcore_ll->ll_root_used;
1284 while (dev_ll != NULL) {
1285 /*get virtio device ID*/
1286 vdev = dev_ll->vdev;
1289 if (unlikely(vdev->remove)) {
1290 dev_ll = dev_ll->next;
1292 vdev->ready = DEVICE_SAFE_REMOVE;
1295 if (likely(vdev->ready == DEVICE_RX)) {
1297 rx_count = rte_eth_rx_burst(ports[0],
1298 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
 * If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
 * Here MAX_PKT_BURST must be less than the virtio queue size.
1305 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1306 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1307 rte_delay_us(burst_rx_delay_time);
1308 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1312 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1315 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1318 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1320 while (likely(rx_count)) {
1322 rte_pktmbuf_free(pkts_burst[rx_count]);
1328 if (likely(!vdev->remove)) {
1329 /* Handle guest TX*/
1330 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1331 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1332 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1333 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1335 rte_pktmbuf_free(pkts_burst[--tx_count]);
1339 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1342 /*move to the next device in the list*/
1343 dev_ll = dev_ll->next;
 * This function gets the number of available ring entries for zero copy RX.
 * Only one thread calls this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
1355 static inline uint32_t __attribute__((always_inline))
1356 get_available_ring_num_zcp(struct virtio_net *dev)
1358 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1361 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1362 return (uint32_t)(avail_idx - vq->last_used_idx_res);
 * This function gets the available ring index for zero copy RX;
 * it retries up to 'burst_rx_retry_num' times until it gets enough ring entries.
 * Only one thread calls this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
1371 static inline uint32_t __attribute__((always_inline))
1372 get_available_ring_index_zcp(struct virtio_net *dev,
1373 uint16_t *res_base_idx, uint32_t count)
1375 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1378 uint16_t free_entries;
1380 *res_base_idx = vq->last_used_idx_res;
1381 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1382 free_entries = (avail_idx - *res_base_idx);
1384 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1386 "res base idx:%d, free entries:%d\n",
1387 dev->device_fh, avail_idx, *res_base_idx,
1391 * If retry is enabled and the queue is full then we wait
1392 * and retry to avoid packet loss.
1394 if (enable_retry && unlikely(count > free_entries)) {
1395 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1396 rte_delay_us(burst_rx_delay_time);
1397 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1398 free_entries = (avail_idx - *res_base_idx);
1399 if (count <= free_entries)
1404 /*check that we have enough buffers*/
1405 if (unlikely(count > free_entries))
1406 count = free_entries;
1408 if (unlikely(count == 0)) {
1409 LOG_DEBUG(VHOST_DATA,
1410 "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1411 "avail idx: %d, res base idx:%d, free entries:%d\n",
1412 dev->device_fh, avail_idx,
1413 *res_base_idx, free_entries);
1417 vq->last_used_idx_res = *res_base_idx + count;
 * This function puts a descriptor back on the used list.
1425 static inline void __attribute__((always_inline))
1426 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1428 uint16_t res_cur_idx = vq->last_used_idx;
1429 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1430 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1431 rte_compiler_barrier();
1432 *(volatile uint16_t *)&vq->used->idx += 1;
1433 vq->last_used_idx += 1;
1435 /* Kick the guest if necessary. */
1436 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1437 eventfd_write((int)vq->callfd, 1);
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and then attaches them together. It needs
 * to adjust the offset of buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be placed at the wrong
 * location in the mbuf.
1446 static inline void __attribute__((always_inline))
1447 attach_rxmbuf_zcp(struct virtio_net *dev)
1449 uint16_t res_base_idx, desc_idx;
1450 uint64_t buff_addr, phys_addr;
1451 struct vhost_virtqueue *vq;
1452 struct vring_desc *desc;
1453 struct rte_mbuf *mbuf = NULL;
1454 struct vpool *vpool;
1456 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1458 vpool = &vpool_array[vdev->vmdq_rx_q];
1459 vq = dev->virtqueue[VIRTIO_RXQ];
1462 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1465 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1467 desc = &vq->desc[desc_idx];
1468 if (desc->flags & VRING_DESC_F_NEXT) {
1469 desc = &vq->desc[desc->next];
1470 buff_addr = gpa_to_vva(dev, desc->addr);
1471 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1474 buff_addr = gpa_to_vva(dev,
1475 desc->addr + vq->vhost_hlen);
1476 phys_addr = gpa_to_hpa(vdev,
1477 desc->addr + vq->vhost_hlen,
1478 desc->len, &addr_type);
1481 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1482 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1483 " address found when attaching RX frame buffer"
1484 " address!\n", dev->device_fh);
1485 put_desc_to_used_list_zcp(vq, desc_idx);
1490 * Check if the frame buffer address from guest crosses
1491 * sub-region or not.
1493 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1494 RTE_LOG(ERR, VHOST_DATA,
1495 "(%"PRIu64") Frame buffer address cross "
1496 "sub-regioin found when attaching RX frame "
1497 "buffer address!\n",
1499 put_desc_to_used_list_zcp(vq, desc_idx);
1502 } while (unlikely(phys_addr == 0));
1504 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1505 if (unlikely(mbuf == NULL)) {
1506 LOG_DEBUG(VHOST_DATA,
1507 "(%"PRIu64") in attach_rxmbuf_zcp: "
1508 "ring_sc_dequeue fail.\n",
1510 put_desc_to_used_list_zcp(vq, desc_idx);
1514 if (unlikely(vpool->buf_size > desc->len)) {
1515 LOG_DEBUG(VHOST_DATA,
1516 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1517 "length(%d) of descriptor idx: %d less than room "
1518 "size required: %d\n",
1519 dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1520 put_desc_to_used_list_zcp(vq, desc_idx);
1521 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1525 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1526 mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1527 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1528 mbuf->data_len = desc->len;
1529 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1531 LOG_DEBUG(VHOST_DATA,
1532 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1533 "descriptor idx:%d\n",
1534 dev->device_fh, res_base_idx, desc_idx);
1536 __rte_mbuf_raw_free(mbuf);
 * Detach an attached packet mbuf -
1543 * - restore original mbuf address and length values.
1544 * - reset pktmbuf data and data_len to their default values.
1545 * All other fields of the given packet mbuf will be left intact.
1548 * The attached packet mbuf.
1550 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1552 const struct rte_mempool *mp = m->pool;
1553 void *buf = RTE_MBUF_TO_BADDR(m);
1555 uint32_t buf_len = mp->elt_size - sizeof(*m);
1556 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1559 m->buf_len = (uint16_t)buf_len;
1561 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1562 RTE_PKTMBUF_HEADROOM : m->buf_len;
1563 m->data_off = buf_ofs;
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
 * also updates the used index and kicks the guest if necessary.
1573 static inline uint32_t __attribute__((always_inline))
1574 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1576 struct rte_mbuf *mbuf;
1577 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1578 uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1580 uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1582 LOG_DEBUG(VHOST_DATA,
1583 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1585 dev->device_fh, mbuf_count);
1586 LOG_DEBUG(VHOST_DATA,
1587 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1589 dev->device_fh, rte_ring_count(vpool->ring));
1591 for (index = 0; index < mbuf_count; index++) {
1592 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1593 if (likely(MBUF_EXT_MEM(mbuf)))
1594 pktmbuf_detach_zcp(mbuf);
1595 rte_ring_sp_enqueue(vpool->ring, mbuf);
1597 /* Update used index buffer information. */
1598 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1599 vq->used->ring[used_idx].len = 0;
1601 used_idx = (used_idx + 1) & (vq->size - 1);
1604 LOG_DEBUG(VHOST_DATA,
1605 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1607 dev->device_fh, rte_mempool_count(vpool->pool));
1608 LOG_DEBUG(VHOST_DATA,
1609 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1611 dev->device_fh, rte_ring_count(vpool->ring));
1612 LOG_DEBUG(VHOST_DATA,
1613 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1614 "vq->last_used_idx:%d\n",
1615 dev->device_fh, vq->last_used_idx);
1617 vq->last_used_idx += mbuf_count;
1619 LOG_DEBUG(VHOST_DATA,
1620 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1621 "vq->last_used_idx:%d\n",
1622 dev->device_fh, vq->last_used_idx);
1624 rte_compiler_barrier();
1626 *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1628 /* Kick guest if required. */
1629 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1630 eventfd_write((int)vq->callfd, 1);
 * This function is called when a virtio device is destroyed.
 * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1639 static void mbuf_destroy_zcp(struct vpool *vpool)
1641 struct rte_mbuf *mbuf = NULL;
1642 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1644 LOG_DEBUG(VHOST_CONFIG,
1645 "in mbuf_destroy_zcp: mbuf count in mempool before "
1646 "mbuf_destroy_zcp is: %d\n",
1648 LOG_DEBUG(VHOST_CONFIG,
1649 "in mbuf_destroy_zcp: mbuf count in ring before "
1650 "mbuf_destroy_zcp is : %d\n",
1651 rte_ring_count(vpool->ring));
1653 for (index = 0; index < mbuf_count; index++) {
1654 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1655 if (likely(mbuf != NULL)) {
1656 if (likely(MBUF_EXT_MEM(mbuf)))
1657 pktmbuf_detach_zcp(mbuf);
1658 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1662 LOG_DEBUG(VHOST_CONFIG,
1663 "in mbuf_destroy_zcp: mbuf count in mempool after "
1664 "mbuf_destroy_zcp is: %d\n",
1665 rte_mempool_count(vpool->pool));
1666 LOG_DEBUG(VHOST_CONFIG,
1667 "in mbuf_destroy_zcp: mbuf count in ring after "
1668 "mbuf_destroy_zcp is : %d\n",
1669 rte_ring_count(vpool->ring));
 * This function updates the used ring with the received packets and kicks
 * the guest if necessary (zero copy RX).
1675 static inline uint32_t __attribute__((always_inline))
1676 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1679 struct vhost_virtqueue *vq;
1680 struct vring_desc *desc;
1681 struct rte_mbuf *buff;
1682 /* The virtio_hdr is initialised to 0. */
1683 struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1684 = {{0, 0, 0, 0, 0, 0}, 0};
1685 uint64_t buff_hdr_addr = 0;
1686 uint32_t head[MAX_PKT_BURST], packet_len = 0;
1687 uint32_t head_idx, packet_success = 0;
1688 uint16_t res_cur_idx;
1690 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1695 vq = dev->virtqueue[VIRTIO_RXQ];
1696 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1698 res_cur_idx = vq->last_used_idx;
1699 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1700 dev->device_fh, res_cur_idx, res_cur_idx + count);
1702 /* Retrieve all of the head indexes first to avoid caching issues. */
1703 for (head_idx = 0; head_idx < count; head_idx++)
1704 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1706 /*Prefetch descriptor index. */
1707 rte_prefetch0(&vq->desc[head[packet_success]]);
1709 while (packet_success != count) {
1710 /* Get descriptor from available ring */
1711 desc = &vq->desc[head[packet_success]];
1713 buff = pkts[packet_success];
1714 LOG_DEBUG(VHOST_DATA,
1715 "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1716 "pkt[%d] descriptor idx: %d\n",
1717 dev->device_fh, packet_success,
1718 MBUF_HEADROOM_UINT32(buff));
1721 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1722 + RTE_PKTMBUF_HEADROOM),
1723 rte_pktmbuf_data_len(buff), 0);
1725 /* Buffer address translation for virtio header. */
1726 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1727 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1730 * If the descriptors are chained the header and data are
1731 * placed in separate buffers.
1733 if (desc->flags & VRING_DESC_F_NEXT) {
1734 desc->len = vq->vhost_hlen;
1735 desc = &vq->desc[desc->next];
1736 desc->len = rte_pktmbuf_data_len(buff);
1738 desc->len = packet_len;
1741 /* Update used ring with desc information */
1742 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1743 = head[packet_success];
1744 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1749 /* A header is required per buffer. */
1750 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1751 (const void *)&virtio_hdr, vq->vhost_hlen);
1753 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1755 if (likely(packet_success < count)) {
1756 /* Prefetch descriptor index. */
1757 rte_prefetch0(&vq->desc[head[packet_success]]);
1761 rte_compiler_barrier();
1763 LOG_DEBUG(VHOST_DATA,
1764 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1765 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1766 dev->device_fh, vq->last_used_idx, vq->used->idx);
1768 *(volatile uint16_t *)&vq->used->idx += count;
1769 vq->last_used_idx += count;
1771 LOG_DEBUG(VHOST_DATA,
1772 "(%"PRIu64") in dev_rx_zcp: after update used idx: "
1773 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1774 dev->device_fh, vq->last_used_idx, vq->used->idx);
1776 /* Kick the guest if necessary. */
1777 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1778 eventfd_write((int)vq->callfd, 1);
1784 * This function routes the TX packet to the correct interface.
1785 * This may be a local device or the physical port.
1787 static inline void __attribute__((always_inline))
1788 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1789 uint32_t desc_idx, uint8_t need_copy)
1791 struct mbuf_table *tx_q;
1792 struct rte_mbuf **m_table;
1793 struct rte_mbuf *mbuf = NULL;
1794 unsigned len, ret, offset = 0;
1795 struct vpool *vpool;
1796 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1797 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1799 /*Add packet to the port tx queue*/
1800 tx_q = &tx_queue_zcp[vmdq_rx_q];
1803 /* Allocate an mbuf and populate the structure. */
1804 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1805 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1806 if (unlikely(mbuf == NULL)) {
1807 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1808 RTE_LOG(ERR, VHOST_DATA,
1809 "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1811 put_desc_to_used_list_zcp(vq, desc_idx);
1815 if (vm2vm_mode == VM2VM_HARDWARE) {
/* Avoid using a VLAN tag from any VM for an external packet, such as
 * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
 * selection: the MAC address identifies it as an external packet that
 * should go to the network, while the VLAN tag identifies it as a
 * VM2VM packet that should be forwarded to another VM. The hardware
 * cannot resolve such an ambiguous situation, so the packet would be lost.
 */
1823 vlan_tag = external_pkt_default_vlan_tag;
1824 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1825 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1826 __rte_mbuf_raw_free(mbuf);
1831 mbuf->nb_segs = m->nb_segs;
1832 mbuf->next = m->next;
1833 mbuf->data_len = m->data_len + offset;
1834 mbuf->pkt_len = mbuf->data_len;
1835 if (unlikely(need_copy)) {
1836 /* Copy the packet contents to the mbuf. */
1837 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1838 rte_pktmbuf_mtod(m, void *),
1841 mbuf->data_off = m->data_off;
1842 mbuf->buf_physaddr = m->buf_physaddr;
1843 mbuf->buf_addr = m->buf_addr;
1845 mbuf->ol_flags = PKT_TX_VLAN_PKT;
1846 mbuf->vlan_tci = vlan_tag;
1847 mbuf->l2_len = sizeof(struct ether_hdr);
1848 mbuf->l3_len = sizeof(struct ipv4_hdr);
1849 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1851 tx_q->m_table[len] = mbuf;
1854 LOG_DEBUG(VHOST_DATA,
1855 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1858 (mbuf->next == NULL) ? "null" : "non-null");
1861 dev_statistics[dev->device_fh].tx_total++;
1862 dev_statistics[dev->device_fh].tx++;
1865 if (unlikely(len == MAX_PKT_BURST)) {
1866 m_table = (struct rte_mbuf **)tx_q->m_table;
1867 ret = rte_eth_tx_burst(ports[0],
1868 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1871 * Free any buffers not handled by TX and update
1874 if (unlikely(ret < len)) {
1876 rte_pktmbuf_free(m_table[ret]);
1877 } while (++ret < len);
1881 txmbuf_clean_zcp(dev, vpool);
 * This function TXes all available packets in the virtio TX queue for one
 * virtio-net device. If it is the first packet, it learns the MAC address
 * and sets up the VMDQ.
1894 static inline void __attribute__((always_inline))
1895 virtio_dev_tx_zcp(struct virtio_net *dev)
1898 struct vhost_virtqueue *vq;
1899 struct vring_desc *desc;
1900 uint64_t buff_addr = 0, phys_addr;
1901 uint32_t head[MAX_PKT_BURST];
1903 uint16_t free_entries, packet_success = 0;
1905 uint8_t need_copy = 0;
1907 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1909 vq = dev->virtqueue[VIRTIO_TXQ];
1910 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1912 /* If there are no available buffers then return. */
1913 if (vq->last_used_idx_res == avail_idx)
1916 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1918 /* Prefetch available ring to retrieve head indexes. */
1919 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1921 /* Get the number of free entries in the ring */
1922 free_entries = (avail_idx - vq->last_used_idx_res);
1924 /* Limit to MAX_PKT_BURST. */
1926 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1928 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1929 dev->device_fh, free_entries);
1931 /* Retrieve all of the head indexes first to avoid caching issues. */
1932 for (i = 0; i < free_entries; i++)
1934 = vq->avail->ring[(vq->last_used_idx_res + i)
1937 vq->last_used_idx_res += free_entries;
1939 /* Prefetch descriptor index. */
1940 rte_prefetch0(&vq->desc[head[packet_success]]);
1941 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1943 while (packet_success < free_entries) {
1944 desc = &vq->desc[head[packet_success]];
1946 /* Discard first buffer as it is the virtio header */
1947 desc = &vq->desc[desc->next];
1949 /* Buffer address translation. */
1950 buff_addr = gpa_to_vva(dev, desc->addr);
1951 /* Need check extra VLAN_HLEN size for inserting VLAN tag */
1952 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1955 if (likely(packet_success < (free_entries - 1)))
1956 /* Prefetch descriptor index. */
1957 rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1959 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1960 RTE_LOG(ERR, VHOST_DATA,
1961 "(%"PRIu64") Invalid frame buffer address found"
1962 "when TX packets!\n",
1968 /* Prefetch buffer address. */
1969 rte_prefetch0((void *)(uintptr_t)buff_addr);
1972 * Setup dummy mbuf. This is copied to a real mbuf if
1973 * transmitted out the physical port.
1975 m.data_len = desc->len;
1979 m.buf_addr = (void *)(uintptr_t)buff_addr;
1980 m.buf_physaddr = phys_addr;
1983 * Check if the frame buffer address from guest crosses
1984 * sub-region or not.
1986 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1987 RTE_LOG(ERR, VHOST_DATA,
1988 "(%"PRIu64") Frame buffer address cross "
1989 "sub-regioin found when attaching TX frame "
1990 "buffer address!\n",
1996 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1999 * If this is the first received packet we need to learn
2000 * the MAC and setup VMDQ
2002 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2003 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2005 * Discard frame if device is scheduled for
2006 * removal or a duplicate MAC address is found.
2008 packet_success += free_entries;
2009 vq->last_used_idx += packet_success;
2014 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2020 * This function is called by each data core. It handles all RX/TX registered
2021 * with the core. For TX the specific lcore linked list is used. For RX, MAC
2022 * addresses are compared with all devices in the main linked list.
2025 switch_worker_zcp(__attribute__((unused)) void *arg)
2027 struct virtio_net *dev = NULL;
2028 struct vhost_dev *vdev = NULL;
2029 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2030 struct virtio_net_data_ll *dev_ll;
2031 struct mbuf_table *tx_q;
2032 volatile struct lcore_ll_info *lcore_ll;
2033 const uint64_t drain_tsc
2034 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2035 * BURST_TX_DRAIN_US;
2036 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2038 const uint16_t lcore_id = rte_lcore_id();
2039 uint16_t count_in_ring, rx_count = 0;
RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2043 lcore_ll = lcore_info[lcore_id].lcore_ll;
2047 cur_tsc = rte_rdtsc();
2049 /* TX burst queue drain */
2050 diff_tsc = cur_tsc - prev_tsc;
2051 if (unlikely(diff_tsc > drain_tsc)) {
2053 * Get mbuf from vpool.pool and detach mbuf and
2054 * put back into vpool.ring.
2056 dev_ll = lcore_ll->ll_root_used;
2057 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2058 /* Get virtio device ID */
2059 vdev = dev_ll->vdev;
2062 if (likely(!vdev->remove)) {
2063 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2065 LOG_DEBUG(VHOST_DATA,
2066 "TX queue drained after timeout"
2067 " with burst size %u\n",
2071 * Tx any packets in the queue
2073 ret = rte_eth_tx_burst(
2075 (uint16_t)tx_q->txq_id,
2076 (struct rte_mbuf **)
2078 (uint16_t)tx_q->len);
2079 if (unlikely(ret < tx_q->len)) {
2082 tx_q->m_table[ret]);
2083 } while (++ret < tx_q->len);
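/*
 * Sketch (illustrative, not part of the original source) of the usual
 * DPDK transmit-and-free pattern followed here; port, txq, m_table and
 * len stand in for ports[0] and the tx_q fields:
 *
 *     ret = rte_eth_tx_burst(port, txq, m_table, len);
 *     if (unlikely(ret < len)) {
 *         do {
 *             rte_pktmbuf_free(m_table[ret]);
 *         } while (++ret < len);
 *     }
 */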
2087 txmbuf_clean_zcp(dev,
2088 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2091 dev_ll = dev_ll->next;
2096 rte_prefetch0(lcore_ll->ll_root_used);
2099 * Inform the configuration core that we have exited the linked
2100 * list and that no devices are in use if requested.
2102 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2103 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
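/*
 * Illustrative note (assumption, based on destroy_device() further
 * below): the configuration core requests removal by setting
 * dev_removal_flag to REQUEST_DEV_REMOVAL and then spins with
 *
 *     while (lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL)
 *         rte_pause();
 *
 * Acknowledging here, outside the device loop, guarantees this worker
 * no longer holds a pointer into the list being modified.
 */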
2105 /* Process devices */
2106 dev_ll = lcore_ll->ll_root_used;
2108 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2109 vdev = dev_ll->vdev;
2111 if (unlikely(vdev->remove)) {
2112 dev_ll = dev_ll->next;
2114 vdev->ready = DEVICE_SAFE_REMOVE;
2118 if (likely(vdev->ready == DEVICE_RX)) {
2119 uint32_t index = vdev->vmdq_rx_q;
2122 = rte_ring_count(vpool_array[index].ring);
2123 uint16_t free_entries
2124 = (uint16_t)get_available_ring_num_zcp(dev);
2127 * Attach all mbufs in vpool.ring and put back
2131 i < RTE_MIN(free_entries,
2132 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2134 attach_rxmbuf_zcp(dev);
2136 /* Handle guest RX */
2137 rx_count = rte_eth_rx_burst(ports[0],
2138 vdev->vmdq_rx_q, pkts_burst,
2142 ret_count = virtio_dev_rx_zcp(dev,
2143 pkts_burst, rx_count);
2145 dev_statistics[dev->device_fh].rx_total
2147 dev_statistics[dev->device_fh].rx
2150 while (likely(rx_count)) {
2153 pkts_burst[rx_count]);
2154 rte_ring_sp_enqueue(
2155 vpool_array[index].ring,
2156 (void *)pkts_burst[rx_count]);
2161 if (likely(!vdev->remove))
2162 /* Handle guest TX */
2163 virtio_dev_tx_zcp(dev);
2165 /* Move to the next device in the list */
2166 dev_ll = dev_ll->next;
2175 * Add an entry to a used linked list. A free entry must first be found
2176 * in the free linked list using get_data_ll_free_entry();
2179 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2180 struct virtio_net_data_ll *ll_dev)
2182 struct virtio_net_data_ll *ll = *ll_root_addr;
2184 /* Set next as NULL and use a compiler barrier to avoid reordering. */
2185 ll_dev->next = NULL;
2186 rte_compiler_barrier();
2188 /* If ll == NULL then this is the first device. */
2190 /* Increment to the tail of the linked list. */
2191 while (ll->next != NULL)
2196 *ll_root_addr = ll_dev;
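/*
 * Usage sketch (illustrative, mirroring new_device() further below):
 *
 *     struct virtio_net_data_ll *ll_dev;
 *
 *     ll_dev = get_data_ll_free_entry(&ll_root_free);
 *     if (ll_dev != NULL) {
 *         ll_dev->vdev = vdev;
 *         add_data_ll_entry(&ll_root_used, ll_dev);
 *     }
 */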
2201 * Remove an entry from a used linked list. The entry must then be added to
2202 * the free linked list using put_data_ll_free_entry().
2205 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2206 struct virtio_net_data_ll *ll_dev,
2207 struct virtio_net_data_ll *ll_dev_last)
2209 struct virtio_net_data_ll *ll = *ll_root_addr;
2211 if (unlikely((ll == NULL) || (ll_dev == NULL)))
2215 *ll_root_addr = ll_dev->next;
2217 if (likely(ll_dev_last != NULL))
2218 ll_dev_last->next = ll_dev->next;
2220 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2224 * Find and return an entry from the free linked list.
2226 static struct virtio_net_data_ll *
2227 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2229 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2230 struct virtio_net_data_ll *ll_dev;
2232 if (ll_free == NULL)
2236 *ll_root_addr = ll_free->next;
2242 * Place an entry back on to the free linked list.
2245 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2246 struct virtio_net_data_ll *ll_dev)
2248 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2253 ll_dev->next = ll_free;
2254 *ll_root_addr = ll_dev;
2258 * Creates a linked list of a given size.
2260 static struct virtio_net_data_ll *
2261 alloc_data_ll(uint32_t size)
2263 struct virtio_net_data_ll *ll_new;
2266 /* Malloc and then chain the linked list. */
2267 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2268 if (ll_new == NULL) {
2269 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2273 for (i = 0; i < size - 1; i++) {
2274 ll_new[i].vdev = NULL;
2275 ll_new[i].next = &ll_new[i+1];
2277 ll_new[i].next = NULL;
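/*
 * Example (illustrative): alloc_data_ll(3) returns one malloc'd array of
 * three entries chained 0 -> 1 -> 2 -> NULL, each with vdev set to NULL.
 */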
2283 * Create the main linked list along with each individual core's linked list. A used and a free list
2284 * are created to manage entries.
2291 RTE_LCORE_FOREACH_SLAVE(lcore) {
2292 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2293 if (lcore_info[lcore].lcore_ll == NULL) {
2294 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2298 lcore_info[lcore].lcore_ll->device_num = 0;
2299 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2300 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2301 if (num_devices % num_switching_cores)
2302 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2304 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2307 /* Allocate devices up to a maximum of MAX_DEVICES. */
2308 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
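/*
 * Sizing example (illustrative): with num_devices = 8 and
 * num_switching_cores = 3, 8 % 3 != 0, so each core's free list is
 * alloc_data_ll(8 / 3 + 1) = alloc_data_ll(3); the main free list gets
 * MIN(8, MAX_DEVICES) entries.
 */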
2314 * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2315 * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2316 * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
2319 destroy_device (volatile struct virtio_net *dev)
2321 struct virtio_net_data_ll *ll_lcore_dev_cur;
2322 struct virtio_net_data_ll *ll_main_dev_cur;
2323 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2324 struct virtio_net_data_ll *ll_main_dev_last = NULL;
2325 struct vhost_dev *vdev;
2328 dev->flags &= ~VIRTIO_DEV_RUNNING;
2330 vdev = (struct vhost_dev *)dev->priv;
2331 /* Set the remove flag. */
2333 while(vdev->ready != DEVICE_SAFE_REMOVE) {
2337 /* Search for entry to be removed from lcore ll */
2338 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2339 while (ll_lcore_dev_cur != NULL) {
2340 if (ll_lcore_dev_cur->vdev == vdev) {
2343 ll_lcore_dev_last = ll_lcore_dev_cur;
2344 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2348 if (ll_lcore_dev_cur == NULL) {
2349 RTE_LOG(ERR, VHOST_CONFIG,
2350 "(%"PRIu64") Failed to find the dev to be destroy.\n",
2355 /* Search for entry to be removed from main ll */
2356 ll_main_dev_cur = ll_root_used;
2357 ll_main_dev_last = NULL;
2358 while (ll_main_dev_cur != NULL) {
2359 if (ll_main_dev_cur->vdev == vdev) {
2362 ll_main_dev_last = ll_main_dev_cur;
2363 ll_main_dev_cur = ll_main_dev_cur->next;
2367 /* Remove entries from the lcore and main ll. */
2368 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2369 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2371 /* Set the dev_removal_flag on each lcore. */
2372 RTE_LCORE_FOREACH_SLAVE(lcore) {
2373 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2377 * Once each core has set its dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2378 * it can no longer access the device removed from the linked lists and that the device
2379 * is no longer in use.
2381 RTE_LCORE_FOREACH_SLAVE(lcore) {
2382 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2387 /* Add the entries back to the lcore and main free ll.*/
2388 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2389 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2391 /* Decrement the number of devices on the lcore. */
2392 lcore_info[vdev->coreid].lcore_ll->device_num--;
2394 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2397 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2399 /* Stop the RX queue. */
2400 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2401 LOG_DEBUG(VHOST_CONFIG,
2402 "(%"PRIu64") In destroy_device: Failed to stop "
2408 LOG_DEBUG(VHOST_CONFIG,
2409 "(%"PRIu64") in destroy_device: Start put mbuf in "
2410 "mempool back to ring for RX queue: %d\n",
2411 dev->device_fh, vdev->vmdq_rx_q);
2413 mbuf_destroy_zcp(vpool);
2415 /* Stop the TX queue. */
2416 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2417 LOG_DEBUG(VHOST_CONFIG,
2418 "(%"PRIu64") In destroy_device: Failed to "
2419 "stop tx queue:%d\n",
2420 dev->device_fh, vdev->vmdq_rx_q);
2423 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2425 LOG_DEBUG(VHOST_CONFIG,
2426 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2427 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2428 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2431 mbuf_destroy_zcp(vpool);
2432 rte_free(vdev->regions_hpa);
2439 * Calculate the count of physically contiguous regions for one particular
2440 * region whose vhost virtual address range is contiguous. The region
2441 * starts at vva_start, with a size of 'size' bytes.
2444 check_hpa_regions(uint64_t vva_start, uint64_t size)
2446 uint32_t i, nregions = 0, page_size = getpagesize();
2447 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2448 if (vva_start % page_size) {
2449 LOG_DEBUG(VHOST_CONFIG,
2450 "in check_countinous: vva start(%p) mod page_size(%d) "
2452 (void *)(uintptr_t)vva_start, page_size);
2455 if (size % page_size) {
2456 LOG_DEBUG(VHOST_CONFIG,
2457 "in check_countinous: "
2458 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2462 for (i = 0; i < size - page_size; i = i + page_size) {
2464 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2465 next_phys_addr = rte_mem_virt2phy(
2466 (void *)(uintptr_t)(vva_start + i + page_size));
2467 if ((cur_phys_addr + page_size) != next_phys_addr) {
2469 LOG_DEBUG(VHOST_CONFIG,
2470 "in check_continuous: hva addr:(%p) is not "
2471 "continuous with hva addr:(%p), diff:%d\n",
2472 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2473 (void *)(uintptr_t)(vva_start + (uint64_t)i
2474 + page_size), page_size);
2475 LOG_DEBUG(VHOST_CONFIG,
2476 "in check_continuous: hpa addr:(%p) is not "
2477 "continuous with hpa addr:(%p), "
2478 "diff:(%"PRIu64")\n",
2479 (void *)(uintptr_t)cur_phys_addr,
2480 (void *)(uintptr_t)next_phys_addr,
2481 (next_phys_addr-cur_phys_addr));
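/*
 * Worked example (illustrative): for a virtually contiguous 16 KB range
 * backed by 4 KB pages in which the third page is not physically
 * adjacent to the second, the loop above finds one discontinuity, so one
 * extra host-physical sub-region is needed to describe the range.
 */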
2488 * Divide each region whose vhost virtual address is contiguous into a few
2489 * sub-regions, making sure the physical addresses within each sub-region are
2490 * contiguous, and fill the offset (to GPA), size and other information of each
2491 * sub-region into regions_hpa.
2494 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2496 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2497 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2499 if (mem_region_hpa == NULL)
2502 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2503 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2504 virtio_memory->regions[regionidx].address_offset;
2505 mem_region_hpa[regionidx_hpa].guest_phys_address
2506 = virtio_memory->regions[regionidx].guest_phys_address;
2507 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2508 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2509 mem_region_hpa[regionidx_hpa].guest_phys_address;
2510 LOG_DEBUG(VHOST_CONFIG,
2511 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2514 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2515 LOG_DEBUG(VHOST_CONFIG,
2516 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2519 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2521 i < virtio_memory->regions[regionidx].memory_size -
2524 cur_phys_addr = rte_mem_virt2phy(
2525 (void *)(uintptr_t)(vva_start + i));
2526 next_phys_addr = rte_mem_virt2phy(
2527 (void *)(uintptr_t)(vva_start +
2529 if ((cur_phys_addr + page_size) != next_phys_addr) {
2530 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2531 mem_region_hpa[regionidx_hpa].guest_phys_address +
2533 mem_region_hpa[regionidx_hpa].memory_size
2535 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2536 "phys addr end [%d]:(%p)\n",
2539 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2540 LOG_DEBUG(VHOST_CONFIG,
2541 "in fill_hpa_regions: guest phys addr "
2545 (mem_region_hpa[regionidx_hpa].memory_size));
2546 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2547 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2549 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2551 mem_region_hpa[regionidx_hpa].guest_phys_address;
2552 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2553 " phys addr start[%d]:(%p)\n",
2556 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2557 LOG_DEBUG(VHOST_CONFIG,
2558 "in fill_hpa_regions: host phys addr "
2562 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2568 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2569 = mem_region_hpa[regionidx_hpa].guest_phys_address
2571 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2572 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2573 "[%d]:(%p)\n", regionidx_hpa,
2575 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2576 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2577 "[%d]:(%p)\n", regionidx_hpa,
2579 (mem_region_hpa[regionidx_hpa].memory_size));
2582 return regionidx_hpa;
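/*
 * Note (illustrative): the caller, new_device() below, pre-computes the
 * expected sub-region count with check_hpa_regions() and treats any
 * mismatch with this return value as an error.
 */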
2586 * A new device is added to a data core. First the device is added to the main linked list
2587 * and then allocated to a specific data core.
2590 new_device (struct virtio_net *dev)
2592 struct virtio_net_data_ll *ll_dev;
2593 int lcore, core_add = 0;
2594 uint32_t device_num_min = num_devices;
2595 struct vhost_dev *vdev;
2598 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2600 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2608 vdev->nregions_hpa = dev->mem->nregions;
2609 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2611 += check_hpa_regions(
2612 dev->mem->regions[regionidx].guest_phys_address
2613 + dev->mem->regions[regionidx].address_offset,
2614 dev->mem->regions[regionidx].memory_size);
2618 vdev->regions_hpa = rte_calloc("vhost hpa region",
2620 sizeof(struct virtio_memory_regions_hpa),
2621 RTE_CACHE_LINE_SIZE);
2622 if (vdev->regions_hpa == NULL) {
2623 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2629 if (fill_hpa_memory_regions(
2630 vdev->regions_hpa, dev->mem
2631 ) != vdev->nregions_hpa) {
2633 RTE_LOG(ERR, VHOST_CONFIG,
2634 "hpa memory regions number mismatch: "
2635 "[%d]\n", vdev->nregions_hpa);
2636 rte_free(vdev->regions_hpa);
2643 /* Add device to main ll */
2644 ll_dev = get_data_ll_free_entry(&ll_root_free);
2645 if (ll_dev == NULL) {
2646 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2647 "of %d devices per core has been reached\n",
2648 dev->device_fh, num_devices);
2649 if (vdev->regions_hpa)
2650 rte_free(vdev->regions_hpa);
2654 ll_dev->vdev = vdev;
2655 add_data_ll_entry(&ll_root_used, ll_dev);
2657 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2660 uint32_t index = vdev->vmdq_rx_q;
2661 uint32_t count_in_ring, i;
2662 struct mbuf_table *tx_q;
2664 count_in_ring = rte_ring_count(vpool_array[index].ring);
2666 LOG_DEBUG(VHOST_CONFIG,
2667 "(%"PRIu64") in new_device: mbuf count in mempool "
2668 "before attach is: %d\n",
2670 rte_mempool_count(vpool_array[index].pool));
2671 LOG_DEBUG(VHOST_CONFIG,
2672 "(%"PRIu64") in new_device: mbuf count in ring "
2673 "before attach is : %d\n",
2674 dev->device_fh, count_in_ring);
2677 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2679 for (i = 0; i < count_in_ring; i++)
2680 attach_rxmbuf_zcp(dev);
2682 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2683 "mempool after attach is: %d\n",
2685 rte_mempool_count(vpool_array[index].pool));
2686 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2687 "ring after attach is : %d\n",
2689 rte_ring_count(vpool_array[index].ring));
2691 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2692 tx_q->txq_id = vdev->vmdq_rx_q;
2694 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2695 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2697 LOG_DEBUG(VHOST_CONFIG,
2698 "(%"PRIu64") In new_device: Failed to start "
2700 dev->device_fh, vdev->vmdq_rx_q);
2702 mbuf_destroy_zcp(vpool);
2703 rte_free(vdev->regions_hpa);
2708 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2709 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2711 LOG_DEBUG(VHOST_CONFIG,
2712 "(%"PRIu64") In new_device: Failed to start "
2714 dev->device_fh, vdev->vmdq_rx_q);
2716 /* Stop the TX queue. */
2717 if (rte_eth_dev_tx_queue_stop(ports[0],
2718 vdev->vmdq_rx_q) != 0) {
2719 LOG_DEBUG(VHOST_CONFIG,
2720 "(%"PRIu64") In new_device: Failed to "
2721 "stop tx queue:%d\n",
2722 dev->device_fh, vdev->vmdq_rx_q);
2725 mbuf_destroy_zcp(vpool);
2726 rte_free(vdev->regions_hpa);
2733 /*reset ready flag*/
2734 vdev->ready = DEVICE_MAC_LEARNING;
2737 /* Find a suitable lcore to add the device. */
2738 RTE_LCORE_FOREACH_SLAVE(lcore) {
2739 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2740 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2744 /* Add device to lcore ll */
2745 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2746 if (ll_dev == NULL) {
2747 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2748 vdev->ready = DEVICE_SAFE_REMOVE;
2749 destroy_device(dev);
2750 if (vdev->regions_hpa)
2751 rte_free(vdev->regions_hpa);
2755 ll_dev->vdev = vdev;
2756 vdev->coreid = core_add;
2758 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2760 /* Initialize device stats */
2761 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2763 /* Disable notifications. */
2764 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2765 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2766 lcore_info[vdev->coreid].lcore_ll->device_num++;
2767 dev->flags |= VIRTIO_DEV_RUNNING;
2769 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2775 * These callbacks allow devices to be added to the data core when configuration
2776 * has been fully completed.
2778 static const struct virtio_net_device_ops virtio_net_device_ops =
2780 .new_device = new_device,
2781 .destroy_device = destroy_device,
2785 * This is a thread that will wake up after a period to print stats if the user has
2791 struct virtio_net_data_ll *dev_ll;
2792 uint64_t tx_dropped, rx_dropped;
2793 uint64_t tx, tx_total, rx, rx_total;
2795 const char clr[] = { 27, '[', '2', 'J', '\0' };
2796 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
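/*
 * Note: 27 is the ASCII ESC character, so clr is the ANSI escape
 * sequence "ESC[2J" (clear the screen) and top_left is "ESC[1;1H"
 * (move the cursor to row 1, column 1).
 */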
2799 sleep(enable_stats);
2801 /* Clear screen and move to top left */
2802 printf("%s%s", clr, top_left);
2804 printf("\nDevice statistics ====================================");
2806 dev_ll = ll_root_used;
2807 while (dev_ll != NULL) {
2808 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2809 tx_total = dev_statistics[device_fh].tx_total;
2810 tx = dev_statistics[device_fh].tx;
2811 tx_dropped = tx_total - tx;
2812 if (zero_copy == 0) {
2813 rx_total = rte_atomic64_read(
2814 &dev_statistics[device_fh].rx_total_atomic);
2815 rx = rte_atomic64_read(
2816 &dev_statistics[device_fh].rx_atomic);
2818 rx_total = dev_statistics[device_fh].rx_total;
2819 rx = dev_statistics[device_fh].rx;
2821 rx_dropped = rx_total - rx;
2823 printf("\nStatistics for device %"PRIu32" ------------------------------"
2824 "\nTX total: %"PRIu64""
2825 "\nTX dropped: %"PRIu64""
2826 "\nTX successful: %"PRIu64""
2827 "\nRX total: %"PRIu64""
2828 "\nRX dropped: %"PRIu64""
2829 "\nRX successful: %"PRIu64"",
2838 dev_ll = dev_ll->next;
2840 printf("\n======================================================\n");
2845 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2846 char *ring_name, uint32_t nb_mbuf)
2848 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2849 vpool_array[index].pool
2850 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2851 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2852 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2853 rte_pktmbuf_init, NULL, socket, 0);
2854 if (vpool_array[index].pool != NULL) {
2855 vpool_array[index].ring
2856 = rte_ring_create(ring_name,
2857 rte_align32pow2(nb_mbuf + 1),
2858 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2859 if (likely(vpool_array[index].ring != NULL)) {
2860 LOG_DEBUG(VHOST_CONFIG,
2861 "in setup_mempool_tbl: mbuf count in "
2863 rte_mempool_count(vpool_array[index].pool));
2864 LOG_DEBUG(VHOST_CONFIG,
2865 "in setup_mempool_tbl: mbuf count in "
2867 rte_ring_count(vpool_array[index].ring));
2869 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2873 /* Need to account for the headroom. */
2874 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2876 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
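/*
 * Usage sketch (illustrative, mirroring the zero-copy setup in main()
 * below):
 *
 *     char pool_name[RTE_MEMPOOL_NAMESIZE];
 *     char ring_name[RTE_MEMPOOL_NAMESIZE];
 *
 *     snprintf(pool_name, sizeof(pool_name), "rxmbuf_pool_%u", queue_id);
 *     snprintf(ring_name, sizeof(ring_name), "rxmbuf_ring_%u", queue_id);
 *     setup_mempool_tbl(rte_socket_id(), queue_id, pool_name, ring_name,
 *                       nb_mbuf);
 */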
2882 * Main function, does initialisation and calls the per-lcore functions. The CUSE
2883 * device is also registered here to handle the IOCTLs.
2886 main(int argc, char *argv[])
2888 struct rte_mempool *mbuf_pool = NULL;
2889 unsigned lcore_id, core_id = 0;
2890 unsigned nb_ports, valid_num_ports;
2894 static pthread_t tid;
2897 ret = rte_eal_init(argc, argv);
2899 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2903 /* parse app arguments */
2904 ret = us_vhost_parse_args(argc, argv);
2906 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2908 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2909 if (rte_lcore_is_enabled(lcore_id))
2910 lcore_ids[core_id++] = lcore_id;
2912 if (rte_lcore_count() > RTE_MAX_LCORE)
2913 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2915 /* Set the number of switching cores available. */
2916 num_switching_cores = rte_lcore_count()-1;
2918 /* Get the number of physical ports. */
2919 nb_ports = rte_eth_dev_count();
2920 if (nb_ports > RTE_MAX_ETHPORTS)
2921 nb_ports = RTE_MAX_ETHPORTS;
2924 * Update the global variable num_ports and the global array ports,
2925 * and get the value of valid_num_ports according to the number of ports in the system.
2927 valid_num_ports = check_ports_num(nb_ports);
2929 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2930 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2931 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2935 if (zero_copy == 0) {
2936 /* Create the mbuf pool. */
2937 mbuf_pool = rte_mempool_create(
2941 MBUF_SIZE, MBUF_CACHE_SIZE,
2942 sizeof(struct rte_pktmbuf_pool_private),
2943 rte_pktmbuf_pool_init, NULL,
2944 rte_pktmbuf_init, NULL,
2945 rte_socket_id(), 0);
2946 if (mbuf_pool == NULL)
2947 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2949 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2950 vpool_array[queue_id].pool = mbuf_pool;
2952 if (vm2vm_mode == VM2VM_HARDWARE) {
2953 /* Enable VT loopback to let the L2 switch do the VM-to-VM forwarding. */
2954 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2955 LOG_DEBUG(VHOST_CONFIG,
2956 "Enable loop back for L2 switch in vmdq.\n");
2960 char pool_name[RTE_MEMPOOL_NAMESIZE];
2961 char ring_name[RTE_MEMPOOL_NAMESIZE];
2963 nb_mbuf = num_rx_descriptor
2964 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2965 + num_switching_cores * MAX_PKT_BURST;
2967 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2968 snprintf(pool_name, sizeof(pool_name),
2969 "rxmbuf_pool_%u", queue_id);
2970 snprintf(ring_name, sizeof(ring_name),
2971 "rxmbuf_ring_%u", queue_id);
2972 setup_mempool_tbl(rte_socket_id(), queue_id,
2973 pool_name, ring_name, nb_mbuf);
2976 nb_mbuf = num_tx_descriptor
2977 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2978 + num_switching_cores * MAX_PKT_BURST;
2980 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2981 snprintf(pool_name, sizeof(pool_name),
2982 "txmbuf_pool_%u", queue_id);
2983 snprintf(ring_name, sizeof(ring_name),
2984 "txmbuf_ring_%u", queue_id);
2985 setup_mempool_tbl(rte_socket_id(),
2986 (queue_id + MAX_QUEUES),
2987 pool_name, ring_name, nb_mbuf);
2990 if (vm2vm_mode == VM2VM_HARDWARE) {
2991 /* Enable VT loopback to let the L2 switch do the VM-to-VM forwarding. */
2992 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2993 LOG_DEBUG(VHOST_CONFIG,
2994 "Enable loop back for L2 switch in vmdq.\n");
2997 /* Set log level. */
2998 rte_set_log_level(LOG_LEVEL);
3000 /* initialize all ports */
3001 for (portid = 0; portid < nb_ports; portid++) {
3002 /* skip ports that are not enabled */
3003 if ((enabled_port_mask & (1 << portid)) == 0) {
3004 RTE_LOG(INFO, VHOST_PORT,
3005 "Skipping disabled port %d\n", portid);
3008 if (port_init(portid) != 0)
3009 rte_exit(EXIT_FAILURE,
3010 "Cannot initialize network ports\n");
3013 /* Initialise all linked lists. */
3014 if (init_data_ll() == -1)
3015 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3017 /* Initialize device stats */
3018 memset(&dev_statistics, 0, sizeof(dev_statistics));
3020 /* Enable stats if the user option is set. */
3022 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3024 /* Launch all data cores. */
3025 if (zero_copy == 0) {
3026 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3027 rte_eal_remote_launch(switch_worker,
3028 mbuf_pool, lcore_id);
3031 uint32_t count_in_mempool, index, i;
3032 for (index = 0; index < 2*MAX_QUEUES; index++) {
3033 /* For all RX and TX queues. */
3035 = rte_mempool_count(vpool_array[index].pool);
3038 * Transfer all un-attached mbufs from vpool.pool
3041 for (i = 0; i < count_in_mempool; i++) {
3042 struct rte_mbuf *mbuf
3043 = __rte_mbuf_raw_alloc(
3044 vpool_array[index].pool);
3045 rte_ring_sp_enqueue(vpool_array[index].ring,
3049 LOG_DEBUG(VHOST_CONFIG,
3050 "in main: mbuf count in mempool at initial "
3051 "is: %d\n", count_in_mempool);
3052 LOG_DEBUG(VHOST_CONFIG,
3053 "in main: mbuf count in ring at initial is :"
3055 rte_ring_count(vpool_array[index].ring));
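/*
 * Note (illustrative): in zero-copy mode all mbufs are parked, detached,
 * on vpool.ring at start-up; when a device comes up, attach_rxmbuf_zcp()
 * moves them back to vpool.pool, presumably with guest-provided buffers
 * attached (see the attach logging in new_device() above).
 */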
3058 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3059 rte_eal_remote_launch(switch_worker_zcp, NULL,
3064 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3066 /* Register CUSE device to handle IOCTLs. */
3067 ret = rte_vhost_driver_register((char *)&dev_basename);
3069 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3071 rte_vhost_driver_callback_register(&virtio_net_device_ops);
3073 /* Start CUSE session. */
3074 rte_vhost_driver_session_start();