4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <arpa/inet.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
/*
 * Compile-time constants for the vhost switch example.
 * NOTE(review): this file is a corrupted extraction — original line numbers
 * are fused into the text and interior lines are missing; code text is kept
 * byte-identical, only comments are added.
 */
56 #define MAX_QUEUES 512
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
62 * Calculate the number of buffers needed per port
/* Macro references num_switching_cores / MBUF_CACHE_SIZE defined below —
 * legal for macros since expansion is deferred to the use site. */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
65 (num_switching_cores*MAX_PKT_BURST) + \
66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 (num_switching_cores*MBUF_CACHE_SIZE))
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
73 * No frame data buffer allocated from host are required for zero copy
74 * implementation, guest will allocate the frame data buffer, and vhost
/* Zero-copy mbufs only need room for a standard Ethernet frame (1518B). */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
82 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
85 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
88 #define JUMBO_FRAME_MAX_SIZE 0x2600
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
/* NOTE(review): DEVICE_RX (presumably value 1) is missing from this
 * extraction; it is referenced later in link_vmdq()/unlink_vmdq(). */
93 #define DEVICE_SAFE_REMOVE 2
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
104 * Need refine these 2 macros for legacy and DPDK based front end:
105 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST
106 * And then adjust power 2.
109 * For legacy front end, 128 descriptors,
110 * half for virtio header, another half for mbuf.
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117 + sizeof(struct rte_mbuf)))
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
122 #define INVALID_PORT_ID 0xFF
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
136 /* Used to compare MAC addresses. */
/* Low 48 bits set: masks a uint64_t load down to the 6 MAC-address bytes. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
/* true when the mbuf's data buffer is external (not contiguous with it). */
142 #define MBUF_EXT_MEM(mb) (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
/*
 * Run-time configuration state, mostly populated from command-line options
 * in us_vhost_parse_args().
 */
144 /* mask of enabled ports */
145 static uint32_t enabled_port_mask = 0;
147 /* Promiscuous mode */
148 static uint32_t promiscuous;
150 /*Number of switching cores enabled*/
151 static uint32_t num_switching_cores = 0;
153 /* number of devices/queues to support*/
154 static uint32_t num_queues = 0;
155 static uint32_t num_devices;
158 * Enable zero copy, pkts buffer will directly dma to hw descriptor,
159 * disabled on default.
161 static uint32_t zero_copy;
162 static int mergeable;
164 /* Do vlan strip on host, enabled on default */
165 static uint32_t vlan_strip = 1;
167 /* number of descriptors to apply*/
/* Defaults are the zero-copy values; only used when zero_copy is set
 * (see port_init), overridable via --rx-desc-num / --tx-desc-num. */
168 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
169 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
171 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
172 #define MAX_RING_DESC 4096
/* Per-queue mempool/ring pair used for zero-copy buffers (struct header
 * line is missing from this extraction). */
175 struct rte_mempool *pool;
176 struct rte_ring *ring;
178 } vpool_array[MAX_QUEUES+MAX_QUEUES];
180 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
187 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
189 /* The type of host physical address translated from guest physical address. */
/* hpa_type enum values — used by gpa_to_hpa() below (enum header missing). */
191 PHYS_ADDR_CONTINUOUS = 0,
192 PHYS_ADDR_CROSS_SUBREG = 1,
193 PHYS_ADDR_INVALID = 2,
198 static uint32_t enable_stats = 0;
199 /* Enable retries on RX. */
200 static uint32_t enable_retry = 1;
201 /* Specify timeout (in useconds) between retries on RX. */
202 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
203 /* Specify the number of retries on RX. */
204 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
206 /* Character device basename. Can be set by user. */
207 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
209 /* empty vmdq configuration structure. Filled in programmatically */
210 static struct rte_eth_conf vmdq_conf_default = {
212 .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
214 .header_split = 0, /**< Header Split disabled */
215 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
216 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
218 * It is necessary for 1G NIC such as I350,
219 * this fixes bug of ipv4 forwarding in guest can't
220 * forward packets from one virtio dev to another virtio dev.
222 .hw_vlan_strip = 1, /**< VLAN strip enabled. */
223 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */
224 .hw_strip_crc = 0, /**< CRC stripped by hardware */
228 .mq_mode = ETH_MQ_TX_NONE,
232 * should be overridden separately in code with
/* Placeholder VMDQ pool settings; get_eth_conf() rewrites nb_queue_pools,
 * nb_pool_maps and pool_map per the real device count. */
236 .nb_queue_pools = ETH_8_POOLS,
237 .enable_default_pool = 0,
240 .pool_map = {{0, 0},},
/* Port/queue bookkeeping filled in by port_init() and parse_args(). */
245 static unsigned lcore_ids[RTE_MAX_LCORE];
246 static uint8_t ports[RTE_MAX_ETHPORTS];
247 static unsigned num_ports = 0; /**< The number of ports specified in command line */
248 static uint16_t num_pf_queues, num_vmdq_queues;
249 static uint16_t vmdq_pool_base, vmdq_queue_base;
250 static uint16_t queues_per_pool;
252 static const uint16_t external_pkt_default_vlan_tag = 2000;
/* One VLAN tag per VMDQ pool; indexed by device_fh in link_vmdq(). */
253 const uint16_t vlan_tags[] = {
254 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
255 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
256 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
257 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
258 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
259 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
260 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
261 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
264 /* ethernet addresses of ports */
265 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
267 /* heads for the main used and free linked lists for the data path. */
268 static struct virtio_net_data_ll *ll_root_used = NULL;
269 static struct virtio_net_data_ll *ll_root_free = NULL;
271 /* Array of data core structures containing information on individual core linked lists. */
272 static struct lcore_info lcore_info[RTE_MAX_LCORE];
274 /* Used for queueing bursts of TX packets. */
/* mbuf_table struct — header/len/txq_id fields missing from extraction. */
278 struct rte_mbuf *m_table[MAX_PKT_BURST];
281 /* TX queue for each data core. */
282 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
284 /* TX queue for each virtio device for zero copy. */
285 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
287 /* Vlan header struct used to insert vlan tags on TX. */
289 unsigned char h_dest[ETH_ALEN];
290 unsigned char h_source[ETH_ALEN];
293 __be16 h_vlan_encapsulated_proto;
/* Local IPv4 header definition (packed so it overlays raw packet bytes). */
298 uint8_t version_ihl; /**< version and header length */
299 uint8_t type_of_service; /**< type of service */
300 uint16_t total_length; /**< length of packet */
301 uint16_t packet_id; /**< packet ID */
302 uint16_t fragment_offset; /**< fragmentation offset */
303 uint8_t time_to_live; /**< time to live */
304 uint8_t next_proto_id; /**< protocol ID */
305 uint16_t hdr_checksum; /**< header checksum */
306 uint32_t src_addr; /**< source address */
307 uint32_t dst_addr; /**< destination address */
308 } __attribute__((__packed__));
310 /* Header lengths. */
312 #define VLAN_ETH_HLEN 18
314 /* Per-device statistics struct */
/* RX counters are atomic because the RX path may run on a different core
 * than the stats printer; TX counters (missing here) are plain fields. */
315 struct device_statistics {
317 rte_atomic64_t rx_total_atomic;
320 rte_atomic64_t rx_atomic;
322 } __rte_cache_aligned;
323 struct device_statistics dev_statistics[MAX_DEVICES];
326 * Builds up the correct configuration for VMDQ VLAN pool map
327 * according to the pool & queue limits.
/*
 * get_eth_conf: copy vmdq_conf_default into *eth_conf, then overwrite the
 * VMDQ RX section with one pool + one VLAN map entry per virtio device.
 * Return type / 'i' declaration lines are missing from this extraction.
 */
330 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
332 struct rte_eth_vmdq_rx_conf conf;
333 struct rte_eth_vmdq_rx_conf *def_conf =
334 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
337 memset(&conf, 0, sizeof(conf));
338 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
339 conf.nb_pool_maps = num_devices;
340 conf.enable_loop_back = def_conf->enable_loop_back;
341 conf.rx_mode = def_conf->rx_mode;
/* Map VLAN tag i to pool i: one dedicated pool per device. */
343 for (i = 0; i < conf.nb_pool_maps; i++) {
344 conf.pool_map[i].vlan_id = vlan_tags[ i ];
345 conf.pool_map[i].pools = (1UL << i);
348 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
/* NOTE(review): "ð_conf" below is mojibake for "&eth_conf" — the "&e"
 * was eaten as the HTML entity "&eth;" during extraction. Must be restored
 * to "&eth_conf->rx_adv_conf.vmdq_rx_conf" before this file can compile. */
349 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf,
350 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
355 * Validate the device number according to the max pool number gotten form
356 * dev_info. If the device number is invalid, give the error message and
357 * return -1. Each device must have its own pool.
/* Returns 0 on success (the success-path lines are missing here). */
360 validate_num_devices(uint32_t max_nb_devices)
362 if (num_devices > max_nb_devices) {
363 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
370 * Initialises a given port using global settings and with the rx buffers
371 * coming from the mbuf_pool passed as parameter
/*
 * port_init: configure one physical port for VMDQ operation.
 * Sets num_devices/num_queues and the vmdq_* base globals as side effects.
 * Many error-check lines ("if (retval != 0) return retval;") are missing
 * from this extraction; they exist in the original between the calls below.
 */
374 port_init(uint8_t port)
376 struct rte_eth_dev_info dev_info;
377 struct rte_eth_conf port_conf;
378 struct rte_eth_rxconf *rxconf;
379 struct rte_eth_txconf *txconf;
380 int16_t rx_rings, tx_rings;
381 uint16_t rx_ring_size, tx_ring_size;
385 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
386 rte_eth_dev_info_get (port, &dev_info);
388 if (dev_info.max_rx_queues > MAX_QUEUES) {
389 rte_exit(EXIT_FAILURE,
390 "please define MAX_QUEUES no less than %u in %s\n",
391 dev_info.max_rx_queues, __FILE__);
/* Start from the driver's defaults, then tweak for this app. */
394 rxconf = &dev_info.default_rxconf;
395 txconf = &dev_info.default_txconf;
396 rxconf->rx_drop_en = 1;
398 /* Enable vlan offload */
399 txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
402 * Zero copy defers queue RX/TX start to the time when guest
403 * finishes its startup and packet buffers from that guest are
/* Zero-copy branch (the "if (zero_copy)" guard line is missing). */
407 rxconf->rx_deferred_start = 1;
408 rxconf->rx_drop_en = 0;
409 txconf->tx_deferred_start = 1;
412 /*configure the number of supported virtio devices based on VMDQ limits */
413 num_devices = dev_info.max_vmdq_pools;
416 rx_ring_size = num_rx_descriptor;
417 tx_ring_size = num_tx_descriptor;
418 tx_rings = dev_info.max_tx_queues;
/* Non-zero-copy defaults (the "} else {" line is missing). */
420 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
421 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
422 tx_rings = (uint16_t)rte_lcore_count();
425 retval = validate_num_devices(MAX_DEVICES);
429 /* Get port configuration. */
430 retval = get_eth_conf(&port_conf, num_devices);
433 /* NIC queues are divided into pf queues and vmdq queues. */
434 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
435 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
436 num_vmdq_queues = num_devices * queues_per_pool;
437 num_queues = num_pf_queues + num_vmdq_queues;
438 vmdq_queue_base = dev_info.vmdq_queue_base;
439 vmdq_pool_base = dev_info.vmdq_pool_base;
440 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
441 num_pf_queues, num_devices, queues_per_pool);
443 if (port >= rte_eth_dev_count()) return -1;
445 rx_rings = (uint16_t)dev_info.max_rx_queues;
446 /* Configure ethernet device. */
447 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
451 /* Setup the queues. */
/* Each RX queue draws mbufs from its own vpool_array entry. */
452 for (q = 0; q < rx_rings; q ++) {
453 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
454 rte_eth_dev_socket_id(port),
456 vpool_array[q].pool);
460 for (q = 0; q < tx_rings; q ++) {
461 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
462 rte_eth_dev_socket_id(port),
468 /* Start the device. */
469 retval = rte_eth_dev_start(port);
471 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
/* Promiscuous-mode guard ("if (promiscuous)") line is missing. */
476 rte_eth_promiscuous_enable(port);
478 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
479 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
480 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
481 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
483 vmdq_ports_eth_addr[port].addr_bytes[0],
484 vmdq_ports_eth_addr[port].addr_bytes[1],
485 vmdq_ports_eth_addr[port].addr_bytes[2],
486 vmdq_ports_eth_addr[port].addr_bytes[3],
487 vmdq_ports_eth_addr[port].addr_bytes[4],
488 vmdq_ports_eth_addr[port].addr_bytes[5]);
494 * Set character device basename.
/*
 * BUG(review): strnlen(q_arg, MAX_BASENAME_SZ) is capped at MAX_BASENAME_SZ,
 * so the "> MAX_BASENAME_SZ" test below can NEVER be true. Overlong names
 * are silently truncated by snprintf instead of being rejected. The check
 * should be "== MAX_BASENAME_SZ" (strnlen hitting the cap means q_arg is
 * at least MAX_BASENAME_SZ chars, leaving no room for the NUL).
 */
497 us_vhost_parse_basename(const char *q_arg)
499 /* parse number string */
501 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
504 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
510 * Parse the portmask provided at run time.
/* Returns the parsed mask; error/return lines are missing from this view.
 * NOTE(review): errno should be reset to 0 before strtoul for the
 * "(errno != 0)" test to be reliable — cannot confirm from visible lines. */
513 parse_portmask(const char *portmask)
520 /* parse hexadecimal string */
521 pm = strtoul(portmask, &end, 16);
522 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
533 * Parse num options at run time.
/* Parses a base-10 unsigned value, rejecting anything above
 * max_valid_value; returns -1 on bad input (return lines missing). */
536 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
543 /* parse unsigned int string */
544 num = strtoul(q_arg, &end, 10);
545 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
548 if (num > max_valid_value)
/* Print command-line usage. NOTE(review): "destintation" in the rx-retry
 * help string is a typo ("destination") — it is a runtime string, left
 * untouched here. */
559 us_vhost_usage(const char *prgname)
561 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
563 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
564 " --dev-basename <name>\n"
566 " -p PORTMASK: Set mask for ports to be used by application\n"
567 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
568 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
569 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
570 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
571 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
572 " --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
573 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
574 " --dev-basename: The basename to be used for the character device.\n"
575 " --zero-copy [0|1]: disable(default)/enable rx/tx "
577 " --rx-desc-num [0-N]: the number of descriptors on rx, "
578 "used only when zero copy is enabled.\n"
579 " --tx-desc-num [0-N]: the number of descriptors on tx, "
580 "used only when zero copy is enabled.\n",
585 * Parse the arguments given in the command line of the application.
/*
 * us_vhost_parse_args: getopt_long driven option parsing. Short options:
 * -p (portmask), -P (promiscuous). All long options dispatch by strncmp on
 * long_option[option_index].name. Returns 0 on success, -1 on error
 * (return lines missing from this extraction). Fills the global ports[]
 * array from enabled_port_mask as a side effect.
 */
588 us_vhost_parse_args(int argc, char **argv)
593 const char *prgname = argv[0];
594 static struct option long_option[] = {
595 {"vm2vm", required_argument, NULL, 0},
596 {"rx-retry", required_argument, NULL, 0},
597 {"rx-retry-delay", required_argument, NULL, 0},
598 {"rx-retry-num", required_argument, NULL, 0},
599 {"mergeable", required_argument, NULL, 0},
600 {"vlan-strip", required_argument, NULL, 0},
601 {"stats", required_argument, NULL, 0},
602 {"dev-basename", required_argument, NULL, 0},
603 {"zero-copy", required_argument, NULL, 0},
604 {"rx-desc-num", required_argument, NULL, 0},
605 {"tx-desc-num", required_argument, NULL, 0},
609 /* Parse command line */
610 while ((opt = getopt_long(argc, argv, "p:P",
611 long_option, &option_index)) != EOF) {
/* case 'p': portmask (switch/case lines missing). */
615 enabled_port_mask = parse_portmask(optarg);
616 if (enabled_port_mask == 0) {
617 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
618 us_vhost_usage(prgname);
/* case 'P': promiscuous — accept broadcast/multicast in VMDQ pools. */
625 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
626 ETH_VMDQ_ACCEPT_BROADCAST |
627 ETH_VMDQ_ACCEPT_MULTICAST;
628 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
633 /* Enable/disable vm2vm comms. */
634 if (!strncmp(long_option[option_index].name, "vm2vm",
636 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
638 RTE_LOG(INFO, VHOST_CONFIG,
639 "Invalid argument for "
641 us_vhost_usage(prgname);
644 vm2vm_mode = (vm2vm_type)ret;
648 /* Enable/disable retries on RX. */
649 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
650 ret = parse_num_opt(optarg, 1);
652 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
653 us_vhost_usage(prgname);
660 /* Specify the retries delay time (in useconds) on RX. */
661 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
662 ret = parse_num_opt(optarg, INT32_MAX);
664 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
665 us_vhost_usage(prgname);
668 burst_rx_delay_time = ret;
672 /* Specify the retries number on RX. */
673 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
674 ret = parse_num_opt(optarg, INT32_MAX);
676 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
677 us_vhost_usage(prgname);
680 burst_rx_retry_num = ret;
684 /* Enable/disable RX mergeable buffers. */
685 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
686 ret = parse_num_opt(optarg, 1);
688 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
689 us_vhost_usage(prgname);
/* Mergeable buffers imply jumbo-frame support on the NIC side. */
694 vmdq_conf_default.rxmode.jumbo_frame = 1;
695 vmdq_conf_default.rxmode.max_rx_pkt_len
696 = JUMBO_FRAME_MAX_SIZE;
701 /* Enable/disable RX VLAN strip on host. */
702 if (!strncmp(long_option[option_index].name,
703 "vlan-strip", MAX_LONG_OPT_SZ)) {
704 ret = parse_num_opt(optarg, 1);
706 RTE_LOG(INFO, VHOST_CONFIG,
707 "Invalid argument for VLAN strip [0|1]\n");
708 us_vhost_usage(prgname);
712 vmdq_conf_default.rxmode.hw_vlan_strip =
717 /* Enable/disable stats. */
718 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
719 ret = parse_num_opt(optarg, INT32_MAX);
721 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
722 us_vhost_usage(prgname);
729 /* Set character device basename. */
730 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
731 if (us_vhost_parse_basename(optarg) == -1) {
732 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
733 us_vhost_usage(prgname);
738 /* Enable/disable rx/tx zero copy. */
739 if (!strncmp(long_option[option_index].name,
740 "zero-copy", MAX_LONG_OPT_SZ)) {
741 ret = parse_num_opt(optarg, 1);
743 RTE_LOG(INFO, VHOST_CONFIG,
745 " for zero-copy [0|1]\n");
746 us_vhost_usage(prgname);
/* Zero copy is incompatible with mbuf reference counting. */
752 #ifdef RTE_MBUF_REFCNT
753 RTE_LOG(ERR, VHOST_CONFIG, "Before running "
754 "zero copy vhost APP, please "
755 "disable RTE_MBUF_REFCNT\n"
756 "in config file and then rebuild DPDK "
758 "Otherwise please disable zero copy "
759 "flag in command line!\n");
765 /* Specify the descriptor number on RX. */
766 if (!strncmp(long_option[option_index].name,
767 "rx-desc-num", MAX_LONG_OPT_SZ)) {
768 ret = parse_num_opt(optarg, MAX_RING_DESC);
769 if ((ret == -1) || (!POWEROF2(ret))) {
770 RTE_LOG(INFO, VHOST_CONFIG,
771 "Invalid argument for rx-desc-num[0-N],"
772 "power of 2 required.\n");
773 us_vhost_usage(prgname);
776 num_rx_descriptor = ret;
780 /* Specify the descriptor number on TX. */
781 if (!strncmp(long_option[option_index].name,
782 "tx-desc-num", MAX_LONG_OPT_SZ)) {
783 ret = parse_num_opt(optarg, MAX_RING_DESC);
784 if ((ret == -1) || (!POWEROF2(ret))) {
785 RTE_LOG(INFO, VHOST_CONFIG,
786 "Invalid argument for tx-desc-num [0-N],"
787 "power of 2 required.\n");
788 us_vhost_usage(prgname);
791 num_tx_descriptor = ret;
797 /* Invalid option - print options. */
799 us_vhost_usage(prgname);
/* Translate the port mask into the ports[] array. */
804 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
805 if (enabled_port_mask & (1 << i))
806 ports[num_ports++] = (uint8_t)i;
809 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
810 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
811 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
/* Cross-option sanity checks for zero-copy mode. */
815 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
816 RTE_LOG(INFO, VHOST_PORT,
817 "Vhost zero copy doesn't support software vm2vm,"
818 "please specify 'vm2vm 2' to use hardware vm2vm.\n");
822 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
823 RTE_LOG(INFO, VHOST_PORT,
824 "Vhost zero copy doesn't support jumbo frame,"
825 "please specify '--mergeable 0' to disable the "
826 "mergeable feature.\n");
834 * Update the global var NUM_PORTS and array PORTS according to system ports number
835 * and return valid ports number
/* NOTE(review): valid_num_ports is captured BEFORE num_ports is clamped,
 * and invalid entries are only marked INVALID_PORT_ID — the returned count
 * is the original request, matching upstream behavior. */
837 static unsigned check_ports_num(unsigned nb_ports)
839 unsigned valid_num_ports = num_ports;
842 if (num_ports > nb_ports) {
843 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
844 num_ports, nb_ports);
845 num_ports = nb_ports;
848 for (portid = 0; portid < num_ports; portid ++) {
849 if (ports[portid] >= nb_ports) {
850 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
851 ports[portid], (nb_ports - 1));
852 ports[portid] = INVALID_PORT_ID;
856 return valid_num_ports;
860 * Macro to print out packet contents. Wrapped in debug define so that the
861 * data path is not effected when debug is disabled.
/* Builds the whole hex dump in a stack buffer, then emits one LOG_DEBUG.
 * MAX_PRINT_BUFF bounds every write; long packets are silently truncated.
 * Repeated strnlen() per byte makes this O(size^2) — debug-only, so fine. */
864 #define PRINT_PACKET(device, addr, size, header) do { \
865 char *pkt_addr = (char*)(addr); \
866 unsigned int index; \
867 char packet[MAX_PRINT_BUFF]; \
870 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
872 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
873 for (index = 0; index < (size); index++) { \
874 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
875 "%02hhx ", pkt_addr[index]); \
877 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
879 LOG_DEBUG(VHOST_DATA, "%s", packet); \
/* No-op variant when debug logging is compiled out. */
882 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
886 * Function to convert guest physical addresses to vhost physical addresses.
887 * This is used to convert virtio buffer addresses.
/*
 * Linear scan of the device's HPA region table. On a hit, *addr_type tells
 * the caller whether [guest_pa, guest_pa+buf_len) fits entirely inside the
 * region (CONTINUOUS) or straddles its end (CROSS_SUBREG). Returns 0 with
 * *addr_type == PHYS_ADDR_INVALID when no region contains guest_pa.
 */
889 static inline uint64_t __attribute__((always_inline))
890 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
891 uint32_t buf_len, hpa_type *addr_type)
893 struct virtio_memory_regions_hpa *region;
895 uint64_t vhost_pa = 0;
897 *addr_type = PHYS_ADDR_INVALID;
899 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
900 region = &vdev->regions_hpa[regionidx];
901 if ((guest_pa >= region->guest_phys_address) &&
902 (guest_pa <= region->guest_phys_address_end)) {
903 vhost_pa = region->host_phys_addr_offset + guest_pa;
904 if (likely((guest_pa + buf_len - 1)
905 <= region->guest_phys_address_end))
906 *addr_type = PHYS_ADDR_CONTINUOUS;
908 *addr_type = PHYS_ADDR_CROSS_SUBREG;
913 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
914 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
915 (void *)(uintptr_t)vhost_pa);
921 * Compares a packet destination MAC address to a device MAC address.
/*
 * Loads 8 bytes from each 6-byte ether_addr and masks the comparison down
 * to the low 48 bits with MAC_ADDR_CMP. NOTE(review): this reads 2 bytes
 * past each struct and type-puns through uint64_t* — technically an
 * out-of-bounds read and a strict-aliasing violation; safe in practice
 * here only because the addresses live inside larger packet/struct
 * buffers. A memcmp(ea, eb, 6) would be the conforming alternative.
 */
923 static inline int __attribute__((always_inline))
924 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
926 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
930 * This function learns the MAC address of the device and registers this along with a
931 * vlan tag to a VMDQ.
/*
 * Learns the guest's MAC from the source address of its first TX packet,
 * installs it (plus a per-device VLAN tag) into the NIC's VMDQ filter, and
 * marks the device ready for RX. Uses ports[0] — the app supports a single
 * physical port (MAX_SUP_PORTS == 1).
 */
934 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
936 struct ether_hdr *pkt_hdr;
937 struct virtio_net_data_ll *dev_ll;
938 struct virtio_net *dev = vdev->dev;
941 /* Learn MAC address of guest device from packet */
942 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
944 dev_ll = ll_root_used;
/* Reject a MAC already claimed by another registered device. */
946 while (dev_ll != NULL) {
947 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
948 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
951 dev_ll = dev_ll->next;
954 for (i = 0; i < ETHER_ADDR_LEN; i++)
955 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
957 /* vlan_tag currently uses the device_id. */
958 vdev->vlan_tag = vlan_tags[dev->device_fh];
960 /* Print out VMDQ registration info. */
961 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
963 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
964 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
965 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
968 /* Register the MAC address. */
969 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
970 (uint32_t)dev->device_fh + vmdq_pool_base)
972 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
975 /* Enable stripping of the vlan tag as we handle routing. */
/* Guard ("if (vlan_strip)") line is missing from this extraction. */
977 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
978 (uint16_t)vdev->vmdq_rx_q, 1);
980 /* Set device as ready for RX. */
981 vdev->ready = DEVICE_RX;
987 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
988 * queue before disabling RX on the device.
/*
 * Reverses link_vmdq(): drops the MAC filter, zeroes the cached MAC, and
 * drains any packets already queued on the device's VMDQ RX queue before
 * returning it to the MAC-learning state.
 */
991 unlink_vmdq(struct vhost_dev *vdev)
995 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
997 if (vdev->ready == DEVICE_RX) {
998 /*clear MAC and VLAN settings*/
999 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1000 for (i = 0; i < 6; i++)
1001 vdev->mac_address.addr_bytes[i] = 0;
1005 /*Clear out the receive buffers*/
1006 rx_count = rte_eth_rx_burst(ports[0],
1007 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
/* Loop until the queue is empty ("while (rx_count)" line missing). */
1010 for (i = 0; i < rx_count; i++)
1011 rte_pktmbuf_free(pkts_burst[i]);
1013 rx_count = rte_eth_rx_burst(ports[0],
1014 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1017 vdev->ready = DEVICE_MAC_LEARNING;
1022 * Check if the packet destination MAC address is for a local device. If so then put
1023 * the packet on that devices RX queue. If not then return.
/*
 * Software VM2VM path: walk the used-device list looking for a device whose
 * learned MAC matches the packet's destination. On match, enqueue directly
 * into that guest's virtio RX ring (bypassing the NIC) and bump stats.
 * Return value distinguishes "delivered/dropped locally" from "external"
 * (the return statements are missing from this extraction).
 */
1025 static inline int __attribute__((always_inline))
1026 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1028 struct virtio_net_data_ll *dev_ll;
1029 struct ether_hdr *pkt_hdr;
1031 struct virtio_net *dev = vdev->dev;
1032 struct virtio_net *tdev; /* destination virtio device */
1034 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1036 /*get the used devices list*/
1037 dev_ll = ll_root_used;
1039 while (dev_ll != NULL) {
1040 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1041 &dev_ll->vdev->mac_address)) {
1043 /* Drop the packet if the TX packet is destined for the TX device. */
1044 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1045 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1049 tdev = dev_ll->vdev->dev;
1052 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1054 if (unlikely(dev_ll->vdev->remove)) {
1055 /*drop the packet if the device is marked for removal*/
1056 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1058 /*send the packet to the local virtio device*/
1059 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
/* Stats guard ("if (enable_stats)") line is missing here. */
1062 &dev_statistics[tdev->device_fh].rx_total_atomic,
1065 &dev_statistics[tdev->device_fh].rx_atomic,
1067 dev_statistics[tdev->device_fh].tx_total++;
1068 dev_statistics[tdev->device_fh].tx += ret;
1074 dev_ll = dev_ll->next;
1081 * Check if the destination MAC of a packet is one local VM,
1082 * and get its vlan tag, and offset if it is.
/*
 * Hardware VM2VM helper: if the destination MAC belongs to a registered
 * local VM, report the VLAN tag the NIC should use and set *offset to
 * VLAN_HLEN so the caller can compensate for the tag HW strip removed.
 * Same-device destinations are treated as drops.
 */
1084 static inline int __attribute__((always_inline))
1085 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1086 uint32_t *offset, uint16_t *vlan_tag)
1088 struct virtio_net_data_ll *dev_ll = ll_root_used;
1089 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1091 while (dev_ll != NULL) {
1092 if ((dev_ll->vdev->ready == DEVICE_RX)
1093 && ether_addr_cmp(&(pkt_hdr->d_addr),
1094 &dev_ll->vdev->mac_address)) {
1096 * Drop the packet if the TX packet is
1097 * destined for the TX device.
1099 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1100 LOG_DEBUG(VHOST_DATA,
1101 "(%"PRIu64") TX: Source and destination"
1102 " MAC addresses are the same. Dropping "
1104 dev_ll->vdev->dev->device_fh);
1109 * HW vlan strip will reduce the packet length
1110 * by minus length of vlan tag, so need restore
1111 * the packet length by plus it.
1113 *offset = VLAN_HLEN;
1116 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1118 LOG_DEBUG(VHOST_DATA,
1119 "(%"PRIu64") TX: pkt to local VM device id:"
1120 "(%"PRIu64") vlan tag: %d.\n",
1121 dev->device_fh, dev_ll->vdev->dev->device_fh,
1126 dev_ll = dev_ll->next;
1132 * This function routes the TX packet to the correct interface. This may be a local device
1133 * or the physical port.
/*
 * TX routing: try the software VM2VM path first; otherwise tag the packet
 * (either rewriting a guest-inserted 802.1Q header or requesting HW VLAN
 * insertion via PKT_TX_VLAN_PKT + vlan_tci) and buffer it on this lcore's
 * TX queue. The queue is flushed to the NIC once it reaches MAX_PKT_BURST
 * packets; unsent mbufs from a short rte_eth_tx_burst() are freed.
 */
1135 static inline void __attribute__((always_inline))
1136 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1138 struct mbuf_table *tx_q;
1139 struct rte_mbuf **m_table;
1140 unsigned len, ret, offset = 0;
1141 const uint16_t lcore_id = rte_lcore_id();
1142 struct virtio_net *dev = vdev->dev;
1143 struct ether_hdr *nh;
1145 /*check if destination is local VM*/
1146 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1147 rte_pktmbuf_free(m);
1151 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1152 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1153 rte_pktmbuf_free(m);
1158 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1160 /*Add packet to the port tx queue*/
1161 tx_q = &lcore_tx_queue[lcore_id];
1164 nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1165 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1166 /* Guest has inserted the vlan tag. */
1167 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1168 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
/* For HW vm2vm, force the tag to the destination VM's tag so the NIC
 * VMDQ filter routes the packet back to the right pool. */
1169 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1170 (vh->vlan_tci != vlan_tag_be))
1171 vh->vlan_tci = vlan_tag_be;
/* else branch (untagged packet): request HW VLAN insertion. */
1173 m->ol_flags = PKT_TX_VLAN_PKT;
1176 * Find the right seg to adjust the data len when offset is
1177 * bigger than tail room size.
1179 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1180 if (likely(offset <= rte_pktmbuf_tailroom(m)))
1181 m->data_len += offset;
1183 struct rte_mbuf *seg = m;
1185 while ((seg->next != NULL) &&
1186 (offset > rte_pktmbuf_tailroom(seg)))
1189 seg->data_len += offset;
1191 m->pkt_len += offset;
1194 m->vlan_tci = vlan_tag;
1197 tx_q->m_table[len] = m;
/* Stats guard ("if (enable_stats)") line is missing here. */
1200 dev_statistics[dev->device_fh].tx_total++;
1201 dev_statistics[dev->device_fh].tx++;
1204 if (unlikely(len == MAX_PKT_BURST)) {
1205 m_table = (struct rte_mbuf **)tx_q->m_table;
1206 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1207 /* Free any buffers not handled by TX and update the port stats. */
1208 if (unlikely(ret < len)) {
1210 rte_pktmbuf_free(m_table[ret]);
1211 } while (++ret < len);
/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
switch_worker(__attribute__((unused)) void *arg)
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	/* TSC ticks corresponding to BURST_TX_DRAIN_US microseconds (rounded up). */
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;

	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;

	tx_q = &lcore_tx_queue[lcore_id];
	/* Locate this lcore's slot in lcore_ids to set up its TX queue. */
	for (i = 0; i < num_cores; i ++) {
		if (lcore_ids[i] == lcore_id) {

		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);

			/* Tx any packets in the queue */
			ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
					       (struct rte_mbuf **)tx_q->m_table,
					       (uint16_t)tx_q->len);
			/* Drop whatever the NIC did not accept. */
			if (unlikely(ret < tx_q->len)) {
					rte_pktmbuf_free(tx_q->m_table[ret]);
				} while (++ret < tx_q->len);

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			vdev = dev_ll->vdev;

			/* Device scheduled for removal: skip it and acknowledge. */
			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				vdev->ready = DEVICE_SAFE_REMOVE;

			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX: pull packets from the device's VMDq queue. */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				/*
				 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss
				 * Here MAX_PKT_BURST must be less than virtio queue size
				 */
				if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
					for (retry = 0; retry < burst_rx_retry_num; retry++) {
						rte_delay_us(burst_rx_delay_time);
						if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))

				ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
					&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
				/* Enqueue copied the data into guest buffers; free the mbufs. */
				while (likely(rx_count)) {
					rte_pktmbuf_free(pkts_burst[rx_count]);

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
							rte_pktmbuf_free(pkts_burst[--tx_count]);
					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
/*
 * This function gets the available ring number for zero copy rx.
 * Only one thread will call this function for a particular virtio device,
 * so, it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];

	/* Volatile read: the guest updates avail->idx concurrently. */
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	/* Unsigned wrap-around subtraction yields the unconsumed entry count. */
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
/*
 * This function gets the available ring index for zero copy rx,
 * it will retry 'burst_rx_retry_num' times till it gets enough ring index.
 * Only one thread will call this function for a particular virtio device,
 * so, it is designed as a non-thread-safe function.
 *
 * Returns the number of entries actually reserved (possibly < count, 0 on
 * failure); *res_base_idx receives the base index of the reservation.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t free_entries;

	/* Reservation starts where the previous one ended. */
	*res_base_idx = vq->last_used_idx_res;
	/* Volatile read: the guest updates avail->idx concurrently. */
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
		"res base idx:%d, free entries:%d\n",
		dev->device_fh, avail_idx, *res_base_idx,

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)

	/* check that we have enough buffers */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);

	/* Reserve 'count' entries; the next caller starts after them. */
	vq->last_used_idx_res = *res_base_idx + count;
/*
 * This function puts a descriptor back onto the used ring, reporting zero
 * bytes written (used when a descriptor must be returned unprocessed).
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
	uint16_t res_cur_idx = vq->last_used_idx;
	/* Return the descriptor with zero length: nothing was written to it. */
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	/* Make the ring entry visible before publishing the new used index. */
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
/*
 * This function gets an available descriptor from the virtio vring and an
 * un-attached mbuf from vpool->ring, and then attaches them together. It needs
 * to adjust the offset for buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put to the wrong location
 * in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *mbuf = NULL;
	struct vpool *vpool;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	/* One mbuf pool per VMDq RX queue. */
	vpool = &vpool_array[vdev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

		if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,

		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			/* Chained descriptors: header and data are in separate
			 * buffers; translate the data buffer. */
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
			/* Single descriptor: skip past the virtio-net header. */
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(vdev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-regioin found when attaching RX frame "
				"buffer address!\n",
			put_desc_to_used_list_zcp(vq, desc_idx);
	} while (unlikely(phys_addr == 0));

	/* Take an un-attached mbuf from the queue's recycle ring. */
	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
		put_desc_to_used_list_zcp(vq, desc_idx);

	/* Guest buffer must be large enough for the configured room size. */
	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);

	/* Point the mbuf at the guest buffer, compensating for the headroom
	 * offset the PMD will add back when it writes the frame. */
	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	/* Stash the descriptor index in the headroom for the completion path. */
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);
/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
	const struct rte_mempool *mp = m->pool;
	/* The original buffer lives immediately after the mbuf header. */
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	/* Restore the physical address to the mbuf's own buffer. */
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_len = (uint16_t)buf_len;

	/* Default data offset, clipped to the buffer length. */
	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;
/*
 * This function is called after packets have been transmitted. It fetches mbufs
 * from vpool->pool, detaches them and puts them into vpool->ring. It also
 * updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		dev->device_fh, rte_ring_count(vpool->ring));

	/* Recycle every mbuf the PMD has returned to the mempool. */
	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		/* Still attached to a guest buffer: detach before recycling. */
		if (likely(MBUF_EXT_MEM(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	/* Publish the ring entries before bumping the visible used index. */
	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
/*
 * This function is called when a virtio device is destroyed.
 * It fetches mbufs from vpool->pool, detaches them, and puts them into
 * vpool->ring so the pool is left fully recycled.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	/* Drain the mempool, detaching any mbuf still holding a guest buffer. */
	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(MBUF_EXT_MEM(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
/*
 * Zero-copy RX completion: writes the virtio-net header for each received
 * packet, updates the used ring entries and the used index, and kicks the
 * guest if interrupts are enabled. Returns the number of packets processed.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
		= {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t res_cur_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

	res_cur_idx = vq->last_used_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_cur_idx + count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success != count) {
		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
			"pkt[%d] descriptor idx: %d\n",
			dev->device_fh, packet_success,
			MBUF_HEADROOM_UINT32(buff));

			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
			+ RTE_PKTMBUF_HEADROOM),
			rte_pktmbuf_data_len(buff), 0);

		/* Buffer address translation for virtio header. */
		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc->len = vq->vhost_hlen;
			desc = &vq->desc[desc->next];
			desc->len = rte_pktmbuf_data_len(buff);
			desc->len = packet_len;

		/* Update used ring with desc information */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id
			= head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len

		/* A header is required per buffer. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (likely(packet_success < count)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);

	/* Publish the ring entries before bumping the visible used index. */
	rte_compiler_barrier();

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: after update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 *
 * dev: source virtio device; m: dummy mbuf describing the guest buffer;
 * desc_idx: guest descriptor backing the frame; need_copy: non-zero when the
 * frame data must be copied into the new mbuf instead of referenced.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t desc_idx, uint8_t need_copy)
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf = NULL;
	unsigned len, ret, offset = 0;
	struct vpool *vpool;
	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
	uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

	/* Add packet to the port tx queue (one zero-copy queue per VMDq queue). */
	tx_q = &tx_queue_zcp[vmdq_rx_q];

	/* Allocate an mbuf and populate the structure. */
	vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
		RTE_LOG(ERR, VHOST_DATA,
			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
		/* Return the descriptor so the guest can reuse it. */
		put_desc_to_used_list_zcp(vq, desc_idx);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Avoid using a vlan tag from any vm for external pkt, such as
		 * vlan_tags[dev->device_fh], otherwise, it conflicts when pool
		 * selection, MAC address determines it as an external pkt
		 * which should go to network, while vlan tag determine it as
		 * a vm2vm pkt should forward to another vm. Hardware confuses
		 * such an ambiguous situation, so pkt will be lost.
		 */
		vlan_tag = external_pkt_default_vlan_tag;
		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
			MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
			__rte_mbuf_raw_free(mbuf);

	mbuf->nb_segs = m->nb_segs;
	mbuf->next = m->next;
	mbuf->data_len = m->data_len + offset;
	mbuf->pkt_len = mbuf->data_len;
	if (unlikely(need_copy)) {
		/* Copy the packet contents to the mbuf. */
		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
			rte_pktmbuf_mtod(m, void *),
		/* Zero copy: reference the guest buffer directly. */
		mbuf->data_off = m->data_off;
		mbuf->buf_physaddr = m->buf_physaddr;
		mbuf->buf_addr = m->buf_addr;
	mbuf->ol_flags = PKT_TX_VLAN_PKT;
	mbuf->vlan_tci = vlan_tag;
	mbuf->l2_len = sizeof(struct ether_hdr);
	mbuf->l3_len = sizeof(struct ipv4_hdr);
	/* Stash the descriptor index for the TX completion path. */
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	tx_q->m_table[len] = mbuf;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
		(mbuf->next == NULL) ? "null" : "non-null");

	dev_statistics[dev->device_fh].tx_total++;
	dev_statistics[dev->device_fh].tx++;

	/* Flush the queue once a full burst has accumulated. */
	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0],
			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/*
		 * Free any buffers not handled by TX and update
		 */
		if (unlikely(ret < len)) {
			rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);

		/* Recycle completed zero-copy TX mbufs. */
		txmbuf_clean_zcp(dev, vpool);
/*
 * This function TX all available packets in virtio TX queue for one
 * virtio-net device. If it is first packet, it learns MAC address and
 * sets up VMDq.
 */
static inline void __attribute__((always_inline))
virtio_dev_tx_zcp(struct virtio_net *dev)
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	uint64_t buff_addr = 0, phys_addr;
	uint32_t head[MAX_PKT_BURST];
	uint16_t free_entries, packet_success = 0;
	uint8_t need_copy = 0;
	struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

	vq = dev->virtqueue[VIRTIO_TXQ];
	/* Volatile read: the guest updates avail->idx concurrently. */
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);

	/* If there are no available buffers then return. */
	if (vq->last_used_idx_res == avail_idx)

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);

	/* Prefetch available ring to retrieve head indexes. */
	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);

	/* Get the number of free entries in the ring */
	free_entries = (avail_idx - vq->last_used_idx_res);

	/* Limit to MAX_PKT_BURST. */
		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
		dev->device_fh, free_entries);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (i = 0; i < free_entries; i++)
		= vq->avail->ring[(vq->last_used_idx_res + i)

	vq->last_used_idx_res += free_entries;

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);
	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

	while (packet_success < free_entries) {
		desc = &vq->desc[head[packet_success]];

		/* Discard first buffer as it is the virtio header */
		desc = &vq->desc[desc->next];

		/* Buffer address translation. */
		buff_addr = gpa_to_vva(dev, desc->addr);
		/* Need check extra VLAN_HLEN size for inserting VLAN tag */
		phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,

		if (likely(packet_success < (free_entries - 1)))
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success + 1]]);

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Invalid frame buffer address found"
				"when TX packets!\n",

		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/*
		 * Setup dummy mbuf. This is copied to a real mbuf if
		 * transmitted out the physical port.
		 */
		m.data_len = desc->len;
		m.buf_addr = (void *)(uintptr_t)buff_addr;
		m.buf_physaddr = phys_addr;

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-regioin found when attaching TX frame "
				"buffer address!\n",

		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

		/*
		 * If this is the first received packet we need to learn
		 * the MAC and setup VMDQ
		 */
		if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
			if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
				/*
				 * Discard frame if device is scheduled for
				 * removal or a duplicate MAC address is found.
				 */
				packet_success += free_entries;
				vq->last_used_idx += packet_success;

		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
/*
 * This function is called by each data core. It handles all RX/TX registered
 * with the core. For TX the specific lcore linked list is used. For RX, MAC
 * addresses are compared with all devices in the main linked list.
 * (Zero-copy variant of switch_worker.)
 */
switch_worker_zcp(__attribute__((unused)) void *arg)
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	/* TSC ticks corresponding to BURST_TX_DRAIN_US microseconds (rounded up). */
	const uint64_t drain_tsc
		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
		* BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	const uint16_t lcore_id = rte_lcore_id();
	uint16_t count_in_ring, rx_count = 0;

	RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);

	lcore_ll = lcore_info[lcore_id].lcore_ll;

		cur_tsc = rte_rdtsc();

		/* TX burst queue drain */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {
			/*
			 * Get mbuf from vpool.pool and detach mbuf and
			 * put back into vpool.ring.
			 */
			dev_ll = lcore_ll->ll_root_used;
			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
				/* Get virtio device ID */
				vdev = dev_ll->vdev;

				if (likely(!vdev->remove)) {
					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];

					LOG_DEBUG(VHOST_DATA,
						"TX queue drained after timeout"
						" with burst size %u\n",

					/*
					 * Tx any packets in the queue
					 */
					ret = rte_eth_tx_burst(
						(uint16_t)tx_q->txq_id,
						(struct rte_mbuf **)
						(uint16_t)tx_q->len);
					/* Drop whatever the NIC did not accept. */
					if (unlikely(ret < tx_q->len)) {
							tx_q->m_table[ret]);
						} while (++ret < tx_q->len);

					/* Recycle completed zero-copy TX mbufs. */
					txmbuf_clean_zcp(dev,
						&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);

				dev_ll = dev_ll->next;

		rte_prefetch0(lcore_ll->ll_root_used);

		/*
		 * Inform the configuration core that we have exited the linked
		 * list and that no devices are in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/* Process devices */
		dev_ll = lcore_ll->ll_root_used;

		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
			vdev = dev_ll->vdev;

			/* Device scheduled for removal: skip it and acknowledge. */
			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				vdev->ready = DEVICE_SAFE_REMOVE;

			if (likely(vdev->ready == DEVICE_RX)) {
				uint32_t index = vdev->vmdq_rx_q;
					= rte_ring_count(vpool_array[index].ring);
				uint16_t free_entries
					= (uint16_t)get_available_ring_num_zcp(dev);

				/*
				 * Attach all mbufs in vpool.ring and put back
				 */
					i < RTE_MIN(free_entries,
					RTE_MIN(count_in_ring, MAX_PKT_BURST));
					attach_rxmbuf_zcp(dev);

				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst,

					ret_count = virtio_dev_rx_zcp(dev,
							pkts_burst, rx_count);
					dev_statistics[dev->device_fh].rx_total
					dev_statistics[dev->device_fh].rx
					/* Return unconsumed mbufs to the queue ring. */
					while (likely(rx_count)) {
							pkts_burst[rx_count]);
						rte_ring_sp_enqueue(
							vpool_array[index].ring,
							(void *)pkts_burst[rx_count]);

			if (likely(!vdev->remove))
				/* Handle guest TX */
				virtio_dev_tx_zcp(dev);

			/* Move to the next device in the list */
			dev_ll = dev_ll->next;
2188 * Add an entry to a used linked list. A free entry must first be found
2189 * in the free linked list using get_data_ll_free_entry();
2192 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2193 struct virtio_net_data_ll *ll_dev)
2195 struct virtio_net_data_ll *ll = *ll_root_addr;
2197 /* Set next as NULL and use a compiler barrier to avoid reordering. */
2198 ll_dev->next = NULL;
2199 rte_compiler_barrier();
2201 /* If ll == NULL then this is the first device. */
2203 /* Increment to the tail of the linked list. */
2204 while ((ll->next != NULL) )
2209 *ll_root_addr = ll_dev;
2214 * Remove an entry from a used linked list. The entry must then be added to
2215 * the free linked list using put_data_ll_free_entry().
2218 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2219 struct virtio_net_data_ll *ll_dev,
2220 struct virtio_net_data_ll *ll_dev_last)
2222 struct virtio_net_data_ll *ll = *ll_root_addr;
2224 if (unlikely((ll == NULL) || (ll_dev == NULL)))
2228 *ll_root_addr = ll_dev->next;
2230 if (likely(ll_dev_last != NULL))
2231 ll_dev_last->next = ll_dev->next;
2233 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2237 * Find and return an entry from the free linked list.
2239 static struct virtio_net_data_ll *
2240 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2242 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2243 struct virtio_net_data_ll *ll_dev;
2245 if (ll_free == NULL)
2249 *ll_root_addr = ll_free->next;
2255 * Place an entry back on to the free linked list.
2258 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2259 struct virtio_net_data_ll *ll_dev)
2261 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2266 ll_dev->next = ll_free;
2267 *ll_root_addr = ll_dev;
2271 * Creates a linked list of a given size.
2273 static struct virtio_net_data_ll *
2274 alloc_data_ll(uint32_t size)
2276 struct virtio_net_data_ll *ll_new;
2279 /* Malloc and then chain the linked list. */
2280 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2281 if (ll_new == NULL) {
2282 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2286 for (i = 0; i < size - 1; i++) {
2287 ll_new[i].vdev = NULL;
2288 ll_new[i].next = &ll_new[i+1];
2290 ll_new[i].next = NULL;
/*
 * Create the main linked list along with each individual cores linked list. A used and a free list
 * are created to manage entries.
 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		/* Per-lcore bookkeeping for the devices this core will service. */
		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
		if (lcore_info[lcore].lcore_ll == NULL) {
			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");

		lcore_info[lcore].lcore_ll->device_num = 0;
		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
		/* Round the per-core free-list size up when devices do not divide evenly. */
		if (num_devices % num_switching_cores)
			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);

	/* Allocate devices up to a maximum of MAX_DEVICES. */
	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
/*
 * Remove a device from the specific data core linked list and from the main linked list. Synchonization
 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
destroy_device (volatile struct virtio_net *dev)
	struct virtio_net_data_ll *ll_lcore_dev_cur;
	struct virtio_net_data_ll *ll_main_dev_cur;
	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
	struct virtio_net_data_ll *ll_main_dev_last = NULL;
	struct vhost_dev *vdev;

	dev->flags &= ~VIRTIO_DEV_RUNNING;

	vdev = (struct vhost_dev *)dev->priv;
	/* set the remove flag. */
	/* Spin until the data core marks the device safe to remove. */
	while(vdev->ready != DEVICE_SAFE_REMOVE) {

	/* Search for entry to be removed from lcore ll */
	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
	while (ll_lcore_dev_cur != NULL) {
		if (ll_lcore_dev_cur->vdev == vdev) {
			/* Track the predecessor for unlinking. */
			ll_lcore_dev_last = ll_lcore_dev_cur;
			ll_lcore_dev_cur = ll_lcore_dev_cur->next;

	if (ll_lcore_dev_cur == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%"PRIu64") Failed to find the dev to be destroy.\n",

	/* Search for entry to be removed from main ll */
	ll_main_dev_cur = ll_root_used;
	ll_main_dev_last = NULL;
	while (ll_main_dev_cur != NULL) {
		if (ll_main_dev_cur->vdev == vdev) {
			ll_main_dev_last = ll_main_dev_cur;
			ll_main_dev_cur = ll_main_dev_cur->next;

	/* Remove entries from the lcore and main ll. */
	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
	 * they can no longer access the device removed from the linked lists and that the devices
	 * are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {

	/* Add the entries back to the lcore and main free ll. */
	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);

	/* Decrement number of device on the lcore. */
	lcore_info[vdev->coreid].lcore_ll->device_num--;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);

	/* Return zero-copy buffers and stop the device's HW queues.
	 * NOTE(review): this section appears guarded by a zero-copy check
	 * elided from this view - confirm against the full file. */
	struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];

	/* Stop the RX queue. */
	if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") In destroy_device: Failed to stop "

	LOG_DEBUG(VHOST_CONFIG,
		"(%"PRIu64") in destroy_device: Start put mbuf in "
		"mempool back to ring for RX queue: %d\n",
		dev->device_fh, vdev->vmdq_rx_q);

	mbuf_destroy_zcp(vpool);

	/* Stop the TX queue. */
	if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") In destroy_device: Failed to "
			"stop tx queue:%d\n",
			dev->device_fh, vdev->vmdq_rx_q);

	/* TX vpools live after the RX vpools in vpool_array. */
	vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];

	LOG_DEBUG(VHOST_CONFIG,
		"(%"PRIu64") destroy_device: Start put mbuf in mempool "
		"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
		dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),

	mbuf_destroy_zcp(vpool);
	rte_free(vdev->regions_hpa);
2452 * Count the physically contiguous sub-regions inside one region whose
2453 * vhost virtual address range is contiguous. The region starts at
2454 * vva_start and spans 'size' bytes. Returns the number of sub-regions
2455 * (the increment and return statements are elided in this excerpt).
2457 check_hpa_regions(uint64_t vva_start, uint64_t size)
2459 uint32_t i, nregions = 0, page_size = getpagesize();
2460 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
/* Both the start address and the size are expected to be page aligned;
 * misalignment is only reported at debug level, not rejected. */
2461 if (vva_start % page_size) {
2462 LOG_DEBUG(VHOST_CONFIG,
2463 "in check_countinous: vva start(%p) mod page_size(%d) "
2465 (void *)(uintptr_t)vva_start, page_size);
2468 if (size % page_size) {
2469 LOG_DEBUG(VHOST_CONFIG,
2470 "in check_countinous: "
2471 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
/* Walk the region page by page, translating each virtual page to its
 * physical address; a gap between consecutive pages ends one physically
 * contiguous sub-region and starts the next. */
2475 for (i = 0; i < size - page_size; i = i + page_size) {
2477 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2478 next_phys_addr = rte_mem_virt2phy(
2479 (void *)(uintptr_t)(vva_start + i + page_size));
2480 if ((cur_phys_addr + page_size) != next_phys_addr) {
/* Discontinuity found -- presumably ++nregions here (elided); the
 * debug logs below report the break point on both the HVA and HPA side. */
2482 LOG_DEBUG(VHOST_CONFIG,
2483 "in check_continuous: hva addr:(%p) is not "
2484 "continuous with hva addr:(%p), diff:%d\n",
2485 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2486 (void *)(uintptr_t)(vva_start + (uint64_t)i
2487 + page_size), page_size);
2488 LOG_DEBUG(VHOST_CONFIG,
2489 "in check_continuous: hpa addr:(%p) is not "
2490 "continuous with hpa addr:(%p), "
2491 "diff:(%"PRIu64")\n",
2492 (void *)(uintptr_t)cur_phys_addr,
2493 (void *)(uintptr_t)next_phys_addr,
2494 (next_phys_addr-cur_phys_addr));
2501 * Divide each region whose vhost virtual address range is contiguous into
2502 * sub-regions such that the physical addresses within each sub-region are
2503 * contiguous, and fill the offset (to GPA), size etc. of each sub-region
2504 * into regions_hpa. Returns the number of sub-regions written; expected to
2505 * match the count computed earlier by check_hpa_regions().
/* NOTE(review): several statements (loop header pieces, braces, increments
 * of regionidx_hpa/k) are elided in this excerpt; comments below describe
 * only the visible code. */
2507 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2509 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2510 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
/* Nothing to fill into: bail out (return value line elided here). */
2512 if (mem_region_hpa == NULL)
2515 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
/* The vhost virtual address of the region start is its guest physical
 * address plus the per-region GPA->HVA offset. */
2516 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2517 virtio_memory->regions[regionidx].address_offset;
/* Seed the first sub-region: same GPA start as the source region, with
 * the host-physical offset derived from the HVA->HPA translation. */
2518 mem_region_hpa[regionidx_hpa].guest_phys_address
2519 = virtio_memory->regions[regionidx].guest_phys_address;
2520 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2521 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2522 mem_region_hpa[regionidx_hpa].guest_phys_address;
2523 LOG_DEBUG(VHOST_CONFIG,
2524 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2527 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2528 LOG_DEBUG(VHOST_CONFIG,
2529 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2532 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
/* Scan the region page by page (loop initialization elided) comparing
 * physical addresses of adjacent pages, as in check_hpa_regions(). */
2534 i < virtio_memory->regions[regionidx].memory_size -
2537 cur_phys_addr = rte_mem_virt2phy(
2538 (void *)(uintptr_t)(vva_start + i));
2539 next_phys_addr = rte_mem_virt2phy(
2540 (void *)(uintptr_t)(vva_start +
/* Physical discontinuity: close the current sub-region (end address
 * and size), then open the next one starting where this one ended. */
2542 if ((cur_phys_addr + page_size) != next_phys_addr) {
2543 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2544 mem_region_hpa[regionidx_hpa].guest_phys_address +
2546 mem_region_hpa[regionidx_hpa].memory_size
2548 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2549 "phys addr end [%d]:(%p)\n",
2552 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2553 LOG_DEBUG(VHOST_CONFIG,
2554 "in fill_hpa_regions: guest phys addr "
2558 (mem_region_hpa[regionidx_hpa].memory_size));
2559 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2560 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2562 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2564 mem_region_hpa[regionidx_hpa].guest_phys_address;
2565 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2566 " phys addr start[%d]:(%p)\n",
2569 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2570 LOG_DEBUG(VHOST_CONFIG,
2571 "in fill_hpa_regions: host phys addr "
2575 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
/* End of the source region: close out the final sub-region, which runs
 * to the region's end (k accumulates the sub-region size; its update is
 * elided in this excerpt). */
2581 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2582 = mem_region_hpa[regionidx_hpa].guest_phys_address
2584 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2585 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2586 "[%d]:(%p)\n", regionidx_hpa,
2588 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2589 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2590 "[%d]:(%p)\n", regionidx_hpa,
2592 (mem_region_hpa[regionidx_hpa].memory_size));
/* Total number of HPA sub-regions produced. */
2595 return regionidx_hpa;
2599 * A new device is added to a data core. First the device is added to the
2600 * main linked list and then allocated to a specific data core.
/* NOTE(review): this excerpt has gaps in the embedded line numbering; error
 * returns, closing braces and some guard conditions are not all visible. */
2603 new_device (struct virtio_net *dev)
2605 struct virtio_net_data_ll *ll_dev;
2606 int lcore, core_add = 0;
2607 uint32_t device_num_min = num_devices;
2608 struct vhost_dev *vdev;
/* Per-device bookkeeping structure, zeroed and cache-line aligned. */
2611 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
/* Allocation failure path (the surrounding if and return are elided). */
2613 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
/* Zero-copy setup: count the physically contiguous sub-regions of the
 * guest memory so the HPA table below can be sized exactly. The count
 * starts from the number of guest regions and grows by one per physical
 * discontinuity found inside each region. */
2621 vdev->nregions_hpa = dev->mem->nregions;
2622 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2624 += check_hpa_regions(
2625 dev->mem->regions[regionidx].guest_phys_address
2626 + dev->mem->regions[regionidx].address_offset,
2627 dev->mem->regions[regionidx].memory_size);
/* Allocate the GPA->HPA translation table for zero-copy. */
2631 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2632 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2633 RTE_CACHE_LINE_SIZE);
2634 if (vdev->regions_hpa == NULL) {
2635 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
/* Populate the table; the fill pass must produce exactly the count the
 * scan pass predicted, otherwise the translation table is unusable. */
2641 if (fill_hpa_memory_regions(
2642 vdev->regions_hpa, dev->mem
2643 ) != vdev->nregions_hpa) {
2645 RTE_LOG(ERR, VHOST_CONFIG,
2646 "hpa memory regions number mismatch: "
2647 "[%d]\n", vdev->nregions_hpa);
2648 rte_free(vdev->regions_hpa);
2655 /* Add device to main ll */
2656 ll_dev = get_data_ll_free_entry(&ll_root_free);
2657 if (ll_dev == NULL) {
2658 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2659 "of %d devices per core has been reached\n",
2660 dev->device_fh, num_devices);
2661 if (vdev->regions_hpa)
2662 rte_free(vdev->regions_hpa);
2666 ll_dev->vdev = vdev;
2667 add_data_ll_entry(&ll_root_used, ll_dev);
/* Each device owns one dedicated VMDq RX queue, indexed by device_fh
 * (assignment target line is elided here). */
2669 = dev->device_fh * queues_per_pool + vmdq_queue_base;
/* Zero-copy path (guard elided): pre-attach all mbufs queued on this
 * device's vpool ring and start its dedicated RX/TX hardware queues. */
2672 uint32_t index = vdev->vmdq_rx_q;
2673 uint32_t count_in_ring, i;
2674 struct mbuf_table *tx_q;
2676 count_in_ring = rte_ring_count(vpool_array[index].ring);
2678 LOG_DEBUG(VHOST_CONFIG,
2679 "(%"PRIu64") in new_device: mbuf count in mempool "
2680 "before attach is: %d\n",
2682 rte_mempool_count(vpool_array[index].pool));
2683 LOG_DEBUG(VHOST_CONFIG,
2684 "(%"PRIu64") in new_device: mbuf count in ring "
2685 "before attach is : %d\n",
2686 dev->device_fh, count_in_ring);
2689 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2691 for (i = 0; i < count_in_ring; i++)
2692 attach_rxmbuf_zcp(dev);
2694 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2695 "mempool after attach is: %d\n",
2697 rte_mempool_count(vpool_array[index].pool));
2698 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2699 "ring after attach is : %d\n",
2701 rte_ring_count(vpool_array[index].ring));
/* The zero-copy TX queue table is indexed the same way as the RX queue. */
2703 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2704 tx_q->txq_id = vdev->vmdq_rx_q;
/* Failure to start the TX queue: drain the vpool, free the HPA table
 * and abort the device add (return elided in this excerpt). */
2706 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2707 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2709 LOG_DEBUG(VHOST_CONFIG,
2710 "(%"PRIu64") In new_device: Failed to start "
2712 dev->device_fh, vdev->vmdq_rx_q);
2714 mbuf_destroy_zcp(vpool);
2715 rte_free(vdev->regions_hpa);
/* Failure to start the RX queue: also roll back the TX queue started
 * just above before cleaning up. */
2720 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2721 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2723 LOG_DEBUG(VHOST_CONFIG,
2724 "(%"PRIu64") In new_device: Failed to start "
2726 dev->device_fh, vdev->vmdq_rx_q);
2728 /* Stop the TX queue. */
2729 if (rte_eth_dev_tx_queue_stop(ports[0],
2730 vdev->vmdq_rx_q) != 0) {
2731 LOG_DEBUG(VHOST_CONFIG,
2732 "(%"PRIu64") In new_device: Failed to "
2733 "stop tx queue:%d\n",
2734 dev->device_fh, vdev->vmdq_rx_q);
2737 mbuf_destroy_zcp(vpool);
2738 rte_free(vdev->regions_hpa);
2745 /*reset ready flag*/
2746 vdev->ready = DEVICE_MAC_LEARNING;
2749 /* Find a suitable lcore to add the device: pick the worker core that
 * currently serves the fewest devices (core_add update elided). */
2750 RTE_LCORE_FOREACH_SLAVE(lcore) {
2751 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2752 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2756 /* Add device to lcore ll */
2757 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2758 if (ll_dev == NULL) {
2759 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2760 vdev->ready = DEVICE_SAFE_REMOVE;
2761 destroy_device(dev);
2762 if (vdev->regions_hpa)
2763 rte_free(vdev->regions_hpa);
2767 ll_dev->vdev = vdev;
2768 vdev->coreid = core_add;
2770 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2772 /* Initialize device stats */
2773 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2775 /* Disable notifications (this app polls the rings itself). */
2776 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2777 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2778 lcore_info[vdev->coreid].lcore_ll->device_num++;
/* Mark the device live last, after all state above is in place. */
2779 dev->flags |= VIRTIO_DEV_RUNNING;
2781 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2787 * These callbacks allow devices to be added to and removed from a data core
2788 * when their configuration is fully complete.
2790 static const struct virtio_net_device_ops virtio_net_device_ops =
2792 .new_device = new_device,
2793 .destroy_device = destroy_device,
2797 * Statistics thread: wakes up periodically to print per-device TX/RX
 * counters when the user has enabled stats. (The function signature and
 * the enclosing loop header are elided in this excerpt.)
2803 struct virtio_net_data_ll *dev_ll;
2804 uint64_t tx_dropped, rx_dropped;
2805 uint64_t tx, tx_total, rx, rx_total;
/* ANSI escape sequences: clear screen, then move cursor to top-left. */
2807 const char clr[] = { 27, '[', '2', 'J', '\0' };
2808 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
/* enable_stats doubles as the refresh period in seconds. */
2811 sleep(enable_stats);
2813 /* Clear screen and move to top left */
2814 printf("%s%s", clr, top_left);
2816 printf("\nDevice statistics ====================================");
/* Walk the main device list and print one stats section per device. */
2818 dev_ll = ll_root_used;
2819 while (dev_ll != NULL) {
2820 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2821 tx_total = dev_statistics[device_fh].tx_total;
2822 tx = dev_statistics[device_fh].tx;
2823 tx_dropped = tx_total - tx;
/* RX counters are updated by multiple cores in the copy path, so they
 * are read atomically there; the zero-copy path uses plain counters.
 * NOTE(review): the zero_copy == 0 branch reading the atomics looks
 * inverted relative to that rationale -- confirm against the full file. */
2824 if (zero_copy == 0) {
2825 rx_total = rte_atomic64_read(
2826 &dev_statistics[device_fh].rx_total_atomic);
2827 rx = rte_atomic64_read(
2828 &dev_statistics[device_fh].rx_atomic);
2830 rx_total = dev_statistics[device_fh].rx_total;
2831 rx = dev_statistics[device_fh].rx;
2833 rx_dropped = rx_total - rx;
2835 printf("\nStatistics for device %"PRIu32" ------------------------------"
2836 "\nTX total: %"PRIu64""
2837 "\nTX dropped: %"PRIu64""
2838 "\nTX successful: %"PRIu64""
2839 "\nRX total: %"PRIu64""
2840 "\nRX dropped: %"PRIu64""
2841 "\nRX successful: %"PRIu64"",
2850 dev_ll = dev_ll->next;
2852 printf("\n======================================================\n");
/* Create one mempool + one ring pair in vpool_array[index] for the
 * zero-copy path. pool_name/ring_name are the rte object names; nb_mbuf
 * is the mbuf count. Exits the process on any creation failure. */
2857 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2858 char *ring_name, uint32_t nb_mbuf)
/* Data room per mbuf: one virtio descriptor's worth plus headroom. */
2860 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2861 vpool_array[index].pool
2862 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2863 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2864 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2865 rte_pktmbuf_init, NULL, socket, 0);
2866 if (vpool_array[index].pool != NULL) {
/* Single-producer/single-consumer ring sized to the next power of two
 * above nb_mbuf (rings need a power-of-two size, minus one usable slot). */
2867 vpool_array[index].ring
2868 = rte_ring_create(ring_name,
2869 rte_align32pow2(nb_mbuf + 1),
2870 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2871 if (likely(vpool_array[index].ring != NULL)) {
2872 LOG_DEBUG(VHOST_CONFIG,
2873 "in setup_mempool_tbl: mbuf count in "
2875 rte_mempool_count(vpool_array[index].pool));
2876 LOG_DEBUG(VHOST_CONFIG,
2877 "in setup_mempool_tbl: mbuf count in "
2879 rte_ring_count(vpool_array[index].ring));
2881 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2885 /* Need consider head room. */
2886 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2888 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2894 * Main function, does initialisation and calls the per-lcore functions. The CUSE
2895 * device is also registered here to handle the IOCTLs.
/* NOTE(review): lines are elided throughout this excerpt (variable
 * declarations, error checks, closing braces, and the final return). */
2898 main(int argc, char *argv[])
2900 struct rte_mempool *mbuf_pool = NULL;
2901 unsigned lcore_id, core_id = 0;
2902 unsigned nb_ports, valid_num_ports;
2906 static pthread_t tid;
/* init EAL; on failure the error check (elided) exits below. */
2909 ret = rte_eal_init(argc, argv);
2911 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2915 /* parse app arguments */
2916 ret = us_vhost_parse_args(argc, argv);
2918 rte_exit(EXIT_FAILURE, "Invalid argument\n");
/* Record the ids of every enabled lcore for later indexing. */
2920 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2921 if (rte_lcore_is_enabled(lcore_id))
2922 lcore_ids[core_id ++] = lcore_id;
2924 if (rte_lcore_count() > RTE_MAX_LCORE)
2925 rte_exit(EXIT_FAILURE,"Not enough cores\n");
2927 /* One lcore is the master; the rest are switching (data) cores. */
2928 num_switching_cores = rte_lcore_count()-1;
2930 /* Get the number of physical ports. */
2931 nb_ports = rte_eth_dev_count();
2932 if (nb_ports > RTE_MAX_ETHPORTS)
2933 nb_ports = RTE_MAX_ETHPORTS;
2936 * Update the global var NUM_PORTS and global array PORTS
2937 * and get value of var VALID_NUM_PORTS according to system ports number
2939 valid_num_ports = check_ports_num(nb_ports);
2941 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2942 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2943 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
/* Copy path: one shared mbuf pool serves every queue. */
2947 if (zero_copy == 0) {
2948 /* Create the mbuf pool. */
2949 mbuf_pool = rte_mempool_create(
2953 MBUF_SIZE, MBUF_CACHE_SIZE,
2954 sizeof(struct rte_pktmbuf_pool_private),
2955 rte_pktmbuf_pool_init, NULL,
2956 rte_pktmbuf_init, NULL,
2957 rte_socket_id(), 0);
2958 if (mbuf_pool == NULL)
2959 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2961 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2962 vpool_array[queue_id].pool = mbuf_pool;
2964 if (vm2vm_mode == VM2VM_HARDWARE) {
2965 /* Enable VT loop back to let L2 switch to do it. */
2966 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2967 LOG_DEBUG(VHOST_CONFIG,
2968 "Enable loop back for L2 switch in vmdq.\n");
/* Zero-copy path (else branch, guard elided): one dedicated
 * mempool/ring pair per RX queue and per TX queue. */
2972 char pool_name[RTE_MEMPOOL_NAMESIZE];
2973 char ring_name[RTE_MEMPOOL_NAMESIZE];
/* RX pools: sized for descriptors plus per-core cache and burst slack. */
2975 nb_mbuf = num_rx_descriptor
2976 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2977 + num_switching_cores * MAX_PKT_BURST;
2979 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2980 snprintf(pool_name, sizeof(pool_name),
2981 "rxmbuf_pool_%u", queue_id);
2982 snprintf(ring_name, sizeof(ring_name),
2983 "rxmbuf_ring_%u", queue_id);
2984 setup_mempool_tbl(rte_socket_id(), queue_id,
2985 pool_name, ring_name, nb_mbuf);
/* TX pools occupy the upper half of vpool_array (offset MAX_QUEUES). */
2988 nb_mbuf = num_tx_descriptor
2989 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2990 + num_switching_cores * MAX_PKT_BURST;
2992 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2993 snprintf(pool_name, sizeof(pool_name),
2994 "txmbuf_pool_%u", queue_id);
2995 snprintf(ring_name, sizeof(ring_name),
2996 "txmbuf_ring_%u", queue_id);
2997 setup_mempool_tbl(rte_socket_id(),
2998 (queue_id + MAX_QUEUES),
2999 pool_name, ring_name, nb_mbuf);
3002 if (vm2vm_mode == VM2VM_HARDWARE) {
3003 /* Enable VT loop back to let L2 switch to do it. */
3004 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3005 LOG_DEBUG(VHOST_CONFIG,
3006 "Enable loop back for L2 switch in vmdq.\n");
3009 /* Set log level. */
3010 rte_set_log_level(LOG_LEVEL);
3012 /* initialize all ports */
3013 for (portid = 0; portid < nb_ports; portid++) {
3014 /* skip ports that are not enabled */
3015 if ((enabled_port_mask & (1 << portid)) == 0) {
3016 RTE_LOG(INFO, VHOST_PORT,
3017 "Skipping disabled port %d\n", portid);
3020 if (port_init(portid) != 0)
3021 rte_exit(EXIT_FAILURE,
3022 "Cannot initialize network ports\n");
3025 /* Initialise all linked lists. */
3026 if (init_data_ll() == -1)
3027 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3029 /* Initialize device stats */
3030 memset(&dev_statistics, 0, sizeof(dev_statistics));
3032 /* Enable stats if the user option is set (guarding if elided). */
3034 pthread_create(&tid, NULL, (void*)print_stats, NULL );
3036 /* Launch all data cores. */
3037 if (zero_copy == 0) {
3038 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3039 rte_eal_remote_launch(switch_worker,
3040 mbuf_pool, lcore_id);
/* Zero-copy launch path (else branch, guard elided): first move every
 * free mbuf from each queue's mempool onto its ring. */
3043 uint32_t count_in_mempool, index, i;
3044 for (index = 0; index < 2*MAX_QUEUES; index++) {
3045 /* For all RX and TX queues. */
3047 = rte_mempool_count(vpool_array[index].pool);
3050 * Transfer all un-attached mbufs from vpool.pool
3053 for (i = 0; i < count_in_mempool; i++) {
3054 struct rte_mbuf *mbuf
3055 = __rte_mbuf_raw_alloc(
3056 vpool_array[index].pool);
3057 rte_ring_sp_enqueue(vpool_array[index].ring,
3061 LOG_DEBUG(VHOST_CONFIG,
3062 "in main: mbuf count in mempool at initial "
3063 "is: %d\n", count_in_mempool);
3064 LOG_DEBUG(VHOST_CONFIG,
3065 "in main: mbuf count in ring at initial is :"
3067 rte_ring_count(vpool_array[index].ring));
3070 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3071 rte_eal_remote_launch(switch_worker_zcp, NULL,
/* Mergeable RX buffers are not supported by the zero-copy data path. */
3076 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3078 /* Register CUSE device to handle IOCTLs. */
3079 ret = rte_vhost_driver_register((char *)&dev_basename);
3081 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3083 rte_vhost_driver_callback_register(&virtio_net_device_ops);
3085 /* Start CUSE session: blocks here servicing vhost requests. */
3086 rte_vhost_driver_session_start();