4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <arpa/inet.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
56 #define MAX_QUEUES 512
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
62 * Calculate the number of buffers needed per port
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
65 (num_switching_cores*MAX_PKT_BURST) + \
66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 (num_switching_cores*MBUF_CACHE_SIZE))
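/*
 * The total above accounts for mbufs resident in every RX ring, per-core
 * bursts in flight, mbufs held in per-core TX descriptor rings, and the
 * per-core mempool caches.
 */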
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE
73 * No frame data buffers allocated from the host are required for the zero copy
74 * implementation; the guest allocates the frame data buffers, and vhost
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP RTE_MBUF_DEFAULT_DATAROOM
78 #define MBUF_DATA_SIZE_ZCP RTE_MBUF_DEFAULT_BUF_SIZE
79 #define MBUF_CACHE_SIZE_ZCP 0
81 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
82 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
84 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
85 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
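/*
 * 0x2600 = 9728 bytes; used as max_rx_pkt_len when jumbo frames are
 * enabled through the --mergeable option (see us_vhost_parse_args()).
 */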
87 #define JUMBO_FRAME_MAX_SIZE 0x2600
89 /* State of virtio device. */
90 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_SAFE_REMOVE 2
94 /* Config_core_flag status definitions. */
95 #define REQUEST_DEV_REMOVAL 1
96 #define ACK_DEV_REMOVAL 0
98 /* Configurable number of RX/TX ring descriptors */
99 #define RTE_TEST_RX_DESC_DEFAULT 1024
100 #define RTE_TEST_TX_DESC_DEFAULT 512
103 * These two macros need refinement for the legacy and DPDK-based front ends:
104 * take the max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
105 * then adjust to a power of 2.
108 * For the legacy front end: 128 descriptors,
109 * half for the virtio headers, the other half for mbufs.
111 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
112 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */
114 /* Get first 4 bytes in mbuf headroom. */
115 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
116 + sizeof(struct rte_mbuf)))
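/*
 * The zero copy path uses this headroom word to stash the vring descriptor
 * index an mbuf is attached to (see attach_rxmbuf_zcp() and
 * txmbuf_clean_zcp()).
 */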
118 /* true if x is a power of 2 */
119 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
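/* For example, POWEROF2(64) is true and POWEROF2(48) is false; note that
 * POWEROF2(0) also evaluates true with this definition.
 */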
121 #define INVALID_PORT_ID 0xFF
123 /* Max number of devices. Limited by vmdq. */
124 #define MAX_DEVICES 64
126 /* Size of buffers used for snprintfs. */
127 #define MAX_PRINT_BUFF 6072
129 /* Maximum character device basename size. */
130 #define MAX_BASENAME_SZ 10
132 /* Maximum long option length for option parsing. */
133 #define MAX_LONG_OPT_SZ 64
135 /* Used to compare MAC addresses. */
136 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
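/*
 * ether_addr_cmp() loads both 6-byte MAC addresses as 64-bit words, so this
 * mask limits the comparison to the six address bytes.
 */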
138 /* Number of descriptors per cacheline. */
139 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
141 #define MBUF_EXT_MEM(mb) (rte_mbuf_from_indirect(mb) != (mb))
143 /* mask of enabled ports */
144 static uint32_t enabled_port_mask = 0;
146 /* Promiscuous mode */
147 static uint32_t promiscuous;
149 /*Number of switching cores enabled*/
150 static uint32_t num_switching_cores = 0;
152 /* number of devices/queues to support*/
153 static uint32_t num_queues = 0;
154 static uint32_t num_devices;
157 * Enable zero copy: packet buffers are DMA'd directly to the HW descriptors;
158 * disabled by default.
160 static uint32_t zero_copy;
161 static int mergeable;
163 /* Do VLAN strip on the host, enabled by default */
164 static uint32_t vlan_strip = 1;
166 /* Number of descriptors to use */
167 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
168 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
170 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
171 #define MAX_RING_DESC 4096
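/*
 * Zero copy buffer pools: each vpool pairs a mempool with a ring of detached
 * mbufs. Entries [0, MAX_QUEUES) back the RX path and entries
 * [MAX_QUEUES, 2*MAX_QUEUES) back the TX path (see attach_rxmbuf_zcp() and
 * virtio_tx_route_zcp()).
 */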
174 struct rte_mempool *pool;
175 struct rte_ring *ring;
177 } vpool_array[MAX_QUEUES+MAX_QUEUES];
179 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
186 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
188 /* The type of host physical address translated from guest physical address. */
190 PHYS_ADDR_CONTINUOUS = 0,
191 PHYS_ADDR_CROSS_SUBREG = 1,
192 PHYS_ADDR_INVALID = 2,
197 static uint32_t enable_stats = 0;
198 /* Enable retries on RX. */
199 static uint32_t enable_retry = 1;
200 /* Specify timeout (in microseconds) between retries on RX. */
201 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
202 /* Specify the number of retries on RX. */
203 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
205 /* Character device basename. Can be set by user. */
206 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
208 /* Empty VMDQ configuration structure. Filled in programmatically. */
209 static struct rte_eth_conf vmdq_conf_default = {
211 .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
213 .header_split = 0, /**< Header Split disabled */
214 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
215 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
217 * This is necessary for 1G NICs such as the I350;
218 * it fixes a bug where IPv4 forwarding in the guest could not
219 * forward packets from one virtio device to another virtio device.
221 .hw_vlan_strip = 1, /**< VLAN strip enabled. */
222 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */
223 .hw_strip_crc = 0, /**< CRC stripped by hardware */
227 .mq_mode = ETH_MQ_TX_NONE,
231 * should be overridden separately in code with
235 .nb_queue_pools = ETH_8_POOLS,
236 .enable_default_pool = 0,
239 .pool_map = {{0, 0},},
244 static unsigned lcore_ids[RTE_MAX_LCORE];
245 static uint8_t ports[RTE_MAX_ETHPORTS];
246 static unsigned num_ports = 0; /**< The number of ports specified in command line */
247 static uint16_t num_pf_queues, num_vmdq_queues;
248 static uint16_t vmdq_pool_base, vmdq_queue_base;
249 static uint16_t queues_per_pool;
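/*
 * Default VLAN tag assigned to packets destined for the external network
 * when VM2VM hardware forwarding is enabled (see virtio_tx_route_zcp()).
 */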
251 static const uint16_t external_pkt_default_vlan_tag = 2000;
252 const uint16_t vlan_tags[] = {
253 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
254 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
255 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
256 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
257 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
258 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
259 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
260 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
263 /* ethernet addresses of ports */
264 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
266 /* heads for the main used and free linked lists for the data path. */
267 static struct virtio_net_data_ll *ll_root_used = NULL;
268 static struct virtio_net_data_ll *ll_root_free = NULL;
270 /* Array of data core structures containing information on individual core linked lists. */
271 static struct lcore_info lcore_info[RTE_MAX_LCORE];
273 /* Used for queueing bursts of TX packets. */
277 struct rte_mbuf *m_table[MAX_PKT_BURST];
280 /* TX queue for each data core. */
281 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
283 /* TX queue for each virtio device for zero copy. */
284 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
286 /* Vlan header struct used to insert vlan tags on TX. */
288 unsigned char h_dest[ETH_ALEN];
289 unsigned char h_source[ETH_ALEN];
292 __be16 h_vlan_encapsulated_proto;
297 uint8_t version_ihl; /**< version and header length */
298 uint8_t type_of_service; /**< type of service */
299 uint16_t total_length; /**< length of packet */
300 uint16_t packet_id; /**< packet ID */
301 uint16_t fragment_offset; /**< fragmentation offset */
302 uint8_t time_to_live; /**< time to live */
303 uint8_t next_proto_id; /**< protocol ID */
304 uint16_t hdr_checksum; /**< header checksum */
305 uint32_t src_addr; /**< source address */
306 uint32_t dst_addr; /**< destination address */
307 } __attribute__((__packed__));
309 /* Header lengths. */
311 #define VLAN_ETH_HLEN 18
313 /* Per-device statistics struct */
314 struct device_statistics {
316 rte_atomic64_t rx_total_atomic;
319 rte_atomic64_t rx_atomic;
321 } __rte_cache_aligned;
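/*
 * Per-device statistics, indexed by the vhost device_fh; updated on the data
 * path and reported periodically when the --stats option is enabled.
 */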
322 struct device_statistics dev_statistics[MAX_DEVICES];
325 * Builds up the correct configuration for VMDQ VLAN pool map
326 * according to the pool & queue limits.
329 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
331 struct rte_eth_vmdq_rx_conf conf;
332 struct rte_eth_vmdq_rx_conf *def_conf =
333 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
336 memset(&conf, 0, sizeof(conf));
337 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
338 conf.nb_pool_maps = num_devices;
339 conf.enable_loop_back = def_conf->enable_loop_back;
340 conf.rx_mode = def_conf->rx_mode;
342 for (i = 0; i < conf.nb_pool_maps; i++) {
343 conf.pool_map[i].vlan_id = vlan_tags[ i ];
344 conf.pool_map[i].pools = (1UL << i);
347 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
348 (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
349 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
354 * Validate the device number against the max pool number obtained from
355 * dev_info. If the device number is invalid, print an error message and
356 * return -1. Each device must have its own pool.
359 validate_num_devices(uint32_t max_nb_devices)
361 if (num_devices > max_nb_devices) {
362 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
369 * Initialises a given port using global settings and with the rx buffers
370 * coming from the mbuf_pool passed as a parameter.
373 port_init(uint8_t port)
375 struct rte_eth_dev_info dev_info;
376 struct rte_eth_conf port_conf;
377 struct rte_eth_rxconf *rxconf;
378 struct rte_eth_txconf *txconf;
379 int16_t rx_rings, tx_rings;
380 uint16_t rx_ring_size, tx_ring_size;
384 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
385 rte_eth_dev_info_get (port, &dev_info);
387 if (dev_info.max_rx_queues > MAX_QUEUES) {
388 rte_exit(EXIT_FAILURE,
389 "please define MAX_QUEUES no less than %u in %s\n",
390 dev_info.max_rx_queues, __FILE__);
393 rxconf = &dev_info.default_rxconf;
394 txconf = &dev_info.default_txconf;
395 rxconf->rx_drop_en = 1;
397 /* Enable vlan offload */
398 txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
401 * Zero copy defers queue RX/TX start to the time when guest
402 * finishes its startup and packet buffers from that guest are
406 rxconf->rx_deferred_start = 1;
407 rxconf->rx_drop_en = 0;
408 txconf->tx_deferred_start = 1;
411 /*configure the number of supported virtio devices based on VMDQ limits */
412 num_devices = dev_info.max_vmdq_pools;
415 rx_ring_size = num_rx_descriptor;
416 tx_ring_size = num_tx_descriptor;
417 tx_rings = dev_info.max_tx_queues;
419 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
420 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
421 tx_rings = (uint16_t)rte_lcore_count();
424 retval = validate_num_devices(MAX_DEVICES);
428 /* Get port configuration. */
429 retval = get_eth_conf(&port_conf, num_devices);
432 /* NIC queues are divided into pf queues and vmdq queues. */
433 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
434 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
435 num_vmdq_queues = num_devices * queues_per_pool;
436 num_queues = num_pf_queues + num_vmdq_queues;
437 vmdq_queue_base = dev_info.vmdq_queue_base;
438 vmdq_pool_base = dev_info.vmdq_pool_base;
439 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
440 num_pf_queues, num_devices, queues_per_pool);
442 if (port >= rte_eth_dev_count()) return -1;
444 rx_rings = (uint16_t)dev_info.max_rx_queues;
445 /* Configure ethernet device. */
446 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
450 /* Setup the queues. */
451 for (q = 0; q < rx_rings; q ++) {
452 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
453 rte_eth_dev_socket_id(port),
455 vpool_array[q].pool);
459 for (q = 0; q < tx_rings; q ++) {
460 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
461 rte_eth_dev_socket_id(port),
467 /* Start the device. */
468 retval = rte_eth_dev_start(port);
470 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
475 rte_eth_promiscuous_enable(port);
477 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
482 vmdq_ports_eth_addr[port].addr_bytes[0],
483 vmdq_ports_eth_addr[port].addr_bytes[1],
484 vmdq_ports_eth_addr[port].addr_bytes[2],
485 vmdq_ports_eth_addr[port].addr_bytes[3],
486 vmdq_ports_eth_addr[port].addr_bytes[4],
487 vmdq_ports_eth_addr[port].addr_bytes[5]);
493 * Set character device basename.
496 us_vhost_parse_basename(const char *q_arg)
498 /* Copy the basename string; reject names that do not fit in the buffer. */
500 if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
503 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
509 * Parse the portmask provided at run time.
512 parse_portmask(const char *portmask)
519 /* parse hexadecimal string */
520 pm = strtoul(portmask, &end, 16);
521 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
532 * Parse num options at run time.
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
542 /* parse unsigned int string */
543 num = strtoul(q_arg, &end, 10);
544 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
547 if (num > max_valid_value)
558 us_vhost_usage(const char *prgname)
560 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
562 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563 " --dev-basename <name>\n"
565 " -p PORTMASK: Set mask for ports to be used by application\n"
566 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
568 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
569 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
570 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571 " --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
572 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
573 " --dev-basename: The basename to be used for the character device.\n"
574 " --zero-copy [0|1]: disable(default)/enable rx/tx "
576 " --rx-desc-num [0-N]: the number of descriptors on rx, "
577 "used only when zero copy is enabled.\n"
578 " --tx-desc-num [0-N]: the number of descriptors on tx, "
579 "used only when zero copy is enabled.\n",
584 * Parse the arguments given in the command line of the application.
587 us_vhost_parse_args(int argc, char **argv)
592 const char *prgname = argv[0];
593 static struct option long_option[] = {
594 {"vm2vm", required_argument, NULL, 0},
595 {"rx-retry", required_argument, NULL, 0},
596 {"rx-retry-delay", required_argument, NULL, 0},
597 {"rx-retry-num", required_argument, NULL, 0},
598 {"mergeable", required_argument, NULL, 0},
599 {"vlan-strip", required_argument, NULL, 0},
600 {"stats", required_argument, NULL, 0},
601 {"dev-basename", required_argument, NULL, 0},
602 {"zero-copy", required_argument, NULL, 0},
603 {"rx-desc-num", required_argument, NULL, 0},
604 {"tx-desc-num", required_argument, NULL, 0},
608 /* Parse command line */
609 while ((opt = getopt_long(argc, argv, "p:P",
610 long_option, &option_index)) != EOF) {
614 enabled_port_mask = parse_portmask(optarg);
615 if (enabled_port_mask == 0) {
616 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
617 us_vhost_usage(prgname);
624 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
625 ETH_VMDQ_ACCEPT_BROADCAST |
626 ETH_VMDQ_ACCEPT_MULTICAST;
627 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
632 /* Enable/disable vm2vm comms. */
633 if (!strncmp(long_option[option_index].name, "vm2vm",
635 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
637 RTE_LOG(INFO, VHOST_CONFIG,
638 "Invalid argument for "
640 us_vhost_usage(prgname);
643 vm2vm_mode = (vm2vm_type)ret;
647 /* Enable/disable retries on RX. */
648 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
649 ret = parse_num_opt(optarg, 1);
651 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
652 us_vhost_usage(prgname);
659 /* Specify the retry delay time (in microseconds) on RX. */
660 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
661 ret = parse_num_opt(optarg, INT32_MAX);
663 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
664 us_vhost_usage(prgname);
667 burst_rx_delay_time = ret;
671 /* Specify the retries number on RX. */
672 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
673 ret = parse_num_opt(optarg, INT32_MAX);
675 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
676 us_vhost_usage(prgname);
679 burst_rx_retry_num = ret;
683 /* Enable/disable RX mergeable buffers. */
684 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
685 ret = parse_num_opt(optarg, 1);
687 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
688 us_vhost_usage(prgname);
693 vmdq_conf_default.rxmode.jumbo_frame = 1;
694 vmdq_conf_default.rxmode.max_rx_pkt_len
695 = JUMBO_FRAME_MAX_SIZE;
700 /* Enable/disable RX VLAN strip on host. */
701 if (!strncmp(long_option[option_index].name,
702 "vlan-strip", MAX_LONG_OPT_SZ)) {
703 ret = parse_num_opt(optarg, 1);
705 RTE_LOG(INFO, VHOST_CONFIG,
706 "Invalid argument for VLAN strip [0|1]\n");
707 us_vhost_usage(prgname);
711 vmdq_conf_default.rxmode.hw_vlan_strip =
716 /* Enable/disable stats. */
717 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
718 ret = parse_num_opt(optarg, INT32_MAX);
720 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
721 us_vhost_usage(prgname);
728 /* Set character device basename. */
729 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
730 if (us_vhost_parse_basename(optarg) == -1) {
731 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
732 us_vhost_usage(prgname);
737 /* Enable/disable rx/tx zero copy. */
738 if (!strncmp(long_option[option_index].name,
739 "zero-copy", MAX_LONG_OPT_SZ)) {
740 ret = parse_num_opt(optarg, 1);
742 RTE_LOG(INFO, VHOST_CONFIG,
744 " for zero-copy [0|1]\n");
745 us_vhost_usage(prgname);
751 /* Specify the descriptor number on RX. */
752 if (!strncmp(long_option[option_index].name,
753 "rx-desc-num", MAX_LONG_OPT_SZ)) {
754 ret = parse_num_opt(optarg, MAX_RING_DESC);
755 if ((ret == -1) || (!POWEROF2(ret))) {
756 RTE_LOG(INFO, VHOST_CONFIG,
757 "Invalid argument for rx-desc-num[0-N],"
758 "power of 2 required.\n");
759 us_vhost_usage(prgname);
762 num_rx_descriptor = ret;
766 /* Specify the descriptor number on TX. */
767 if (!strncmp(long_option[option_index].name,
768 "tx-desc-num", MAX_LONG_OPT_SZ)) {
769 ret = parse_num_opt(optarg, MAX_RING_DESC);
770 if ((ret == -1) || (!POWEROF2(ret))) {
771 RTE_LOG(INFO, VHOST_CONFIG,
772 "Invalid argument for tx-desc-num [0-N],"
773 "power of 2 required.\n");
774 us_vhost_usage(prgname);
777 num_tx_descriptor = ret;
783 /* Invalid option - print options. */
785 us_vhost_usage(prgname);
790 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
791 if (enabled_port_mask & (1 << i))
792 ports[num_ports++] = (uint8_t)i;
795 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
796 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
797 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
801 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
802 RTE_LOG(INFO, VHOST_PORT,
803 "Vhost zero copy doesn't support software vm2vm,"
804 "please specify 'vm2vm 2' to use hardware vm2vm.\n");
808 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
809 RTE_LOG(INFO, VHOST_PORT,
810 "Vhost zero copy doesn't support jumbo frame,"
811 "please specify '--mergeable 0' to disable the "
812 "mergeable feature.\n");
820 * Update the global variable num_ports and the ports[] array according to the number of system ports
821 * and return the number of valid ports.
823 static unsigned check_ports_num(unsigned nb_ports)
825 unsigned valid_num_ports = num_ports;
828 if (num_ports > nb_ports) {
829 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
830 num_ports, nb_ports);
831 num_ports = nb_ports;
834 for (portid = 0; portid < num_ports; portid ++) {
835 if (ports[portid] >= nb_ports) {
836 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
837 ports[portid], (nb_ports - 1));
838 ports[portid] = INVALID_PORT_ID;
842 return valid_num_ports;
846 * Macro to print out packet contents. Wrapped in debug define so that the
847 * data path is not affected when debug is disabled.
850 #define PRINT_PACKET(device, addr, size, header) do { \
851 char *pkt_addr = (char*)(addr); \
852 unsigned int index; \
853 char packet[MAX_PRINT_BUFF]; \
856 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
858 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
859 for (index = 0; index < (size); index++) { \
860 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
861 "%02hhx ", pkt_addr[index]); \
863 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
865 LOG_DEBUG(VHOST_DATA, "%s", packet); \
868 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
872 * Function to convert guest physical addresses to vhost physical addresses.
873 * This is used to convert virtio buffer addresses.
875 static inline uint64_t __attribute__((always_inline))
876 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
877 uint32_t buf_len, hpa_type *addr_type)
879 struct virtio_memory_regions_hpa *region;
881 uint64_t vhost_pa = 0;
883 *addr_type = PHYS_ADDR_INVALID;
885 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
886 region = &vdev->regions_hpa[regionidx];
887 if ((guest_pa >= region->guest_phys_address) &&
888 (guest_pa <= region->guest_phys_address_end)) {
889 vhost_pa = region->host_phys_addr_offset + guest_pa;
890 if (likely((guest_pa + buf_len - 1)
891 <= region->guest_phys_address_end))
892 *addr_type = PHYS_ADDR_CONTINUOUS;
894 *addr_type = PHYS_ADDR_CROSS_SUBREG;
899 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
900 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
901 (void *)(uintptr_t)vhost_pa);
907 * Compares a packet destination MAC address to a device MAC address.
909 static inline int __attribute__((always_inline))
910 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
912 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
916 * This function learns the MAC address of the device and registers it along with a
917 * VLAN tag with a VMDQ pool.
920 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
922 struct ether_hdr *pkt_hdr;
923 struct virtio_net_data_ll *dev_ll;
924 struct virtio_net *dev = vdev->dev;
927 /* Learn MAC address of guest device from packet */
928 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
930 dev_ll = ll_root_used;
932 while (dev_ll != NULL) {
933 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
934 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
937 dev_ll = dev_ll->next;
940 for (i = 0; i < ETHER_ADDR_LEN; i++)
941 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
943 /* vlan_tag currently uses the device_id. */
944 vdev->vlan_tag = vlan_tags[dev->device_fh];
946 /* Print out VMDQ registration info. */
947 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
949 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
950 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
951 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
954 /* Register the MAC address. */
955 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
956 (uint32_t)dev->device_fh + vmdq_pool_base);
958 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
961 /* Enable stripping of the vlan tag as we handle routing. */
963 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
964 (uint16_t)vdev->vmdq_rx_q, 1);
966 /* Set device as ready for RX. */
967 vdev->ready = DEVICE_RX;
973 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
974 * queue before disabling RX on the device.
977 unlink_vmdq(struct vhost_dev *vdev)
981 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
983 if (vdev->ready == DEVICE_RX) {
984 /*clear MAC and VLAN settings*/
985 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
986 for (i = 0; i < 6; i++)
987 vdev->mac_address.addr_bytes[i] = 0;
991 /*Clear out the receive buffers*/
992 rx_count = rte_eth_rx_burst(ports[0],
993 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
996 for (i = 0; i < rx_count; i++)
997 rte_pktmbuf_free(pkts_burst[i]);
999 rx_count = rte_eth_rx_burst(ports[0],
1000 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1003 vdev->ready = DEVICE_MAC_LEARNING;
1008 * Check if the packet destination MAC address is for a local device. If so then put
1009 * the packet on that device's RX queue. If not then return.
1011 static inline int __attribute__((always_inline))
1012 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1014 struct virtio_net_data_ll *dev_ll;
1015 struct ether_hdr *pkt_hdr;
1017 struct virtio_net *dev = vdev->dev;
1018 struct virtio_net *tdev; /* destination virtio device */
1020 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1022 /*get the used devices list*/
1023 dev_ll = ll_root_used;
1025 while (dev_ll != NULL) {
1026 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1027 &dev_ll->vdev->mac_address)) {
1029 /* Drop the packet if the TX packet is destined for the TX device. */
1030 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1031 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1035 tdev = dev_ll->vdev->dev;
1038 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1040 if (unlikely(dev_ll->vdev->remove)) {
1041 /*drop the packet if the device is marked for removal*/
1042 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1044 /*send the packet to the local virtio device*/
1045 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1048 &dev_statistics[tdev->device_fh].rx_total_atomic,
1051 &dev_statistics[tdev->device_fh].rx_atomic,
1053 dev_statistics[tdev->device_fh].tx_total++;
1054 dev_statistics[tdev->device_fh].tx += ret;
1060 dev_ll = dev_ll->next;
1067 * Check if the destination MAC of a packet is that of a local VM,
1068 * and if so, get its VLAN tag and offset.
1070 static inline int __attribute__((always_inline))
1071 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1072 uint32_t *offset, uint16_t *vlan_tag)
1074 struct virtio_net_data_ll *dev_ll = ll_root_used;
1075 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1077 while (dev_ll != NULL) {
1078 if ((dev_ll->vdev->ready == DEVICE_RX)
1079 && ether_addr_cmp(&(pkt_hdr->d_addr),
1080 &dev_ll->vdev->mac_address)) {
1082 * Drop the packet if the TX packet is
1083 * destined for the TX device.
1085 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1086 LOG_DEBUG(VHOST_DATA,
1087 "(%"PRIu64") TX: Source and destination"
1088 " MAC addresses are the same. Dropping "
1090 dev_ll->vdev->dev->device_fh);
1095 * HW VLAN strip will reduce the packet length
1096 * by the length of the VLAN tag, so the packet
1097 * length needs to be restored by adding it back.
1099 *offset = VLAN_HLEN;
1102 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1104 LOG_DEBUG(VHOST_DATA,
1105 "(%"PRIu64") TX: pkt to local VM device id:"
1106 "(%"PRIu64") vlan tag: %d.\n",
1107 dev->device_fh, dev_ll->vdev->dev->device_fh,
1112 dev_ll = dev_ll->next;
1118 * This function routes the TX packet to the correct interface. This may be a local device
1119 * or the physical port.
1121 static inline void __attribute__((always_inline))
1122 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1124 struct mbuf_table *tx_q;
1125 struct rte_mbuf **m_table;
1126 unsigned len, ret, offset = 0;
1127 const uint16_t lcore_id = rte_lcore_id();
1128 struct virtio_net *dev = vdev->dev;
1129 struct ether_hdr *nh;
1131 /*check if destination is local VM*/
1132 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1133 rte_pktmbuf_free(m);
1137 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1138 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1139 rte_pktmbuf_free(m);
1144 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1146 /*Add packet to the port tx queue*/
1147 tx_q = &lcore_tx_queue[lcore_id];
1150 nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1151 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1152 /* Guest has inserted the vlan tag. */
1153 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1154 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1155 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1156 (vh->vlan_tci != vlan_tag_be))
1157 vh->vlan_tci = vlan_tag_be;
1159 m->ol_flags = PKT_TX_VLAN_PKT;
1162 * Find the right segment to adjust the data length when the offset is
1163 * bigger than the tailroom size.
1165 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1166 if (likely(offset <= rte_pktmbuf_tailroom(m)))
1167 m->data_len += offset;
1169 struct rte_mbuf *seg = m;
1171 while ((seg->next != NULL) &&
1172 (offset > rte_pktmbuf_tailroom(seg)))
1175 seg->data_len += offset;
1177 m->pkt_len += offset;
1180 m->vlan_tci = vlan_tag;
1183 tx_q->m_table[len] = m;
1186 dev_statistics[dev->device_fh].tx_total++;
1187 dev_statistics[dev->device_fh].tx++;
1190 if (unlikely(len == MAX_PKT_BURST)) {
1191 m_table = (struct rte_mbuf **)tx_q->m_table;
1192 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1193 /* Free any buffers not handled by TX and update the port stats. */
1194 if (unlikely(ret < len)) {
1196 rte_pktmbuf_free(m_table[ret]);
1197 } while (++ret < len);
1207 * This function is called by each data core. It handles all RX/TX registered with the
1208 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1209 * with all devices in the main linked list.
1212 switch_worker(__attribute__((unused)) void *arg)
1214 struct rte_mempool *mbuf_pool = arg;
1215 struct virtio_net *dev = NULL;
1216 struct vhost_dev *vdev = NULL;
1217 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1218 struct virtio_net_data_ll *dev_ll;
1219 struct mbuf_table *tx_q;
1220 volatile struct lcore_ll_info *lcore_ll;
1221 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1222 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1224 const uint16_t lcore_id = rte_lcore_id();
1225 const uint16_t num_cores = (uint16_t)rte_lcore_count();
1226 uint16_t rx_count = 0;
1230 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1231 lcore_ll = lcore_info[lcore_id].lcore_ll;
1234 tx_q = &lcore_tx_queue[lcore_id];
1235 for (i = 0; i < num_cores; i ++) {
1236 if (lcore_ids[i] == lcore_id) {
1243 cur_tsc = rte_rdtsc();
1245 * TX burst queue drain
1247 diff_tsc = cur_tsc - prev_tsc;
1248 if (unlikely(diff_tsc > drain_tsc)) {
1251 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1253 /*Tx any packets in the queue*/
1254 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1255 (struct rte_mbuf **)tx_q->m_table,
1256 (uint16_t)tx_q->len);
1257 if (unlikely(ret < tx_q->len)) {
1259 rte_pktmbuf_free(tx_q->m_table[ret]);
1260 } while (++ret < tx_q->len);
1270 rte_prefetch0(lcore_ll->ll_root_used);
1272 * Inform the configuration core that we have exited the linked list and that no devices are
1273 * in use if requested.
1275 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1276 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1281 dev_ll = lcore_ll->ll_root_used;
1283 while (dev_ll != NULL) {
1284 /*get virtio device ID*/
1285 vdev = dev_ll->vdev;
1288 if (unlikely(vdev->remove)) {
1289 dev_ll = dev_ll->next;
1291 vdev->ready = DEVICE_SAFE_REMOVE;
1294 if (likely(vdev->ready == DEVICE_RX)) {
1296 rx_count = rte_eth_rx_burst(ports[0],
1297 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1301 * If retry is enabled and the queue is full, then we wait and retry to avoid packet loss.
1302 * Here MAX_PKT_BURST must be less than the virtio queue size.
1304 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1305 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1306 rte_delay_us(burst_rx_delay_time);
1307 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1311 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1314 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1317 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1319 while (likely(rx_count)) {
1321 rte_pktmbuf_free(pkts_burst[rx_count]);
1327 if (likely(!vdev->remove)) {
1328 /* Handle guest TX*/
1329 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1330 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1331 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1332 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1334 rte_pktmbuf_free(pkts_burst[--tx_count]);
1338 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1341 /*move to the next device in the list*/
1342 dev_ll = dev_ll->next;
1350 * This function gets the number of available ring entries for zero copy RX.
1351 * Only one thread will call this function for a particular virtio device,
1352 * so it is designed as a non-thread-safe function.
1354 static inline uint32_t __attribute__((always_inline))
1355 get_available_ring_num_zcp(struct virtio_net *dev)
1357 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1360 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1361 return (uint32_t)(avail_idx - vq->last_used_idx_res);
1365 * This function gets available ring indexes for zero copy RX;
1366 * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
1367 * Only one thread will call this function for a particular virtio device,
1368 * so it is designed as a non-thread-safe function.
1370 static inline uint32_t __attribute__((always_inline))
1371 get_available_ring_index_zcp(struct virtio_net *dev,
1372 uint16_t *res_base_idx, uint32_t count)
1374 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1377 uint16_t free_entries;
1379 *res_base_idx = vq->last_used_idx_res;
1380 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1381 free_entries = (avail_idx - *res_base_idx);
1383 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1385 "res base idx:%d, free entries:%d\n",
1386 dev->device_fh, avail_idx, *res_base_idx,
1390 * If retry is enabled and the queue is full then we wait
1391 * and retry to avoid packet loss.
1393 if (enable_retry && unlikely(count > free_entries)) {
1394 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1395 rte_delay_us(burst_rx_delay_time);
1396 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1397 free_entries = (avail_idx - *res_base_idx);
1398 if (count <= free_entries)
1403 /*check that we have enough buffers*/
1404 if (unlikely(count > free_entries))
1405 count = free_entries;
1407 if (unlikely(count == 0)) {
1408 LOG_DEBUG(VHOST_DATA,
1409 "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1410 "avail idx: %d, res base idx:%d, free entries:%d\n",
1411 dev->device_fh, avail_idx,
1412 *res_base_idx, free_entries);
1416 vq->last_used_idx_res = *res_base_idx + count;
1422 * This function puts a descriptor back on the used list.
1424 static inline void __attribute__((always_inline))
1425 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1427 uint16_t res_cur_idx = vq->last_used_idx;
1428 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1429 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1430 rte_compiler_barrier();
1431 *(volatile uint16_t *)&vq->used->idx += 1;
1432 vq->last_used_idx += 1;
1434 /* Kick the guest if necessary. */
1435 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1436 eventfd_write((int)vq->callfd, 1);
1440 * This function gets an available descriptor from the virtio vring and an unattached
1441 * mbuf from vpool->ring, and then attaches them together. It needs to adjust the offset
1442 * for buff_addr and phys_addr according to the PMD implementation, otherwise the
1443 * frame data may be put in the wrong location in the mbuf.
1445 static inline void __attribute__((always_inline))
1446 attach_rxmbuf_zcp(struct virtio_net *dev)
1448 uint16_t res_base_idx, desc_idx;
1449 uint64_t buff_addr, phys_addr;
1450 struct vhost_virtqueue *vq;
1451 struct vring_desc *desc;
1452 struct rte_mbuf *mbuf = NULL;
1453 struct vpool *vpool;
1455 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1457 vpool = &vpool_array[vdev->vmdq_rx_q];
1458 vq = dev->virtqueue[VIRTIO_RXQ];
1461 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1464 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1466 desc = &vq->desc[desc_idx];
1467 if (desc->flags & VRING_DESC_F_NEXT) {
1468 desc = &vq->desc[desc->next];
1469 buff_addr = gpa_to_vva(dev, desc->addr);
1470 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1473 buff_addr = gpa_to_vva(dev,
1474 desc->addr + vq->vhost_hlen);
1475 phys_addr = gpa_to_hpa(vdev,
1476 desc->addr + vq->vhost_hlen,
1477 desc->len, &addr_type);
1480 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1481 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1482 " address found when attaching RX frame buffer"
1483 " address!\n", dev->device_fh);
1484 put_desc_to_used_list_zcp(vq, desc_idx);
1489 * Check if the frame buffer address from guest crosses
1490 * sub-region or not.
1492 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1493 RTE_LOG(ERR, VHOST_DATA,
1494 "(%"PRIu64") Frame buffer address cross "
1495 "sub-regioin found when attaching RX frame "
1496 "buffer address!\n",
1498 put_desc_to_used_list_zcp(vq, desc_idx);
1501 } while (unlikely(phys_addr == 0));
1503 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1504 if (unlikely(mbuf == NULL)) {
1505 LOG_DEBUG(VHOST_DATA,
1506 "(%"PRIu64") in attach_rxmbuf_zcp: "
1507 "ring_sc_dequeue fail.\n",
1509 put_desc_to_used_list_zcp(vq, desc_idx);
1513 if (unlikely(vpool->buf_size > desc->len)) {
1514 LOG_DEBUG(VHOST_DATA,
1515 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1516 "length(%d) of descriptor idx: %d less than room "
1517 "size required: %d\n",
1518 dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1519 put_desc_to_used_list_zcp(vq, desc_idx);
1520 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1524 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1525 mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1526 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1527 mbuf->data_len = desc->len;
1528 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1530 LOG_DEBUG(VHOST_DATA,
1531 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1532 "descriptor idx:%d\n",
1533 dev->device_fh, res_base_idx, desc_idx);
1535 __rte_mbuf_raw_free(mbuf);
1541 * Detach an attached packet mbuf -
1542 * - restore original mbuf address and length values.
1543 * - reset pktmbuf data and data_len to their default values.
1544 * All other fields of the given packet mbuf will be left intact.
1547 * The attached packet mbuf.
1549 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1551 const struct rte_mempool *mp = m->pool;
1552 void *buf = rte_mbuf_to_baddr(m);
1554 uint32_t buf_len = mp->elt_size - sizeof(*m);
1555 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1558 m->buf_len = (uint16_t)buf_len;
1560 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1561 RTE_PKTMBUF_HEADROOM : m->buf_len;
1562 m->data_off = buf_ofs;
1568 * This function is called after packets have been transmitted. It fetches mbufs
1569 * from vpool->pool, detaches them and puts them into vpool->ring. It also updates the
1570 * used index and kicks the guest if necessary.
1572 static inline uint32_t __attribute__((always_inline))
1573 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1575 struct rte_mbuf *mbuf;
1576 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1577 uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1579 uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1581 LOG_DEBUG(VHOST_DATA,
1582 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1584 dev->device_fh, mbuf_count);
1585 LOG_DEBUG(VHOST_DATA,
1586 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1588 dev->device_fh, rte_ring_count(vpool->ring));
1590 for (index = 0; index < mbuf_count; index++) {
1591 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1592 if (likely(MBUF_EXT_MEM(mbuf)))
1593 pktmbuf_detach_zcp(mbuf);
1594 rte_ring_sp_enqueue(vpool->ring, mbuf);
1596 /* Update used index buffer information. */
1597 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1598 vq->used->ring[used_idx].len = 0;
1600 used_idx = (used_idx + 1) & (vq->size - 1);
1603 LOG_DEBUG(VHOST_DATA,
1604 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1606 dev->device_fh, rte_mempool_count(vpool->pool));
1607 LOG_DEBUG(VHOST_DATA,
1608 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1610 dev->device_fh, rte_ring_count(vpool->ring));
1611 LOG_DEBUG(VHOST_DATA,
1612 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1613 "vq->last_used_idx:%d\n",
1614 dev->device_fh, vq->last_used_idx);
1616 vq->last_used_idx += mbuf_count;
1618 LOG_DEBUG(VHOST_DATA,
1619 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1620 "vq->last_used_idx:%d\n",
1621 dev->device_fh, vq->last_used_idx);
1623 rte_compiler_barrier();
1625 *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1627 /* Kick guest if required. */
1628 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1629 eventfd_write((int)vq->callfd, 1);
1635 * This function is called when a virtio device is destroyed.
1636 * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1638 static void mbuf_destroy_zcp(struct vpool *vpool)
1640 struct rte_mbuf *mbuf = NULL;
1641 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1643 LOG_DEBUG(VHOST_CONFIG,
1644 "in mbuf_destroy_zcp: mbuf count in mempool before "
1645 "mbuf_destroy_zcp is: %d\n",
1647 LOG_DEBUG(VHOST_CONFIG,
1648 "in mbuf_destroy_zcp: mbuf count in ring before "
1649 "mbuf_destroy_zcp is : %d\n",
1650 rte_ring_count(vpool->ring));
1652 for (index = 0; index < mbuf_count; index++) {
1653 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1654 if (likely(mbuf != NULL)) {
1655 if (likely(MBUF_EXT_MEM(mbuf)))
1656 pktmbuf_detach_zcp(mbuf);
1657 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1661 LOG_DEBUG(VHOST_CONFIG,
1662 "in mbuf_destroy_zcp: mbuf count in mempool after "
1663 "mbuf_destroy_zcp is: %d\n",
1664 rte_mempool_count(vpool->pool));
1665 LOG_DEBUG(VHOST_CONFIG,
1666 "in mbuf_destroy_zcp: mbuf count in ring after "
1667 "mbuf_destroy_zcp is : %d\n",
1668 rte_ring_count(vpool->ring));
1672 * This function updates the used ring and counters (zero copy RX).
1674 static inline uint32_t __attribute__((always_inline))
1675 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1678 struct vhost_virtqueue *vq;
1679 struct vring_desc *desc;
1680 struct rte_mbuf *buff;
1681 /* The virtio_hdr is initialised to 0. */
1682 struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1683 = {{0, 0, 0, 0, 0, 0}, 0};
1684 uint64_t buff_hdr_addr = 0;
1685 uint32_t head[MAX_PKT_BURST], packet_len = 0;
1686 uint32_t head_idx, packet_success = 0;
1687 uint16_t res_cur_idx;
1689 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1694 vq = dev->virtqueue[VIRTIO_RXQ];
1695 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1697 res_cur_idx = vq->last_used_idx;
1698 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1699 dev->device_fh, res_cur_idx, res_cur_idx + count);
1701 /* Retrieve all of the head indexes first to avoid caching issues. */
1702 for (head_idx = 0; head_idx < count; head_idx++)
1703 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1705 /*Prefetch descriptor index. */
1706 rte_prefetch0(&vq->desc[head[packet_success]]);
1708 while (packet_success != count) {
1709 /* Get descriptor from available ring */
1710 desc = &vq->desc[head[packet_success]];
1712 buff = pkts[packet_success];
1713 LOG_DEBUG(VHOST_DATA,
1714 "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1715 "pkt[%d] descriptor idx: %d\n",
1716 dev->device_fh, packet_success,
1717 MBUF_HEADROOM_UINT32(buff));
1720 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1721 + RTE_PKTMBUF_HEADROOM),
1722 rte_pktmbuf_data_len(buff), 0);
1724 /* Buffer address translation for virtio header. */
1725 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1726 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1729 * If the descriptors are chained the header and data are
1730 * placed in separate buffers.
1732 if (desc->flags & VRING_DESC_F_NEXT) {
1733 desc->len = vq->vhost_hlen;
1734 desc = &vq->desc[desc->next];
1735 desc->len = rte_pktmbuf_data_len(buff);
1737 desc->len = packet_len;
1740 /* Update used ring with desc information */
1741 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1742 = head[packet_success];
1743 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1748 /* A header is required per buffer. */
1749 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1750 (const void *)&virtio_hdr, vq->vhost_hlen);
1752 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1754 if (likely(packet_success < count)) {
1755 /* Prefetch descriptor index. */
1756 rte_prefetch0(&vq->desc[head[packet_success]]);
1760 rte_compiler_barrier();
1762 LOG_DEBUG(VHOST_DATA,
1763 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1764 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1765 dev->device_fh, vq->last_used_idx, vq->used->idx);
1767 *(volatile uint16_t *)&vq->used->idx += count;
1768 vq->last_used_idx += count;
1770 LOG_DEBUG(VHOST_DATA,
1771 "(%"PRIu64") in dev_rx_zcp: after update used idx: "
1772 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1773 dev->device_fh, vq->last_used_idx, vq->used->idx);
1775 /* Kick the guest if necessary. */
1776 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1777 eventfd_write((int)vq->callfd, 1);
1783 * This function routes the TX packet to the correct interface.
1784 * This may be a local device or the physical port.
1786 static inline void __attribute__((always_inline))
1787 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1788 uint32_t desc_idx, uint8_t need_copy)
1790 struct mbuf_table *tx_q;
1791 struct rte_mbuf **m_table;
1792 struct rte_mbuf *mbuf = NULL;
1793 unsigned len, ret, offset = 0;
1794 struct vpool *vpool;
1795 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1796 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1798 /*Add packet to the port tx queue*/
1799 tx_q = &tx_queue_zcp[vmdq_rx_q];
1802 /* Allocate an mbuf and populate the structure. */
1803 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1804 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1805 if (unlikely(mbuf == NULL)) {
1806 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1807 RTE_LOG(ERR, VHOST_DATA,
1808 "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1810 put_desc_to_used_list_zcp(vq, desc_idx);
1814 if (vm2vm_mode == VM2VM_HARDWARE) {
1815 /* Avoid using a VLAN tag from any VM for an external packet, such as
1816 * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1817 * selection: the MAC address identifies it as an external packet
1818 * that should go to the network, while the VLAN tag identifies it as
1819 * a VM2VM packet that should be forwarded to another VM. The hardware
1820 * cannot resolve such an ambiguous situation, so the packet would be lost.
1822 vlan_tag = external_pkt_default_vlan_tag;
1823 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1824 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1825 __rte_mbuf_raw_free(mbuf);
1830 mbuf->nb_segs = m->nb_segs;
1831 mbuf->next = m->next;
1832 mbuf->data_len = m->data_len + offset;
1833 mbuf->pkt_len = mbuf->data_len;
1834 if (unlikely(need_copy)) {
1835 /* Copy the packet contents to the mbuf. */
1836 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1837 rte_pktmbuf_mtod(m, void *),
1840 mbuf->data_off = m->data_off;
1841 mbuf->buf_physaddr = m->buf_physaddr;
1842 mbuf->buf_addr = m->buf_addr;
1844 mbuf->ol_flags = PKT_TX_VLAN_PKT;
1845 mbuf->vlan_tci = vlan_tag;
1846 mbuf->l2_len = sizeof(struct ether_hdr);
1847 mbuf->l3_len = sizeof(struct ipv4_hdr);
1848 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1850 tx_q->m_table[len] = mbuf;
1853 LOG_DEBUG(VHOST_DATA,
1854 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1857 (mbuf->next == NULL) ? "null" : "non-null");
1860 dev_statistics[dev->device_fh].tx_total++;
1861 dev_statistics[dev->device_fh].tx++;
1864 if (unlikely(len == MAX_PKT_BURST)) {
1865 m_table = (struct rte_mbuf **)tx_q->m_table;
1866 ret = rte_eth_tx_burst(ports[0],
1867 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1870 * Free any buffers not handled by TX and update
1873 if (unlikely(ret < len)) {
1875 rte_pktmbuf_free(m_table[ret]);
1876 } while (++ret < len);
1880 txmbuf_clean_zcp(dev, vpool);
1889 * This function transmits all available packets in the virtio TX queue for one
1890 * virtio-net device. If it is the first packet, it learns the MAC address and
1893 static inline void __attribute__((always_inline))
1894 virtio_dev_tx_zcp(struct virtio_net *dev)
1897 struct vhost_virtqueue *vq;
1898 struct vring_desc *desc;
1899 uint64_t buff_addr = 0, phys_addr;
1900 uint32_t head[MAX_PKT_BURST];
1902 uint16_t free_entries, packet_success = 0;
1904 uint8_t need_copy = 0;
1906 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1908 vq = dev->virtqueue[VIRTIO_TXQ];
1909 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1911 /* If there are no available buffers then return. */
1912 if (vq->last_used_idx_res == avail_idx)
1915 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1917 /* Prefetch available ring to retrieve head indexes. */
1918 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1920 /* Get the number of free entries in the ring */
1921 free_entries = (avail_idx - vq->last_used_idx_res);
1923 /* Limit to MAX_PKT_BURST. */
1925 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1927 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1928 dev->device_fh, free_entries);
1930 /* Retrieve all of the head indexes first to avoid caching issues. */
1931 for (i = 0; i < free_entries; i++)
1933 = vq->avail->ring[(vq->last_used_idx_res + i)
1936 vq->last_used_idx_res += free_entries;
1938 /* Prefetch descriptor index. */
1939 rte_prefetch0(&vq->desc[head[packet_success]]);
1940 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1942 while (packet_success < free_entries) {
1943 desc = &vq->desc[head[packet_success]];
1945 /* Discard first buffer as it is the virtio header */
1946 desc = &vq->desc[desc->next];
1948 /* Buffer address translation. */
1949 buff_addr = gpa_to_vva(dev, desc->addr);
1950 /* Need to allow an extra VLAN_HLEN bytes for inserting the VLAN tag */
1951 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1954 if (likely(packet_success < (free_entries - 1)))
1955 /* Prefetch descriptor index. */
1956 rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1958 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1959 RTE_LOG(ERR, VHOST_DATA,
1960 "(%"PRIu64") Invalid frame buffer address found"
1961 "when TX packets!\n",
1967 /* Prefetch buffer address. */
1968 rte_prefetch0((void *)(uintptr_t)buff_addr);
1971 * Setup dummy mbuf. This is copied to a real mbuf if
1972 * transmitted out the physical port.
1974 m.data_len = desc->len;
1978 m.buf_addr = (void *)(uintptr_t)buff_addr;
1979 m.buf_physaddr = phys_addr;
1982 * Check if the frame buffer address from guest crosses
1983 * sub-region or not.
1985 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1986 RTE_LOG(ERR, VHOST_DATA,
1987 "(%"PRIu64") Frame buffer address cross "
1988 "sub-regioin found when attaching TX frame "
1989 "buffer address!\n",
1995 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1998 * If this is the first received packet we need to learn
1999 * the MAC and setup VMDQ
2001 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2002 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2004 * Discard frame if device is scheduled for
2005 * removal or a duplicate MAC address is found.
2007 packet_success += free_entries;
2008 vq->last_used_idx += packet_success;
2013 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2019 * This function is called by each data core. It handles all RX/TX registered
2020 * with the core. For TX the specific lcore linked list is used. For RX, MAC
2021 * addresses are compared with all devices in the main linked list.
2024 switch_worker_zcp(__attribute__((unused)) void *arg)
2026 struct virtio_net *dev = NULL;
2027 struct vhost_dev *vdev = NULL;
2028 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2029 struct virtio_net_data_ll *dev_ll;
2030 struct mbuf_table *tx_q;
2031 volatile struct lcore_ll_info *lcore_ll;
2032 const uint64_t drain_tsc
2033 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2034 * BURST_TX_DRAIN_US;
2035 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2037 const uint16_t lcore_id = rte_lcore_id();
2038 uint16_t count_in_ring, rx_count = 0;
2040 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2042 lcore_ll = lcore_info[lcore_id].lcore_ll;
2046 cur_tsc = rte_rdtsc();
2048 /* TX burst queue drain */
2049 diff_tsc = cur_tsc - prev_tsc;
2050 if (unlikely(diff_tsc > drain_tsc)) {
2052 * Get mbufs from vpool.pool, detach them and
2053 * put them back into vpool.ring.
2055 dev_ll = lcore_ll->ll_root_used;
2056 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2057 /* Get virtio device ID */
2058 vdev = dev_ll->vdev;
2061 if (likely(!vdev->remove)) {
2062 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2064 LOG_DEBUG(VHOST_DATA,
2065 "TX queue drained after timeout"
2066 " with burst size %u\n",
2070 * Tx any packets in the queue
2072 ret = rte_eth_tx_burst(
2074 (uint16_t)tx_q->txq_id,
2075 (struct rte_mbuf **)
2077 (uint16_t)tx_q->len);
2078 if (unlikely(ret < tx_q->len)) {
2081 tx_q->m_table[ret]);
2082 } while (++ret < tx_q->len);
2086 txmbuf_clean_zcp(dev,
2087 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2090 dev_ll = dev_ll->next;
2095 rte_prefetch0(lcore_ll->ll_root_used);
2098 * Inform the configuration core that we have exited the linked
2099 * list and that no devices are in use if requested.
2101 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2102 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2104 /* Process devices */
2105 dev_ll = lcore_ll->ll_root_used;
2107 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2108 vdev = dev_ll->vdev;
2110 if (unlikely(vdev->remove)) {
2111 dev_ll = dev_ll->next;
2113 vdev->ready = DEVICE_SAFE_REMOVE;
2117 if (likely(vdev->ready == DEVICE_RX)) {
2118 uint32_t index = vdev->vmdq_rx_q;
2121 = rte_ring_count(vpool_array[index].ring);
2122 uint16_t free_entries
2123 = (uint16_t)get_available_ring_num_zcp(dev);
2126 * Attach all mbufs in vpool.ring and put back
2130 i < RTE_MIN(free_entries,
2131 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2133 attach_rxmbuf_zcp(dev);
2135 /* Handle guest RX */
2136 rx_count = rte_eth_rx_burst(ports[0],
2137 vdev->vmdq_rx_q, pkts_burst,
2141 ret_count = virtio_dev_rx_zcp(dev,
2142 pkts_burst, rx_count);
2144 dev_statistics[dev->device_fh].rx_total
2146 dev_statistics[dev->device_fh].rx
2149 while (likely(rx_count)) {
2152 pkts_burst[rx_count]);
2153 rte_ring_sp_enqueue(
2154 vpool_array[index].ring,
2155 (void *)pkts_burst[rx_count]);
2160 if (likely(!vdev->remove))
2161 /* Handle guest TX */
2162 virtio_dev_tx_zcp(dev);
2164 /* Move to the next device in the list */
2165 dev_ll = dev_ll->next;
2174 * Add an entry to a used linked list. A free entry must first be found
2175 * in the free linked list using get_data_ll_free_entry();
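/*
 * Typical usage (a minimal sketch, mirroring what new_device() does
 * further below):
 *
 *   struct virtio_net_data_ll *entry = get_data_ll_free_entry(&ll_root_free);
 *   if (entry != NULL) {
 *       entry->vdev = vdev;
 *       add_data_ll_entry(&ll_root_used, entry);
 *   }
 */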
2178 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2179 struct virtio_net_data_ll *ll_dev)
2181 struct virtio_net_data_ll *ll = *ll_root_addr;
2183 /* Set next as NULL and use a compiler barrier to avoid reordering. */
2184 ll_dev->next = NULL;
2185 rte_compiler_barrier();
2187 /* If ll == NULL then this is the first device. */
2189 /* Increment to the tail of the linked list. */
2190 while (ll->next != NULL)
2195 *ll_root_addr = ll_dev;
2200 * Remove an entry from a used linked list. The entry must then be added to
2201 * the free linked list using put_data_ll_free_entry().
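/*
 * Typical usage (a minimal sketch, mirroring destroy_device() further
 * below): walk the used list to find the entry and its predecessor
 * (entry_prev, a hypothetical name for the sketch), then
 *
 *   rm_data_ll_entry(&ll_root_used, entry, entry_prev);
 *   put_data_ll_free_entry(&ll_root_free, entry);
 */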
2204 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2205 struct virtio_net_data_ll *ll_dev,
2206 struct virtio_net_data_ll *ll_dev_last)
2208 struct virtio_net_data_ll *ll = *ll_root_addr;
2210 if (unlikely((ll == NULL) || (ll_dev == NULL)))
2214 *ll_root_addr = ll_dev->next;
2216 if (likely(ll_dev_last != NULL))
2217 ll_dev_last->next = ll_dev->next;
2219 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2223 * Find and return an entry from the free linked list.
2225 static struct virtio_net_data_ll *
2226 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2228 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2229 struct virtio_net_data_ll *ll_dev;
2231 if (ll_free == NULL)
2235 *ll_root_addr = ll_free->next;
2241 * Place an entry back onto the free linked list.
2244 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2245 struct virtio_net_data_ll *ll_dev)
2247 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2252 ll_dev->next = ll_free;
2253 *ll_root_addr = ll_dev;
2257 * Creates a linked list of a given size.
2259 static struct virtio_net_data_ll *
2260 alloc_data_ll(uint32_t size)
2262 struct virtio_net_data_ll *ll_new;
2265 /* Malloc and then chain the linked list. */
2266 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2267 if (ll_new == NULL) {
2268 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2272 for (i = 0; i < size - 1; i++) {
2273 ll_new[i].vdev = NULL;
2274 ll_new[i].next = &ll_new[i+1];
2276 ll_new[i].next = NULL;
2282 * Create the main linked list along with each individual core's linked list. A used and a free list
2283 * are created to manage entries.
2290 RTE_LCORE_FOREACH_SLAVE(lcore) {
2291 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2292 if (lcore_info[lcore].lcore_ll == NULL) {
2293 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2297 lcore_info[lcore].lcore_ll->device_num = 0;
2298 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2299 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2300 if (num_devices % num_switching_cores)
2301 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2303 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
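/*
 * Sizing example (illustrative): with num_devices = 6 and
 * num_switching_cores = 4, 6 % 4 != 0, so each core's free list gets
 * 6 / 4 + 1 = 2 entries, enough for the worst-case spread of devices
 * across cores.
 */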
2306 /* Allocate devices up to a maximum of MAX_DEVICES. */
2307 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2313 * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2314 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2315 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2318 destroy_device(volatile struct virtio_net *dev)
2320 struct virtio_net_data_ll *ll_lcore_dev_cur;
2321 struct virtio_net_data_ll *ll_main_dev_cur;
2322 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2323 struct virtio_net_data_ll *ll_main_dev_last = NULL;
2324 struct vhost_dev *vdev;
2327 dev->flags &= ~VIRTIO_DEV_RUNNING;
2329 vdev = (struct vhost_dev *)dev->priv;
2330 /* Set the remove flag. */
2332 while (vdev->ready != DEVICE_SAFE_REMOVE) {
2336 /* Search for entry to be removed from lcore ll */
2337 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2338 while (ll_lcore_dev_cur != NULL) {
2339 if (ll_lcore_dev_cur->vdev == vdev) {
2342 ll_lcore_dev_last = ll_lcore_dev_cur;
2343 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2347 if (ll_lcore_dev_cur == NULL) {
2348 RTE_LOG(ERR, VHOST_CONFIG,
2349 "(%"PRIu64") Failed to find the dev to be destroy.\n",
2354 /* Search for entry to be removed from main ll */
2355 ll_main_dev_cur = ll_root_used;
2356 ll_main_dev_last = NULL;
2357 while (ll_main_dev_cur != NULL) {
2358 if (ll_main_dev_cur->vdev == vdev) {
2361 ll_main_dev_last = ll_main_dev_cur;
2362 ll_main_dev_cur = ll_main_dev_cur->next;
2366 /* Remove entries from the lcore and main ll. */
2367 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2368 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2370 /* Set the dev_removal_flag on each lcore. */
2371 RTE_LCORE_FOREACH_SLAVE(lcore) {
2372 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2376 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2377 * they can no longer access the device removed from the linked lists and that the device
2378 * is no longer in use.
2380 RTE_LCORE_FOREACH_SLAVE(lcore) {
2381 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2386 /* Add the entries back to the lcore and main free ll.*/
2387 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2388 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2390 /* Decrement the number of devices on the lcore. */
2391 lcore_info[vdev->coreid].lcore_ll->device_num--;
2393 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2396 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2398 /* Stop the RX queue. */
2399 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2400 LOG_DEBUG(VHOST_CONFIG,
2401 "(%"PRIu64") In destroy_device: Failed to stop "
2407 LOG_DEBUG(VHOST_CONFIG,
2408 "(%"PRIu64") in destroy_device: Start put mbuf in "
2409 "mempool back to ring for RX queue: %d\n",
2410 dev->device_fh, vdev->vmdq_rx_q);
2412 mbuf_destroy_zcp(vpool);
2414 /* Stop the TX queue. */
2415 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2416 LOG_DEBUG(VHOST_CONFIG,
2417 "(%"PRIu64") In destroy_device: Failed to "
2418 "stop tx queue:%d\n",
2419 dev->device_fh, vdev->vmdq_rx_q);
2422 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2424 LOG_DEBUG(VHOST_CONFIG,
2425 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2426 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2427 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2430 mbuf_destroy_zcp(vpool);
2431 rte_free(vdev->regions_hpa);
2438 * Calculate the region count of physically contiguous regions for one particular
2439 * region whose vhost virtual address is contiguous. The region
2440 * starts at vva_start, with a size of 'size' bytes.
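/*
 * Example (illustrative): if the guest pages backing a VVA-contiguous
 * region fall into three physically contiguous runs on the host, two
 * breaks are detected here, so the caller in new_device() adds two extra
 * HPA sub-regions for that region.
 */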
2443 check_hpa_regions(uint64_t vva_start, uint64_t size)
2445 uint32_t i, nregions = 0, page_size = getpagesize();
2446 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2447 if (vva_start % page_size) {
2448 LOG_DEBUG(VHOST_CONFIG,
2449 "in check_countinous: vva start(%p) mod page_size(%d) "
2451 (void *)(uintptr_t)vva_start, page_size);
2454 if (size % page_size) {
2455 LOG_DEBUG(VHOST_CONFIG,
2456 "in check_countinous: "
2457 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2461 for (i = 0; i < size - page_size; i = i + page_size) {
2463 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2464 next_phys_addr = rte_mem_virt2phy(
2465 (void *)(uintptr_t)(vva_start + i + page_size));
2466 if ((cur_phys_addr + page_size) != next_phys_addr) {
2468 LOG_DEBUG(VHOST_CONFIG,
2469 "in check_continuous: hva addr:(%p) is not "
2470 "continuous with hva addr:(%p), diff:%d\n",
2471 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2472 (void *)(uintptr_t)(vva_start + (uint64_t)i
2473 + page_size), page_size);
2474 LOG_DEBUG(VHOST_CONFIG,
2475 "in check_continuous: hpa addr:(%p) is not "
2476 "continuous with hpa addr:(%p), "
2477 "diff:(%"PRIu64")\n",
2478 (void *)(uintptr_t)cur_phys_addr,
2479 (void *)(uintptr_t)next_phys_addr,
2480 (next_phys_addr-cur_phys_addr));
2487 * Divide each region whose vhost virtual address is contiguous into a few
2488 * sub-regions, making sure the physical addresses within each sub-region are
2489 * contiguous. Fill the offset (to GPA), size and other information of each
2490 * sub-region into regions_hpa.
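/*
 * Within each resulting sub-region the translation is a plain offset:
 * host_phys_addr_offset = HPA(vva_start) - GPA(start), so later lookups can
 * compute HPA = GPA + host_phys_addr_offset (a sketch of the intent; see the
 * assignments below).
 */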
2493 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2495 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2496 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2498 if (mem_region_hpa == NULL)
2501 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2502 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2503 virtio_memory->regions[regionidx].address_offset;
2504 mem_region_hpa[regionidx_hpa].guest_phys_address
2505 = virtio_memory->regions[regionidx].guest_phys_address;
2506 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2507 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2508 mem_region_hpa[regionidx_hpa].guest_phys_address;
2509 LOG_DEBUG(VHOST_CONFIG,
2510 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2513 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2514 LOG_DEBUG(VHOST_CONFIG,
2515 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2518 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2520 i < virtio_memory->regions[regionidx].memory_size -
2523 cur_phys_addr = rte_mem_virt2phy(
2524 (void *)(uintptr_t)(vva_start + i));
2525 next_phys_addr = rte_mem_virt2phy(
2526 (void *)(uintptr_t)(vva_start +
2528 if ((cur_phys_addr + page_size) != next_phys_addr) {
2529 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2530 mem_region_hpa[regionidx_hpa].guest_phys_address +
2532 mem_region_hpa[regionidx_hpa].memory_size
2534 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2535 "phys addr end [%d]:(%p)\n",
2538 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2539 LOG_DEBUG(VHOST_CONFIG,
2540 "in fill_hpa_regions: guest phys addr "
2544 (mem_region_hpa[regionidx_hpa].memory_size));
2545 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2546 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2548 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2550 mem_region_hpa[regionidx_hpa].guest_phys_address;
2551 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2552 " phys addr start[%d]:(%p)\n",
2555 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2556 LOG_DEBUG(VHOST_CONFIG,
2557 "in fill_hpa_regions: host phys addr "
2561 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2567 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2568 = mem_region_hpa[regionidx_hpa].guest_phys_address
2570 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2571 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2572 "[%d]:(%p)\n", regionidx_hpa,
2574 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2575 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2576 "[%d]:(%p)\n", regionidx_hpa,
2578 (mem_region_hpa[regionidx_hpa].memory_size));
2581 return regionidx_hpa;
2585 * A new device is added to a data core. First the device is added to the main linked list
2586 * and then allocated to a specific data core.
2589 new_device(struct virtio_net *dev)
2591 struct virtio_net_data_ll *ll_dev;
2592 int lcore, core_add = 0;
2593 uint32_t device_num_min = num_devices;
2594 struct vhost_dev *vdev;
2597 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2599 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2607 vdev->nregions_hpa = dev->mem->nregions;
2608 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2610 += check_hpa_regions(
2611 dev->mem->regions[regionidx].guest_phys_address
2612 + dev->mem->regions[regionidx].address_offset,
2613 dev->mem->regions[regionidx].memory_size);
2617 vdev->regions_hpa = rte_calloc("vhost hpa region",
2619 sizeof(struct virtio_memory_regions_hpa),
2620 RTE_CACHE_LINE_SIZE);
2621 if (vdev->regions_hpa == NULL) {
2622 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2628 if (fill_hpa_memory_regions(
2629 vdev->regions_hpa, dev->mem
2630 ) != vdev->nregions_hpa) {
2632 RTE_LOG(ERR, VHOST_CONFIG,
2633 "hpa memory regions number mismatch: "
2634 "[%d]\n", vdev->nregions_hpa);
2635 rte_free(vdev->regions_hpa);
2642 /* Add device to main ll */
2643 ll_dev = get_data_ll_free_entry(&ll_root_free);
2644 if (ll_dev == NULL) {
2645 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2646 "of %d devices per core has been reached\n",
2647 dev->device_fh, num_devices);
2648 if (vdev->regions_hpa)
2649 rte_free(vdev->regions_hpa);
2653 ll_dev->vdev = vdev;
2654 add_data_ll_entry(&ll_root_used, ll_dev);
2656 = dev->device_fh * queues_per_pool + vmdq_queue_base;
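/*
 * Illustrative example (assumed values): with vmdq_queue_base = 8 and
 * queues_per_pool = 2, the device with device_fh = 3 is given VMDq RX
 * queue 3 * 2 + 8 = 14.
 */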
2659 uint32_t index = vdev->vmdq_rx_q;
2660 uint32_t count_in_ring, i;
2661 struct mbuf_table *tx_q;
2663 count_in_ring = rte_ring_count(vpool_array[index].ring);
2665 LOG_DEBUG(VHOST_CONFIG,
2666 "(%"PRIu64") in new_device: mbuf count in mempool "
2667 "before attach is: %d\n",
2669 rte_mempool_count(vpool_array[index].pool));
2670 LOG_DEBUG(VHOST_CONFIG,
2671 "(%"PRIu64") in new_device: mbuf count in ring "
2672 "before attach is : %d\n",
2673 dev->device_fh, count_in_ring);
2676 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2678 for (i = 0; i < count_in_ring; i++)
2679 attach_rxmbuf_zcp(dev);
2681 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2682 "mempool after attach is: %d\n",
2684 rte_mempool_count(vpool_array[index].pool));
2685 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2686 "ring after attach is : %d\n",
2688 rte_ring_count(vpool_array[index].ring));
2690 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2691 tx_q->txq_id = vdev->vmdq_rx_q;
2693 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2694 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2696 LOG_DEBUG(VHOST_CONFIG,
2697 "(%"PRIu64") In new_device: Failed to start "
2699 dev->device_fh, vdev->vmdq_rx_q);
2701 mbuf_destroy_zcp(vpool);
2702 rte_free(vdev->regions_hpa);
2707 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2708 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2710 LOG_DEBUG(VHOST_CONFIG,
2711 "(%"PRIu64") In new_device: Failed to start "
2713 dev->device_fh, vdev->vmdq_rx_q);
2715 /* Stop the TX queue. */
2716 if (rte_eth_dev_tx_queue_stop(ports[0],
2717 vdev->vmdq_rx_q) != 0) {
2718 LOG_DEBUG(VHOST_CONFIG,
2719 "(%"PRIu64") In new_device: Failed to "
2720 "stop tx queue:%d\n",
2721 dev->device_fh, vdev->vmdq_rx_q);
2724 mbuf_destroy_zcp(vpool);
2725 rte_free(vdev->regions_hpa);
2732 /* Reset the ready flag. */
2733 vdev->ready = DEVICE_MAC_LEARNING;
2736 /* Find a suitable lcore to add the device. */
2737 RTE_LCORE_FOREACH_SLAVE(lcore) {
2738 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2739 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2743 /* Add device to lcore ll */
2744 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2745 if (ll_dev == NULL) {
2746 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2747 vdev->ready = DEVICE_SAFE_REMOVE;
2748 destroy_device(dev);
2749 rte_free(vdev->regions_hpa);
2753 ll_dev->vdev = vdev;
2754 vdev->coreid = core_add;
2756 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2758 /* Initialize device stats */
2759 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2761 /* Disable notifications. */
2762 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2763 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2764 lcore_info[vdev->coreid].lcore_ll->device_num++;
2765 dev->flags |= VIRTIO_DEV_RUNNING;
2767 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2773 * These callbacks allow devices to be added to the data core when configuration
2774 * is fully complete.
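/*
 * These ops are handed to the vhost library via
 * rte_vhost_driver_callback_register(&virtio_net_device_ops), as done in
 * main() below, so the library invokes new_device()/destroy_device() as
 * guests connect and disconnect.
 */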
2776 static const struct virtio_net_device_ops virtio_net_device_ops =
2778 .new_device = new_device,
2779 .destroy_device = destroy_device,
2783 * This is a thread that wakes up after a period to print stats if the user has
2789 struct virtio_net_data_ll *dev_ll;
2790 uint64_t tx_dropped, rx_dropped;
2791 uint64_t tx, tx_total, rx, rx_total;
2793 const char clr[] = { 27, '[', '2', 'J', '\0' };
2794 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
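/*
 * clr and top_left are ANSI escape sequences: ESC "[2J" clears the terminal
 * and ESC "[1;1H" moves the cursor to row 1, column 1, so each refresh
 * redraws the statistics from the top of the screen.
 */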
2797 sleep(enable_stats);
2799 /* Clear screen and move to top left */
2800 printf("%s%s", clr, top_left);
2802 printf("\nDevice statistics ====================================");
2804 dev_ll = ll_root_used;
2805 while (dev_ll != NULL) {
2806 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2807 tx_total = dev_statistics[device_fh].tx_total;
2808 tx = dev_statistics[device_fh].tx;
2809 tx_dropped = tx_total - tx;
2810 if (zero_copy == 0) {
2811 rx_total = rte_atomic64_read(
2812 &dev_statistics[device_fh].rx_total_atomic);
2813 rx = rte_atomic64_read(
2814 &dev_statistics[device_fh].rx_atomic);
2816 rx_total = dev_statistics[device_fh].rx_total;
2817 rx = dev_statistics[device_fh].rx;
2819 rx_dropped = rx_total - rx;
2821 printf("\nStatistics for device %"PRIu32" ------------------------------"
2822 "\nTX total: %"PRIu64""
2823 "\nTX dropped: %"PRIu64""
2824 "\nTX successful: %"PRIu64""
2825 "\nRX total: %"PRIu64""
2826 "\nRX dropped: %"PRIu64""
2827 "\nRX successful: %"PRIu64"",
2836 dev_ll = dev_ll->next;
2838 printf("\n======================================================\n");
2843 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2844 char *ring_name, uint32_t nb_mbuf)
2846 vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2847 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2848 if (vpool_array[index].pool != NULL) {
2849 vpool_array[index].ring
2850 = rte_ring_create(ring_name,
2851 rte_align32pow2(nb_mbuf + 1),
2852 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
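/*
 * rte_ring sizes must be a power of two and one slot is always left unused,
 * hence rte_align32pow2(nb_mbuf + 1) above guarantees the ring can actually
 * hold nb_mbuf entries.
 */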
2853 if (likely(vpool_array[index].ring != NULL)) {
2854 LOG_DEBUG(VHOST_CONFIG,
2855 "in setup_mempool_tbl: mbuf count in "
2857 rte_mempool_count(vpool_array[index].pool));
2858 LOG_DEBUG(VHOST_CONFIG,
2859 "in setup_mempool_tbl: mbuf count in "
2861 rte_ring_count(vpool_array[index].ring));
2863 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2867 /* Need to consider headroom. */
2868 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2870 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2876 * Main function, does initialisation and calls the per-lcore functions. The CUSE
2877 * device is also registered here to handle the IOCTLs.
2880 main(int argc, char *argv[])
2882 struct rte_mempool *mbuf_pool = NULL;
2883 unsigned lcore_id, core_id = 0;
2884 unsigned nb_ports, valid_num_ports;
2888 static pthread_t tid;
2891 ret = rte_eal_init(argc, argv);
2893 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2897 /* parse app arguments */
2898 ret = us_vhost_parse_args(argc, argv);
2900 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2902 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2903 if (rte_lcore_is_enabled(lcore_id))
2904 lcore_ids[core_id++] = lcore_id;
2906 if (rte_lcore_count() > RTE_MAX_LCORE)
2907 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2909 /* Set the number of switching cores available. */
2910 num_switching_cores = rte_lcore_count() - 1;
2912 /* Get the number of physical ports. */
2913 nb_ports = rte_eth_dev_count();
2914 if (nb_ports > RTE_MAX_ETHPORTS)
2915 nb_ports = RTE_MAX_ETHPORTS;
2918 * Update the global variable num_ports and the global array ports,
2919 * and get the value of valid_num_ports according to the number of system ports.
2921 valid_num_ports = check_ports_num(nb_ports);
2923 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2924 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2925 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2929 if (zero_copy == 0) {
2930 /* Create the mbuf pool. */
2931 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
2932 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
2933 0, MBUF_DATA_SIZE, rte_socket_id());
2934 if (mbuf_pool == NULL)
2935 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2937 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2938 vpool_array[queue_id].pool = mbuf_pool;
2940 if (vm2vm_mode == VM2VM_HARDWARE) {
2941 /* Enable VT loopback to let the L2 switch do it. */
2942 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2943 LOG_DEBUG(VHOST_CONFIG,
2944 "Enable loop back for L2 switch in vmdq.\n");
2948 char pool_name[RTE_MEMPOOL_NAMESIZE];
2949 char ring_name[RTE_MEMPOOL_NAMESIZE];
2951 nb_mbuf = num_rx_descriptor
2952 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2953 + num_switching_cores * MAX_PKT_BURST;
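/*
 * Worked example (illustrative, assumed values): with 1024 RX descriptors,
 * 3 switching cores, a zero-copy mbuf cache of 0 and a burst of 32,
 * nb_mbuf = 1024 + 3 * 0 + 3 * 32 = 1120 mbufs per RX queue pool.
 */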
2955 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2956 snprintf(pool_name, sizeof(pool_name),
2957 "rxmbuf_pool_%u", queue_id);
2958 snprintf(ring_name, sizeof(ring_name),
2959 "rxmbuf_ring_%u", queue_id);
2960 setup_mempool_tbl(rte_socket_id(), queue_id,
2961 pool_name, ring_name, nb_mbuf);
2964 nb_mbuf = num_tx_descriptor
2965 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2966 + num_switching_cores * MAX_PKT_BURST;
2968 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2969 snprintf(pool_name, sizeof(pool_name),
2970 "txmbuf_pool_%u", queue_id);
2971 snprintf(ring_name, sizeof(ring_name),
2972 "txmbuf_ring_%u", queue_id);
2973 setup_mempool_tbl(rte_socket_id(),
2974 (queue_id + MAX_QUEUES),
2975 pool_name, ring_name, nb_mbuf);
2978 if (vm2vm_mode == VM2VM_HARDWARE) {
2979 /* Enable VT loopback to let the L2 switch do it. */
2980 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2981 LOG_DEBUG(VHOST_CONFIG,
2982 "Enable loop back for L2 switch in vmdq.\n");
2985 /* Set log level. */
2986 rte_set_log_level(LOG_LEVEL);
2988 /* initialize all ports */
2989 for (portid = 0; portid < nb_ports; portid++) {
2990 /* skip ports that are not enabled */
2991 if ((enabled_port_mask & (1 << portid)) == 0) {
2992 RTE_LOG(INFO, VHOST_PORT,
2993 "Skipping disabled port %d\n", portid);
2996 if (port_init(portid) != 0)
2997 rte_exit(EXIT_FAILURE,
2998 "Cannot initialize network ports\n");
3001 /* Initialise all linked lists. */
3002 if (init_data_ll() == -1)
3003 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3005 /* Initialize device stats */
3006 memset(&dev_statistics, 0, sizeof(dev_statistics));
3008 /* Enable stats if the user option is set. */
3010 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3012 /* Launch all data cores. */
3013 if (zero_copy == 0) {
3014 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3015 rte_eal_remote_launch(switch_worker,
3016 mbuf_pool, lcore_id);
3019 uint32_t count_in_mempool, index, i;
3020 for (index = 0; index < 2*MAX_QUEUES; index++) {
3021 /* For all RX and TX queues. */
3023 = rte_mempool_count(vpool_array[index].pool);
3026 * Transfer all unattached mbufs from vpool.pool
3029 for (i = 0; i < count_in_mempool; i++) {
3030 struct rte_mbuf *mbuf
3031 = __rte_mbuf_raw_alloc(
3032 vpool_array[index].pool);
3033 rte_ring_sp_enqueue(vpool_array[index].ring,
3037 LOG_DEBUG(VHOST_CONFIG,
3038 "in main: mbuf count in mempool at initial "
3039 "is: %d\n", count_in_mempool);
3040 LOG_DEBUG(VHOST_CONFIG,
3041 "in main: mbuf count in ring at initial is :"
3043 rte_ring_count(vpool_array[index].ring));
3046 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3047 rte_eal_remote_launch(switch_worker_zcp, NULL,
3052 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3054 /* Register CUSE device to handle IOCTLs. */
3055 ret = rte_vhost_driver_register((char *)&dev_basename);
3057 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3059 rte_vhost_driver_callback_register(&virtio_net_device_ops);
3061 /* Start CUSE session. */
3062 rte_vhost_driver_session_start();