4 * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <arpa/inet.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
/*
 * Compile-time tunables and runtime-configurable globals for the vhost
 * switch example.
 *
 * NOTE(review): this extract is garbled -- every line carries a stray
 * leading number from the original file and many interior lines are
 * missing (several comment blocks below are visibly truncated), so this
 * region is documented in place rather than rewritten.
 */
57 #define MAX_QUEUES 128
60 /* the maximum number of external ports supported */
61 #define MAX_SUP_PORTS 1
64 * Calculate the number of buffers needed per port
/* NOTE(review): the macro below expands num_switching_cores and
 * MBUF_CACHE_SIZE at its point of use, so their later definitions are
 * legal for the preprocessor. */
66 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
67 (num_switching_cores*MAX_PKT_BURST) + \
68 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
69 (num_switching_cores*MBUF_CACHE_SIZE))
71 #define MBUF_CACHE_SIZE 128
72 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE
75 * No frame data buffer allocated from host are required for zero copy
76 * implementation, guest will allocate the frame data buffer, and vhost
79 #define VIRTIO_DESCRIPTOR_LEN_ZCP RTE_MBUF_DEFAULT_DATAROOM
80 #define MBUF_DATA_SIZE_ZCP RTE_MBUF_DEFAULT_BUF_SIZE
81 #define MBUF_CACHE_SIZE_ZCP 0
83 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
84 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
86 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
87 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
89 #define JUMBO_FRAME_MAX_SIZE 0x2600
91 /* State of virtio device. */
92 #define DEVICE_MAC_LEARNING 0
/* NOTE(review): DEVICE_RX (referenced by link_vmdq/unlink_vmdq below,
 * presumably 1) is missing from this extract. */
94 #define DEVICE_SAFE_REMOVE 2
96 /* Config_core_flag status definitions. */
97 #define REQUEST_DEV_REMOVAL 1
98 #define ACK_DEV_REMOVAL 0
100 /* Configurable number of RX/TX ring descriptors */
101 #define RTE_TEST_RX_DESC_DEFAULT 1024
102 #define RTE_TEST_TX_DESC_DEFAULT 512
105 * Need refine these 2 macros for legacy and DPDK based front end:
106 * Max vring avail descriptor/entries from guest - MAX_PKT_BURST
107 * And then adjust power 2.
110 * For legacy front end, 128 descriptors,
111 * half for virtio header, another half for mbuf.
113 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
114 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */
116 /* Get first 4 bytes in mbuf headroom. */
117 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
118 + sizeof(struct rte_mbuf)))
120 /* true if x is a power of 2 */
121 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
123 #define INVALID_PORT_ID 0xFF
125 /* Max number of devices. Limited by vmdq. */
126 #define MAX_DEVICES 64
128 /* Size of buffers used for snprintfs. */
129 #define MAX_PRINT_BUFF 6072
131 /* Maximum character device basename size. */
132 #define MAX_BASENAME_SZ 10
134 /* Maximum long option length for option parsing. */
135 #define MAX_LONG_OPT_SZ 64
136 /* Used to compare MAC addresses: low 48 bits of a 64-bit load. */
137 /* Used to compare MAC addresses. */
138 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
140 /* Number of descriptors per cacheline. */
141 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
143 #define MBUF_EXT_MEM(mb) (rte_mbuf_from_indirect(mb) != (mb))
145 /* mask of enabled ports */
146 static uint32_t enabled_port_mask = 0;
148 /* Promiscuous mode */
149 static uint32_t promiscuous;
151 /*Number of switching cores enabled*/
152 static uint32_t num_switching_cores = 0;
154 /* number of devices/queues to support*/
155 static uint32_t num_queues = 0;
156 static uint32_t num_devices;
159 * Enable zero copy, pkts buffer will directly dma to hw descriptor,
160 * disabled on default.
162 static uint32_t zero_copy;
163 static int mergeable;
165 /* Do vlan strip on host, enabled on default */
166 static uint32_t vlan_strip = 1;
/* Zero-copy descriptor counts; overridable via --rx-desc-num/--tx-desc-num. */
168 /* number of descriptors to apply*/
169 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
170 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
172 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
173 #define MAX_RING_DESC 4096
/*
 * Zero-copy buffer pools, mode flags, the default VMDQ port
 * configuration and per-port/per-device bookkeeping.
 *
 * NOTE(review): several struct/enum headers (the vpool struct tag, the
 * vm2vm and hpa_type enums, struct mbuf_table, the vlan_ethhdr and
 * ipv4_hdr struct headers) are missing from this extract; only their
 * interior lines remain, kept verbatim below.
 */
176 struct rte_mempool *pool;
177 struct rte_ring *ring;
179 } vpool_array[MAX_QUEUES+MAX_QUEUES];
181 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
188 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
190 /* The type of host physical address translated from guest physical address. */
192 PHYS_ADDR_CONTINUOUS = 0,
193 PHYS_ADDR_CROSS_SUBREG = 1,
194 PHYS_ADDR_INVALID = 2,
/* Statistics: 0 = disabled (default); N > 0 = print interval in seconds. */
199 static uint32_t enable_stats = 0;
200 /* Enable retries on RX. */
201 static uint32_t enable_retry = 1;
202 /* Specify timeout (in useconds) between retries on RX. */
203 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
204 /* Specify the number of retries on RX. */
205 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
207 /* Character device basename. Can be set by user. */
208 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
210 /* empty vmdq configuration structure. Filled in programatically */
211 static struct rte_eth_conf vmdq_conf_default = {
213 .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
215 .header_split = 0, /**< Header Split disabled */
216 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
217 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
219 * It is necessary for 1G NIC such as I350,
220 * this fixes bug of ipv4 forwarding in guest can't
221 * forward pakets from one virtio dev to another virtio dev.
223 .hw_vlan_strip = 1, /**< VLAN strip enabled. */
224 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */
225 .hw_strip_crc = 0, /**< CRC stripped by hardware */
229 .mq_mode = ETH_MQ_TX_NONE,
233 * should be overridden separately in code with
237 .nb_queue_pools = ETH_8_POOLS,
238 .enable_default_pool = 0,
241 .pool_map = {{0, 0},},
246 static unsigned lcore_ids[RTE_MAX_LCORE];
247 static uint8_t ports[RTE_MAX_ETHPORTS];
248 static unsigned num_ports = 0; /**< The number of ports specified in command line */
249 static uint16_t num_pf_queues, num_vmdq_queues;
250 static uint16_t vmdq_pool_base, vmdq_queue_base;
251 static uint16_t queues_per_pool;
253 static const uint16_t external_pkt_default_vlan_tag = 2000;
/* One candidate VLAN tag per VMDQ pool; indexed by device_fh elsewhere. */
254 const uint16_t vlan_tags[] = {
255 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
256 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
257 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
258 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
259 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
260 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
261 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
262 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
265 /* ethernet addresses of ports */
266 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
268 /* heads for the main used and free linked lists for the data path. */
269 static struct virtio_net_data_ll *ll_root_used = NULL;
270 static struct virtio_net_data_ll *ll_root_free = NULL;
272 /* Array of data core structures containing information on individual core linked lists. */
273 static struct lcore_info lcore_info[RTE_MAX_LCORE];
275 /* Used for queueing bursts of TX packets. */
279 struct rte_mbuf *m_table[MAX_PKT_BURST];
282 /* TX queue for each data core. */
283 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
285 /* TX queue fori each virtio device for zero copy. */
286 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
288 /* Vlan header struct used to insert vlan tags on TX. */
290 unsigned char h_dest[ETH_ALEN];
291 unsigned char h_source[ETH_ALEN];
294 __be16 h_vlan_encapsulated_proto;
/* IPv4 header laid out field-by-field; packed so no padding is inserted. */
299 uint8_t version_ihl; /**< version and header length */
300 uint8_t type_of_service; /**< type of service */
301 uint16_t total_length; /**< length of packet */
302 uint16_t packet_id; /**< packet ID */
303 uint16_t fragment_offset; /**< fragmentation offset */
304 uint8_t time_to_live; /**< time to live */
305 uint8_t next_proto_id; /**< protocol ID */
306 uint16_t hdr_checksum; /**< header checksum */
307 uint32_t src_addr; /**< source address */
308 uint32_t dst_addr; /**< destination address */
309 } __attribute__((__packed__));
311 /* Header lengths. */
/* NOTE(review): VLAN_HLEN (used by find_local_dest) is missing here. */
313 #define VLAN_ETH_HLEN 18
315 /* Per-device statistics struct */
316 struct device_statistics {
318 rte_atomic64_t rx_total_atomic;
321 rte_atomic64_t rx_atomic;
323 } __rte_cache_aligned;
324 struct device_statistics dev_statistics[MAX_DEVICES];
327 * Builds up the correct configuration for VMDQ VLAN pool map
328 * according to the pool & queue limits.
331 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
333 struct rte_eth_vmdq_rx_conf conf;
334 struct rte_eth_vmdq_rx_conf *def_conf =
335 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
338 memset(&conf, 0, sizeof(conf));
339 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
340 conf.nb_pool_maps = num_devices;
341 conf.enable_loop_back = def_conf->enable_loop_back;
342 conf.rx_mode = def_conf->rx_mode;
344 for (i = 0; i < conf.nb_pool_maps; i++) {
345 conf.pool_map[i].vlan_id = vlan_tags[ i ];
346 conf.pool_map[i].pools = (1UL << i);
349 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
350 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf,
351 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
/* NOTE(review): the function header (return type), braces and return
 * statements are missing from this extract; lines kept verbatim.
 * Checks the global num_devices against the NIC's max VMDQ pool count. */
356 * Validate the device number according to the max pool number gotten form
357 * dev_info. If the device number is invalid, give the error message and
358 * return -1. Each device must have its own pool.
361 validate_num_devices(uint32_t max_nb_devices)
363 if (num_devices > max_nb_devices) {
364 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
/*
 * Initialises one physical port: validates queue limits against
 * MAX_QUEUES, enables VLAN TX offload (plus deferred queue start and
 * no-drop RX under zero copy), derives the PF/VMDQ queue split from
 * dev_info, configures the device, sets up every RX/TX queue, starts
 * the port, enables promiscuous mode and logs the port MAC address.
 *
 * NOTE(review): many interior lines (declarations of retval/q/i, the
 * zero_copy if/else branches, error returns and closing braces) are
 * missing from this extract; kept verbatim rather than reconstructed.
 */
371 * Initialises a given port using global settings and with the rx buffers
372 * coming from the mbuf_pool passed as parameter
375 port_init(uint8_t port)
377 struct rte_eth_dev_info dev_info;
378 struct rte_eth_conf port_conf;
379 struct rte_eth_rxconf *rxconf;
380 struct rte_eth_txconf *txconf;
381 int16_t rx_rings, tx_rings;
382 uint16_t rx_ring_size, tx_ring_size;
386 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
387 rte_eth_dev_info_get (port, &dev_info);
389 if (dev_info.max_rx_queues > MAX_QUEUES) {
390 rte_exit(EXIT_FAILURE,
391 "please define MAX_QUEUES no less than %u in %s\n",
392 dev_info.max_rx_queues, __FILE__);
395 rxconf = &dev_info.default_rxconf;
396 txconf = &dev_info.default_txconf;
397 rxconf->rx_drop_en = 1;
399 /* Enable vlan offload */
400 txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
403 * Zero copy defers queue RX/TX start to the time when guest
404 * finishes its startup and packet buffers from that guest are
408 rxconf->rx_deferred_start = 1;
409 rxconf->rx_drop_en = 0;
410 txconf->tx_deferred_start = 1;
413 /*configure the number of supported virtio devices based on VMDQ limits */
414 num_devices = dev_info.max_vmdq_pools;
/* NOTE(review): the zero-copy branch uses the user-supplied descriptor
 * counts; the default branch below uses the compile-time defaults. */
417 rx_ring_size = num_rx_descriptor;
418 tx_ring_size = num_tx_descriptor;
419 tx_rings = dev_info.max_tx_queues;
421 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
422 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
423 tx_rings = (uint16_t)rte_lcore_count();
426 retval = validate_num_devices(MAX_DEVICES);
430 /* Get port configuration. */
431 retval = get_eth_conf(&port_conf, num_devices);
434 /* NIC queues are divided into pf queues and vmdq queues. */
435 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
436 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
437 num_vmdq_queues = num_devices * queues_per_pool;
438 num_queues = num_pf_queues + num_vmdq_queues;
439 vmdq_queue_base = dev_info.vmdq_queue_base;
440 vmdq_pool_base = dev_info.vmdq_pool_base;
441 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
442 num_pf_queues, num_devices, queues_per_pool);
444 if (port >= rte_eth_dev_count()) return -1;
446 rx_rings = (uint16_t)dev_info.max_rx_queues;
447 /* Configure ethernet device. */
448 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452 /* Setup the queues. */
453 for (q = 0; q < rx_rings; q ++) {
454 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
455 rte_eth_dev_socket_id(port),
457 vpool_array[q].pool);
461 for (q = 0; q < tx_rings; q ++) {
462 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
463 rte_eth_dev_socket_id(port),
469 /* Start the device. */
470 retval = rte_eth_dev_start(port);
472 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
477 rte_eth_promiscuous_enable(port);
479 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
480 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
481 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
482 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
484 vmdq_ports_eth_addr[port].addr_bytes[0],
485 vmdq_ports_eth_addr[port].addr_bytes[1],
486 vmdq_ports_eth_addr[port].addr_bytes[2],
487 vmdq_ports_eth_addr[port].addr_bytes[3],
488 vmdq_ports_eth_addr[port].addr_bytes[4],
489 vmdq_ports_eth_addr[port].addr_bytes[5]);
495 * Set character device basename.
498 us_vhost_parse_basename(const char *q_arg)
500 /* parse number string */
502 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
505 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
/*
 * Parse the hexadecimal port mask provided at run time.
 *
 * @param portmask  NUL-terminated hex string, e.g. "0x3" or "f".
 * @return the parsed mask on success, -1 on any parse error
 *         (empty string, trailing garbage, or out-of-range value).
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	/*
	 * strtoul() only sets errno on failure and leaves it untouched on
	 * success, so reset it first; the visible original checked
	 * (errno != 0) without clearing it, letting a stale value from an
	 * earlier call cause a spurious rejection.
	 */
	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	return pm;
}
/*
 * Parse a decimal numeric option provided at run time.
 *
 * @param q_arg            NUL-terminated decimal string.
 * @param max_valid_value  largest value accepted.
 * @return the parsed value on success, -1 on parse error or when the
 *         value exceeds max_valid_value.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	/*
	 * Reset errno before strtoul(): it is left untouched on success,
	 * so the (errno != 0) check below would otherwise reject valid
	 * input whenever a previous call had left errno set (the visible
	 * original tested errno without clearing it).
	 */
	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}
/*
 * Print the application usage/help text to the vhost config log.
 * NOTE(review): the function's return type, braces and the trailing
 * prgname argument of the format call are missing from this extract;
 * lines kept verbatim.
 */
560 us_vhost_usage(const char *prgname)
562 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
564 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
565 " --dev-basename <name>\n"
567 " -p PORTMASK: Set mask for ports to be used by application\n"
568 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
569 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
570 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
571 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
572 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
573 " --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
574 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
575 " --dev-basename: The basename to be used for the character device.\n"
576 " --zero-copy [0|1]: disable(default)/enable rx/tx "
578 " --rx-desc-num [0-N]: the number of descriptors on rx, "
579 "used only when zero copy is enabled.\n"
580 " --tx-desc-num [0-N]: the number of descriptors on tx, "
581 "used only when zero copy is enabled.\n",
/*
 * Parse the application command line: short options -p (portmask) and
 * -P (promiscuous), plus the long options declared in long_option[].
 * Each long option is dispatched by strncmp on its name and validated
 * with parse_num_opt()/parse_portmask(); the final section builds the
 * ports[] array from the mask and sanity-checks the zero-copy mode
 * combinations.
 *
 * NOTE(review): the switch labels, break statements, local declarations
 * (opt/ret/i/option_index) and most closing braces are missing from
 * this extract; lines kept verbatim rather than reconstructed.
 */
586 * Parse the arguments given in the command line of the application.
589 us_vhost_parse_args(int argc, char **argv)
594 const char *prgname = argv[0];
595 static struct option long_option[] = {
596 {"vm2vm", required_argument, NULL, 0},
597 {"rx-retry", required_argument, NULL, 0},
598 {"rx-retry-delay", required_argument, NULL, 0},
599 {"rx-retry-num", required_argument, NULL, 0},
600 {"mergeable", required_argument, NULL, 0},
601 {"vlan-strip", required_argument, NULL, 0},
602 {"stats", required_argument, NULL, 0},
603 {"dev-basename", required_argument, NULL, 0},
604 {"zero-copy", required_argument, NULL, 0},
605 {"rx-desc-num", required_argument, NULL, 0},
606 {"tx-desc-num", required_argument, NULL, 0},
610 /* Parse command line */
611 while ((opt = getopt_long(argc, argv, "p:P",
612 long_option, &option_index)) != EOF) {
616 enabled_port_mask = parse_portmask(optarg);
617 if (enabled_port_mask == 0) {
618 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
619 us_vhost_usage(prgname);
/* NOTE(review): the -P (promiscuous) case below also enables
 * VIRTIO_NET_F_CTRL_RX so guests can control RX filtering. */
626 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
627 ETH_VMDQ_ACCEPT_BROADCAST |
628 ETH_VMDQ_ACCEPT_MULTICAST;
629 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
634 /* Enable/disable vm2vm comms. */
635 if (!strncmp(long_option[option_index].name, "vm2vm",
637 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
639 RTE_LOG(INFO, VHOST_CONFIG,
640 "Invalid argument for "
642 us_vhost_usage(prgname);
645 vm2vm_mode = (vm2vm_type)ret;
649 /* Enable/disable retries on RX. */
650 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
651 ret = parse_num_opt(optarg, 1);
653 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
654 us_vhost_usage(prgname);
661 /* Specify the retries delay time (in useconds) on RX. */
662 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
663 ret = parse_num_opt(optarg, INT32_MAX);
665 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
666 us_vhost_usage(prgname);
669 burst_rx_delay_time = ret;
673 /* Specify the retries number on RX. */
674 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
675 ret = parse_num_opt(optarg, INT32_MAX);
677 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
678 us_vhost_usage(prgname);
681 burst_rx_retry_num = ret;
685 /* Enable/disable RX mergeable buffers. */
686 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
687 ret = parse_num_opt(optarg, 1);
689 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
690 us_vhost_usage(prgname);
/* Mergeable buffers imply jumbo-frame support on the physical port. */
695 vmdq_conf_default.rxmode.jumbo_frame = 1;
696 vmdq_conf_default.rxmode.max_rx_pkt_len
697 = JUMBO_FRAME_MAX_SIZE;
702 /* Enable/disable RX VLAN strip on host. */
703 if (!strncmp(long_option[option_index].name,
704 "vlan-strip", MAX_LONG_OPT_SZ)) {
705 ret = parse_num_opt(optarg, 1);
707 RTE_LOG(INFO, VHOST_CONFIG,
708 "Invalid argument for VLAN strip [0|1]\n");
709 us_vhost_usage(prgname);
713 vmdq_conf_default.rxmode.hw_vlan_strip =
718 /* Enable/disable stats. */
719 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
720 ret = parse_num_opt(optarg, INT32_MAX);
722 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
723 us_vhost_usage(prgname);
730 /* Set character device basename. */
731 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
732 if (us_vhost_parse_basename(optarg) == -1) {
733 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
734 us_vhost_usage(prgname);
739 /* Enable/disable rx/tx zero copy. */
740 if (!strncmp(long_option[option_index].name,
741 "zero-copy", MAX_LONG_OPT_SZ)) {
742 ret = parse_num_opt(optarg, 1);
744 RTE_LOG(INFO, VHOST_CONFIG,
746 " for zero-copy [0|1]\n");
747 us_vhost_usage(prgname);
753 /* Specify the descriptor number on RX. */
754 if (!strncmp(long_option[option_index].name,
755 "rx-desc-num", MAX_LONG_OPT_SZ)) {
756 ret = parse_num_opt(optarg, MAX_RING_DESC);
757 if ((ret == -1) || (!POWEROF2(ret))) {
758 RTE_LOG(INFO, VHOST_CONFIG,
759 "Invalid argument for rx-desc-num[0-N],"
760 "power of 2 required.\n");
761 us_vhost_usage(prgname);
764 num_rx_descriptor = ret;
768 /* Specify the descriptor number on TX. */
769 if (!strncmp(long_option[option_index].name,
770 "tx-desc-num", MAX_LONG_OPT_SZ)) {
771 ret = parse_num_opt(optarg, MAX_RING_DESC);
772 if ((ret == -1) || (!POWEROF2(ret))) {
773 RTE_LOG(INFO, VHOST_CONFIG,
774 "Invalid argument for tx-desc-num [0-N],"
775 "power of 2 required.\n");
776 us_vhost_usage(prgname);
779 num_tx_descriptor = ret;
785 /* Invalid option - print options. */
787 us_vhost_usage(prgname);
/* Expand the port mask into the ports[] array. */
792 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
793 if (enabled_port_mask & (1 << i))
794 ports[num_ports++] = (uint8_t)i;
797 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
798 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
799 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
803 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
804 RTE_LOG(INFO, VHOST_PORT,
805 "Vhost zero copy doesn't support software vm2vm,"
806 "please specify 'vm2vm 2' to use hardware vm2vm.\n");
810 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
811 RTE_LOG(INFO, VHOST_PORT,
812 "Vhost zero copy doesn't support jumbo frame,"
813 "please specify '--mergeable 0' to disable the "
814 "mergeable feature.\n");
/*
 * Clamp the user-requested ports to what the system actually has:
 * caps num_ports at nb_ports and invalidates any out-of-range port IDs.
 * NOTE(review): valid_num_ports is captured BEFORE clamping, so the
 * returned count reflects the command line, not the clamped array;
 * local declaration of portid and closing braces are missing from this
 * extract.
 */
822 * Update the global var NUM_PORTS and array PORTS according to system ports number
823 * and return valid ports number
825 static unsigned check_ports_num(unsigned nb_ports)
827 unsigned valid_num_ports = num_ports;
830 if (num_ports > nb_ports) {
831 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
832 num_ports, nb_ports);
833 num_ports = nb_ports;
836 for (portid = 0; portid < num_ports; portid ++) {
837 if (ports[portid] >= nb_ports) {
838 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
839 ports[portid], (nb_ports - 1));
840 ports[portid] = INVALID_PORT_ID;
844 return valid_num_ports;
/*
 * Debug-only hex dump of a packet (or header) into a stack buffer,
 * emitted via LOG_DEBUG. Compiled to a no-op (second definition) when
 * debugging is disabled so the data path is unaffected.
 * NOTE(review): the surrounding #ifdef/#else/#endif lines and the
 * header/packet branch of the dump are missing from this extract.
 */
848 * Macro to print out packet contents. Wrapped in debug define so that the
849 * data path is not effected when debug is disabled.
852 #define PRINT_PACKET(device, addr, size, header) do { \
853 char *pkt_addr = (char*)(addr); \
854 unsigned int index; \
855 char packet[MAX_PRINT_BUFF]; \
858 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
860 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
861 for (index = 0; index < (size); index++) { \
862 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
863 "%02hhx ", pkt_addr[index]); \
865 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
867 LOG_DEBUG(VHOST_DATA, "%s", packet); \
870 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
/*
 * Translate a guest physical address to a host physical address by
 * scanning the device's regions_hpa table. *addr_type reports whether
 * the [guest_pa, guest_pa+buf_len) range sits entirely inside one
 * sub-region (CONTINUOUS), crosses its end (CROSS_SUBREG), or matched
 * no region at all (INVALID, with 0 returned).
 * NOTE(review): the regionidx declaration, loop braces and the final
 * return of vhost_pa are missing from this extract; kept verbatim.
 */
874 * Function to convert guest physical addresses to vhost physical addresses.
875 * This is used to convert virtio buffer addresses.
877 static inline uint64_t __attribute__((always_inline))
878 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
879 uint32_t buf_len, hpa_type *addr_type)
881 struct virtio_memory_regions_hpa *region;
883 uint64_t vhost_pa = 0;
885 *addr_type = PHYS_ADDR_INVALID;
887 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
888 region = &vdev->regions_hpa[regionidx];
889 if ((guest_pa >= region->guest_phys_address) &&
890 (guest_pa <= region->guest_phys_address_end)) {
891 vhost_pa = region->host_phys_addr_offset + guest_pa;
892 if (likely((guest_pa + buf_len - 1)
893 <= region->guest_phys_address_end))
894 *addr_type = PHYS_ADDR_CONTINUOUS;
896 *addr_type = PHYS_ADDR_CROSS_SUBREG;
901 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
902 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
903 (void *)(uintptr_t)vhost_pa);
/*
 * Compares a packet destination MAC address to a device MAC address;
 * returns non-zero when the two 6-byte addresses are equal.
 *
 * The visible original read both addresses through a uint64_t cast,
 * i.e. it loaded 8 bytes from a 6-byte struct ether_addr -- an
 * out-of-bounds read and a strict-aliasing violation, and the
 * MAC_ADDR_CMP low-48-bit mask only selected the address bytes on
 * little-endian hosts. memcmp() over the 6 address bytes is
 * well-defined, endian-independent, and compiles to comparable code.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return memcmp(ea, eb, 6) == 0; /* 6 == ETHER_ADDR_LEN */
}
/*
 * Learn the guest's MAC from the source address of its first packet,
 * warn if another registered device already uses it, assign the
 * vlan_tags[] entry for this device_fh, program the MAC into the NIC's
 * VMDQ pool, enable VLAN stripping on the device's RX queue and mark
 * the device ready for RX.
 * NOTE(review): locals (i, ret), early-return on duplicate MAC, error
 * branch bodies and closing braces are missing from this extract.
 */
918 * This function learns the MAC address of the device and registers this along with a
919 * vlan tag to a VMDQ.
922 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
924 struct ether_hdr *pkt_hdr;
925 struct virtio_net_data_ll *dev_ll;
926 struct virtio_net *dev = vdev->dev;
929 /* Learn MAC address of guest device from packet */
930 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
932 dev_ll = ll_root_used;
934 while (dev_ll != NULL) {
935 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
936 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
939 dev_ll = dev_ll->next;
942 for (i = 0; i < ETHER_ADDR_LEN; i++)
943 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
945 /* vlan_tag currently uses the device_id. */
946 vdev->vlan_tag = vlan_tags[dev->device_fh];
948 /* Print out VMDQ registration info. */
949 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
951 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
952 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
953 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
956 /* Register the MAC address. */
957 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
958 (uint32_t)dev->device_fh + vmdq_pool_base);
960 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
963 /* Enable stripping of the vlan tag as we handle routing. */
965 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
966 (uint16_t)vdev->vmdq_rx_q, 1);
968 /* Set device as ready for RX. */
969 vdev->ready = DEVICE_RX;
/*
 * Undo link_vmdq(): remove the learned MAC from the NIC, zero the
 * cached address, then drain and free any packets still pending on the
 * device's VMDQ RX queue before dropping back to MAC-learning state.
 * NOTE(review): locals (i, rx_count), the drain loop structure and
 * closing braces are missing from this extract; the repeated
 * rte_eth_rx_burst is the re-poll of that drain loop.
 */
975 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
976 * queue before disabling RX on the device.
979 unlink_vmdq(struct vhost_dev *vdev)
983 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
985 if (vdev->ready == DEVICE_RX) {
986 /*clear MAC and VLAN settings*/
987 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
988 for (i = 0; i < 6; i++)
989 vdev->mac_address.addr_bytes[i] = 0;
993 /*Clear out the receive buffers*/
994 rx_count = rte_eth_rx_burst(ports[0],
995 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
998 for (i = 0; i < rx_count; i++)
999 rte_pktmbuf_free(pkts_burst[i]);
1001 rx_count = rte_eth_rx_burst(ports[0],
1002 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1005 vdev->ready = DEVICE_MAC_LEARNING;
/*
 * Software VM2VM path: if the packet's destination MAC belongs to a
 * local, RX-ready virtio device, enqueue it straight to that device
 * via rte_vhost_enqueue_burst() and update the RX/TX statistics.
 * Packets destined for the sending device itself, or for a device
 * marked for removal, are dropped.
 * NOTE(review): the ret declaration, return statements, the
 * enable_stats guard around the atomic counters and several braces are
 * missing from this extract; kept verbatim.
 */
1010 * Check if the packet destination MAC address is for a local device. If so then put
1011 * the packet on that devices RX queue. If not then return.
1013 static inline int __attribute__((always_inline))
1014 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1016 struct virtio_net_data_ll *dev_ll;
1017 struct ether_hdr *pkt_hdr;
1019 struct virtio_net *dev = vdev->dev;
1020 struct virtio_net *tdev; /* destination virito device */
1022 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1024 /*get the used devices list*/
1025 dev_ll = ll_root_used;
1027 while (dev_ll != NULL) {
1028 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1029 &dev_ll->vdev->mac_address)) {
1031 /* Drop the packet if the TX packet is destined for the TX device. */
1032 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1033 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1037 tdev = dev_ll->vdev->dev;
1040 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1042 if (unlikely(dev_ll->vdev->remove)) {
1043 /*drop the packet if the device is marked for removal*/
1044 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1046 /*send the packet to the local virtio device*/
1047 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1050 &dev_statistics[tdev->device_fh].rx_total_atomic,
1053 &dev_statistics[tdev->device_fh].rx_atomic,
1055 dev_statistics[dev->device_fh].tx_total++;
1056 dev_statistics[dev->device_fh].tx += ret;
1062 dev_ll = dev_ll->next;
/*
 * Hardware VM2VM path: walk the used-device list looking for an
 * RX-ready device whose MAC matches the packet's destination. On a
 * match, output the destination's VLAN tag and an offset of VLAN_HLEN
 * (to restore the length the NIC's VLAN strip removed). Packets
 * addressed back to the sender are rejected.
 * NOTE(review): the return statements and several braces are missing
 * from this extract; kept verbatim.
 */
1069 * Check if the destination MAC of a packet is one local VM,
1070 * and get its vlan tag, and offset if it is.
1072 static inline int __attribute__((always_inline))
1073 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1074 uint32_t *offset, uint16_t *vlan_tag)
1076 struct virtio_net_data_ll *dev_ll = ll_root_used;
1077 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1079 while (dev_ll != NULL) {
1080 if ((dev_ll->vdev->ready == DEVICE_RX)
1081 && ether_addr_cmp(&(pkt_hdr->d_addr),
1082 &dev_ll->vdev->mac_address)) {
1084 * Drop the packet if the TX packet is
1085 * destined for the TX device.
1087 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1088 LOG_DEBUG(VHOST_DATA,
1089 "(%"PRIu64") TX: Source and destination"
1090 " MAC addresses are the same. Dropping "
1092 dev_ll->vdev->dev->device_fh);
1097 * HW vlan strip will reduce the packet length
1098 * by minus length of vlan tag, so need restore
1099 * the packet length by plus it.
1101 *offset = VLAN_HLEN;
1104 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1106 LOG_DEBUG(VHOST_DATA,
1107 "(%"PRIu64") TX: pkt to local VM device id:"
1108 "(%"PRIu64") vlan tag: %d.\n",
1109 dev->device_fh, dev_ll->vdev->dev->device_fh,
1114 dev_ll = dev_ll->next;
/*
 * Route one TX packet from a virtio device: try the software VM2VM
 * path first (virtio_tx_local), or under hardware VM2VM resolve the
 * destination's VLAN tag/offset via find_local_dest. Otherwise tag the
 * mbuf for VLAN insertion (or patch an existing guest-inserted tag),
 * grow the packet by the strip offset when needed, and append it to
 * this lcore's TX queue, flushing the whole burst to the physical port
 * once MAX_PKT_BURST packets have accumulated.
 * NOTE(review): returns after the local/drop paths, the len
 * read/increment around m_table[], the enable_stats guard and the
 * queue-reset after the burst are among the lines missing from this
 * extract; kept verbatim.
 */
1120 * This function routes the TX packet to the correct interface. This may be a local device
1121 * or the physical port.
1123 static inline void __attribute__((always_inline))
1124 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1126 struct mbuf_table *tx_q;
1127 struct rte_mbuf **m_table;
1128 unsigned len, ret, offset = 0;
1129 const uint16_t lcore_id = rte_lcore_id();
1130 struct virtio_net *dev = vdev->dev;
1131 struct ether_hdr *nh;
1133 /*check if destination is local VM*/
1134 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1135 rte_pktmbuf_free(m);
1139 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1140 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1141 rte_pktmbuf_free(m);
1146 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1148 /*Add packet to the port tx queue*/
1149 tx_q = &lcore_tx_queue[lcore_id];
1152 nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1153 if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1154 /* Guest has inserted the vlan tag. */
1155 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1156 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1157 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1158 (vh->vlan_tci != vlan_tag_be))
1159 vh->vlan_tci = vlan_tag_be;
1161 m->ol_flags = PKT_TX_VLAN_PKT;
1164 * Find the right seg to adjust the data len when offset is
1165 * bigger than tail room size.
1167 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1168 if (likely(offset <= rte_pktmbuf_tailroom(m)))
1169 m->data_len += offset;
1171 struct rte_mbuf *seg = m;
1173 while ((seg->next != NULL) &&
1174 (offset > rte_pktmbuf_tailroom(seg)))
1177 seg->data_len += offset;
1179 m->pkt_len += offset;
1182 m->vlan_tci = vlan_tag;
1185 tx_q->m_table[len] = m;
1188 dev_statistics[dev->device_fh].tx_total++;
1189 dev_statistics[dev->device_fh].tx++;
1192 if (unlikely(len == MAX_PKT_BURST)) {
1193 m_table = (struct rte_mbuf **)tx_q->m_table;
1194 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1195 /* Free any buffers not handled by TX and update the port stats. */
1196 if (unlikely(ret < len)) {
1198 rte_pktmbuf_free(m_table[ret]);
1199 } while (++ret < len);
1209 * This function is called by each data core. It handles all RX/TX registered with the
1210 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1211 * with all devices in the main linked list.
1214 switch_worker(__attribute__((unused)) void *arg)
/* Non-zero-copy data-plane loop, one instance per slave lcore:
 *  - drain this lcore's port TX table when drain_tsc elapses;
 *  - for each device on this lcore's list: port RX -> guest enqueue,
 *    then guest dequeue -> virtio_tx_route(). */
1216 struct rte_mempool *mbuf_pool = arg;
1217 struct virtio_net *dev = NULL;
1218 struct vhost_dev *vdev = NULL;
1219 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1220 struct virtio_net_data_ll *dev_ll;
1221 struct mbuf_table *tx_q;
1222 volatile struct lcore_ll_info *lcore_ll;
/* BURST_TX_DRAIN_US expressed in TSC cycles (rounded up). */
1223 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1224 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1226 const uint16_t lcore_id = rte_lcore_id();
1227 const uint16_t num_cores = (uint16_t)rte_lcore_count();
1228 uint16_t rx_count = 0;
1232 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1233 lcore_ll = lcore_info[lcore_id].lcore_ll;
1236 tx_q = &lcore_tx_queue[lcore_id];
1237 for (i = 0; i < num_cores; i ++) {
1238 if (lcore_ids[i] == lcore_id) {
1245 cur_tsc = rte_rdtsc();
1247 * TX burst queue drain
1249 diff_tsc = cur_tsc - prev_tsc;
1250 if (unlikely(diff_tsc > drain_tsc)) {
1253 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1255 /*Tx any packets in the queue*/
1256 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1257 (struct rte_mbuf **)tx_q->m_table,
1258 (uint16_t)tx_q->len);
/* Whatever the NIC did not accept is freed, not retried. */
1259 if (unlikely(ret < tx_q->len)) {
1261 rte_pktmbuf_free(tx_q->m_table[ret]);
1262 } while (++ret < tx_q->len);
1272 rte_prefetch0(lcore_ll->ll_root_used);
1274 * Inform the configuration core that we have exited the linked list and that no devices are
1275 * in use if requested.
1277 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1278 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1283 dev_ll = lcore_ll->ll_root_used;
1285 while (dev_ll != NULL) {
1286 /*get virtio device ID*/
1287 vdev = dev_ll->vdev;
/* A device pending removal is skipped and marked safe once this
 * core will no longer touch it. */
1290 if (unlikely(vdev->remove)) {
1291 dev_ll = dev_ll->next;
1293 vdev->ready = DEVICE_SAFE_REMOVE;
1296 if (likely(vdev->ready == DEVICE_RX)) {
1298 rx_count = rte_eth_rx_burst(ports[0],
1299 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1303 * Retry is enabled and the queue is full then we wait and retry to avoid packet loss
1304 * Here MAX_PKT_BURST must be less than virtio queue size
1306 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1307 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1308 rte_delay_us(burst_rx_delay_time);
1309 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1313 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1316 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1319 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
/* Free any packets the guest RX ring could not accept. */
1321 while (likely(rx_count)) {
1323 rte_pktmbuf_free(pkts_burst[rx_count]);
1329 if (likely(!vdev->remove)) {
1330 /* Handle guest TX*/
1331 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1332 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1333 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1334 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1336 rte_pktmbuf_free(pkts_burst[--tx_count]);
1340 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1343 /*move to the next device in the list*/
1344 dev_ll = dev_ll->next;
1352 * This function gets available ring number for zero copy rx.
1353 * Only one thread will call this funciton for a paticular virtio device,
1354 * so, it is designed as non-thread-safe function.
1356 static inline uint32_t __attribute__((always_inline))
1357 get_available_ring_num_zcp(struct virtio_net *dev)
1359 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
/* Volatile read so the guest's latest published avail->idx is seen. */
1362 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
/* uint16_t wrap-around subtraction yields the number of entries
 * available beyond the last reserved index. */
1363 return (uint32_t)(avail_idx - vq->last_used_idx_res);
1367 * This function gets available ring index for zero copy rx,
1368 * it will retry 'burst_rx_retry_num' times till it get enough ring index.
1369 * Only one thread will call this funciton for a paticular virtio device,
1370 * so, it is designed as non-thread-safe function.
1372 static inline uint32_t __attribute__((always_inline))
1373 get_available_ring_index_zcp(struct virtio_net *dev,
1374 uint16_t *res_base_idx, uint32_t count)
/* Reserve up to 'count' RX avail-ring entries starting at
 * *res_base_idx; returns the number actually reserved (0 on failure).
 * Single-consumer by design, so the reservation needs no atomics. */
1376 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1379 uint16_t free_entries;
1381 *res_base_idx = vq->last_used_idx_res;
/* Volatile read: observe the guest's current avail index. */
1382 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1383 free_entries = (avail_idx - *res_base_idx);
1385 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1387 "res base idx:%d, free entries:%d\n",
1388 dev->device_fh, avail_idx, *res_base_idx,
1392 * If retry is enabled and the queue is full then we wait
1393 * and retry to avoid packet loss.
1395 if (enable_retry && unlikely(count > free_entries)) {
1396 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1397 rte_delay_us(burst_rx_delay_time);
1398 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1399 free_entries = (avail_idx - *res_base_idx);
1400 if (count <= free_entries)
1405 /*check that we have enough buffers*/
1406 if (unlikely(count > free_entries))
1407 count = free_entries;
1409 if (unlikely(count == 0)) {
1410 LOG_DEBUG(VHOST_DATA,
1411 "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1412 "avail idx: %d, res base idx:%d, free entries:%d\n",
1413 dev->device_fh, avail_idx,
1414 *res_base_idx, free_entries);
/* Commit the reservation for the next caller. */
1418 vq->last_used_idx_res = *res_base_idx + count;
1424 * This function put descriptor back to used list.
1426 static inline void __attribute__((always_inline))
1427 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
/* Return a single descriptor straight to the used ring with len 0
 * (used on error paths where no data was placed in the buffer),
 * then notify the guest if it has not suppressed interrupts. */
1429 uint16_t res_cur_idx = vq->last_used_idx;
1430 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1431 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
/* Barrier: the ring entry must be globally visible before the
 * used index is published to the guest. */
1432 rte_compiler_barrier();
1433 *(volatile uint16_t *)&vq->used->idx += 1;
1434 vq->last_used_idx += 1;
1436 /* Kick the guest if necessary. */
1437 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1438 eventfd_write(vq->callfd, (eventfd_t)1);
1442 * This function get available descriptor from vitio vring and un-attached mbuf
1443 * from vpool->ring, and then attach them together. It needs adjust the offset
1444 * for buff_addr and phys_addr accroding to PMD implementation, otherwise the
1445 * frame data may be put to wrong location in mbuf.
1447 static inline void __attribute__((always_inline))
1448 attach_rxmbuf_zcp(struct virtio_net *dev)
/* Zero-copy RX setup: take one guest RX descriptor and one detached
 * mbuf from the device's vpool ring, and point the mbuf's buffer at
 * the guest frame buffer so the NIC DMAs directly into guest memory.
 * On any failure the descriptor is returned via
 * put_desc_to_used_list_zcp(). */
1450 uint16_t res_base_idx, desc_idx;
1451 uint64_t buff_addr, phys_addr;
1452 struct vhost_virtqueue *vq;
1453 struct vring_desc *desc;
1455 struct rte_mbuf *mbuf;
1456 struct vpool *vpool;
1458 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1460 vpool = &vpool_array[vdev->vmdq_rx_q];
1461 vq = dev->virtqueue[VIRTIO_RXQ];
/* Reserve exactly one avail-ring slot; bail out if none. */
1464 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1467 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1469 desc = &vq->desc[desc_idx];
/* Chained descriptor: virtio header in the first desc, frame data in
 * the next; otherwise the frame follows the header in one buffer. */
1470 if (desc->flags & VRING_DESC_F_NEXT) {
1471 desc = &vq->desc[desc->next];
1472 buff_addr = gpa_to_vva(dev, desc->addr);
1473 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1476 buff_addr = gpa_to_vva(dev,
1477 desc->addr + vq->vhost_hlen);
1478 phys_addr = gpa_to_hpa(vdev,
1479 desc->addr + vq->vhost_hlen,
1480 desc->len, &addr_type);
1483 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1484 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1485 " address found when attaching RX frame buffer"
1486 " address!\n", dev->device_fh);
1487 put_desc_to_used_list_zcp(vq, desc_idx);
1492 * Check if the frame buffer address from guest crosses
1493 * sub-region or not.
1495 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1496 RTE_LOG(ERR, VHOST_DATA,
1497 "(%"PRIu64") Frame buffer address cross "
1498 "sub-regioin found when attaching RX frame "
1499 "buffer address!\n",
1501 put_desc_to_used_list_zcp(vq, desc_idx);
/* Keep trying descriptors until one with a valid host physical
 * address is found. */
1504 } while (unlikely(phys_addr == 0));
1506 rte_ring_sc_dequeue(vpool->ring, &obj);
1508 if (unlikely(mbuf == NULL)) {
1509 LOG_DEBUG(VHOST_DATA,
1510 "(%"PRIu64") in attach_rxmbuf_zcp: "
1511 "ring_sc_dequeue fail.\n",
1513 put_desc_to_used_list_zcp(vq, desc_idx);
/* Guest buffer must be large enough for a full frame; otherwise
 * give both the descriptor and the mbuf back. */
1517 if (unlikely(vpool->buf_size > desc->len)) {
1518 LOG_DEBUG(VHOST_DATA,
1519 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1520 "length(%d) of descriptor idx: %d less than room "
1521 "size required: %d\n",
1522 dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1523 put_desc_to_used_list_zcp(vq, desc_idx);
1524 rte_ring_sp_enqueue(vpool->ring, obj);
/* Re-point the mbuf at the guest buffer (headroom accounted for)
 * and stash the descriptor index in the mbuf headroom so it can be
 * recovered when the frame completes. */
1528 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1529 mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1530 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1531 mbuf->data_len = desc->len;
1532 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1534 LOG_DEBUG(VHOST_DATA,
1535 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1536 "descriptor idx:%d\n",
1537 dev->device_fh, res_base_idx, desc_idx);
/* NOTE(review): this raw free appears to belong to an error path in
 * the full source -- confirm before relying on it. */
1539 __rte_mbuf_raw_free(mbuf);
1545 * Detach an attched packet mbuf -
1546 * - restore original mbuf address and length values.
1547 * - reset pktmbuf data and data_len to their default values.
1548 * All other fields of the given packet mbuf will be left intact.
1551 * The attached packet mbuf.
1553 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1555 const struct rte_mempool *mp = m->pool;
1556 void *buf = rte_mbuf_to_baddr(m);
/* Restore the default buffer address/length derived from the mbuf's
 * own mempool element, undoing the guest-buffer attachment done in
 * attach_rxmbuf_zcp(). */
1558 uint32_t buf_len = mp->elt_size - sizeof(*m);
1559 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1562 m->buf_len = (uint16_t)buf_len;
/* Re-establish the standard headroom (clamped to the buffer size). */
1564 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1565 RTE_PKTMBUF_HEADROOM : m->buf_len;
1566 m->data_off = buf_ofs;
1572 * This function is called after packets have been transimited. It fetchs mbuf
1573 * from vpool->pool, detached it and put into vpool->ring. It also update the
1574 * used index and kick the guest if necessary.
1576 static inline uint32_t __attribute__((always_inline))
1577 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
/* Reclaim completed TX mbufs: detach each one, recycle it to
 * vpool->ring, return its descriptor (saved in the mbuf headroom by
 * virtio_tx_route_zcp) to the guest's used ring, then publish the
 * new used index and kick the guest. */
1579 struct rte_mbuf *mbuf;
1580 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1581 uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1583 uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1585 LOG_DEBUG(VHOST_DATA,
1586 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1588 dev->device_fh, mbuf_count);
1589 LOG_DEBUG(VHOST_DATA,
1590 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1592 dev->device_fh, rte_ring_count(vpool->ring));
1594 for (index = 0; index < mbuf_count; index++) {
1595 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1596 if (likely(MBUF_EXT_MEM(mbuf)))
1597 pktmbuf_detach_zcp(mbuf);
1598 rte_ring_sp_enqueue(vpool->ring, mbuf);
1600 /* Update used index buffer information. */
1601 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1602 vq->used->ring[used_idx].len = 0;
1604 used_idx = (used_idx + 1) & (vq->size - 1);
1607 LOG_DEBUG(VHOST_DATA,
1608 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1610 dev->device_fh, rte_mempool_count(vpool->pool));
1611 LOG_DEBUG(VHOST_DATA,
1612 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1614 dev->device_fh, rte_ring_count(vpool->ring));
1615 LOG_DEBUG(VHOST_DATA,
1616 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1617 "vq->last_used_idx:%d\n",
1618 dev->device_fh, vq->last_used_idx);
1620 vq->last_used_idx += mbuf_count;
1622 LOG_DEBUG(VHOST_DATA,
1623 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1624 "vq->last_used_idx:%d\n",
1625 dev->device_fh, vq->last_used_idx);
/* Ring entries must be visible before the index update. */
1627 rte_compiler_barrier();
1629 *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1631 /* Kick guest if required. */
1632 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1633 eventfd_write(vq->callfd, (eventfd_t)1);
1639 * This function is called when a virtio device is destroy.
1640 * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring.
1642 static void mbuf_destroy_zcp(struct vpool *vpool)
/* Device teardown: drain the vpool mempool, detach each mbuf from
 * its guest buffer, and park all mbufs on vpool->ring. */
1644 struct rte_mbuf *mbuf = NULL;
1645 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1647 LOG_DEBUG(VHOST_CONFIG,
1648 "in mbuf_destroy_zcp: mbuf count in mempool before "
1649 "mbuf_destroy_zcp is: %d\n",
1651 LOG_DEBUG(VHOST_CONFIG,
1652 "in mbuf_destroy_zcp: mbuf count in ring before "
1653 "mbuf_destroy_zcp is : %d\n",
1654 rte_ring_count(vpool->ring));
1656 for (index = 0; index < mbuf_count; index++) {
1657 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1658 if (likely(mbuf != NULL)) {
1659 if (likely(MBUF_EXT_MEM(mbuf)))
1660 pktmbuf_detach_zcp(mbuf);
1661 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1665 LOG_DEBUG(VHOST_CONFIG,
1666 "in mbuf_destroy_zcp: mbuf count in mempool after "
1667 "mbuf_destroy_zcp is: %d\n",
1668 rte_mempool_count(vpool->pool));
1669 LOG_DEBUG(VHOST_CONFIG,
1670 "in mbuf_destroy_zcp: mbuf count in ring after "
1671 "mbuf_destroy_zcp is : %d\n",
1672 rte_ring_count(vpool->ring));
1676 * This function update the use flag and counter.
1678 static inline uint32_t __attribute__((always_inline))
1679 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
/* Zero-copy guest RX completion: the frame data is ALREADY in guest
 * memory (the mbufs were attached to guest buffers before the NIC
 * RX), so only the virtio header is written, descriptor lengths set,
 * and the used ring updated/published. */
1682 struct vhost_virtqueue *vq;
1683 struct vring_desc *desc;
1684 struct rte_mbuf *buff;
1685 /* The virtio_hdr is initialised to 0. */
1686 struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1687 = {{0, 0, 0, 0, 0, 0}, 0};
1688 uint64_t buff_hdr_addr = 0;
1689 uint32_t head[MAX_PKT_BURST], packet_len = 0;
1690 uint32_t head_idx, packet_success = 0;
1691 uint16_t res_cur_idx;
1693 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1698 vq = dev->virtqueue[VIRTIO_RXQ];
1699 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1701 res_cur_idx = vq->last_used_idx;
1702 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1703 dev->device_fh, res_cur_idx, res_cur_idx + count);
1705 /* Retrieve all of the head indexes first to avoid caching issues. */
/* Each mbuf's headroom holds the descriptor index it was attached
 * to in attach_rxmbuf_zcp(). */
1706 for (head_idx = 0; head_idx < count; head_idx++)
1707 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1709 /*Prefetch descriptor index. */
1710 rte_prefetch0(&vq->desc[head[packet_success]]);
1712 while (packet_success != count) {
1713 /* Get descriptor from available ring */
1714 desc = &vq->desc[head[packet_success]];
1716 buff = pkts[packet_success];
1717 LOG_DEBUG(VHOST_DATA,
1718 "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1719 "pkt[%d] descriptor idx: %d\n",
1720 dev->device_fh, packet_success,
1721 MBUF_HEADROOM_UINT32(buff));
1724 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1725 + RTE_PKTMBUF_HEADROOM),
1726 rte_pktmbuf_data_len(buff), 0);
1728 /* Buffer address translation for virtio header. */
1729 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1730 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1733 * If the descriptors are chained the header and data are
1734 * placed in separate buffers.
1736 if (desc->flags & VRING_DESC_F_NEXT) {
1737 desc->len = vq->vhost_hlen;
1738 desc = &vq->desc[desc->next];
1739 desc->len = rte_pktmbuf_data_len(buff);
1741 desc->len = packet_len;
1744 /* Update used ring with desc information */
1745 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1746 = head[packet_success];
1747 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1752 /* A header is required per buffer. */
1753 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1754 (const void *)&virtio_hdr, vq->vhost_hlen);
1756 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1758 if (likely(packet_success < count)) {
1759 /* Prefetch descriptor index. */
1760 rte_prefetch0(&vq->desc[head[packet_success]]);
/* Ensure all used-ring writes are visible before the index is
 * published to the guest. */
1764 rte_compiler_barrier();
1766 LOG_DEBUG(VHOST_DATA,
1767 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1768 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1769 dev->device_fh, vq->last_used_idx, vq->used->idx);
1771 *(volatile uint16_t *)&vq->used->idx += count;
1772 vq->last_used_idx += count;
1774 LOG_DEBUG(VHOST_DATA,
1775 "(%"PRIu64") in dev_rx_zcp: after update used idx: "
1776 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1777 dev->device_fh, vq->last_used_idx, vq->used->idx);
1779 /* Kick the guest if necessary. */
1780 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1781 eventfd_write(vq->callfd, (eventfd_t)1);
1787 * This function routes the TX packet to the correct interface.
1788 * This may be a local device or the physical port.
1790 static inline void __attribute__((always_inline))
1791 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1792 uint32_t desc_idx, uint8_t need_copy)
/* Zero-copy TX routing: wrap the guest buffer described by the dummy
 * mbuf 'm' (built in virtio_dev_tx_zcp) into a real mbuf from the TX
 * vpool and queue it on the per-VMDQ-queue TX table.  desc_idx is
 * carried in the mbuf headroom so the descriptor can be returned in
 * txmbuf_clean_zcp(); need_copy forces a data copy instead of a
 * pointer swap. */
1794 struct mbuf_table *tx_q;
1795 struct rte_mbuf **m_table;
1797 struct rte_mbuf *mbuf;
1798 unsigned len, ret, offset = 0;
1799 struct vpool *vpool;
1800 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1801 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1803 /*Add packet to the port tx queue*/
1804 tx_q = &tx_queue_zcp[vmdq_rx_q];
1807 /* Allocate an mbuf and populate the structure. */
1808 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1809 rte_ring_sc_dequeue(vpool->ring, &obj);
/* No free TX mbuf: return the descriptor directly to the guest. */
1811 if (unlikely(mbuf == NULL)) {
1812 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1813 RTE_LOG(ERR, VHOST_DATA,
1814 "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1816 put_desc_to_used_list_zcp(vq, desc_idx);
1820 if (vm2vm_mode == VM2VM_HARDWARE) {
1821 /* Avoid using a vlan tag from any vm for external pkt, such as
1822 * vlan_tags[dev->device_fh], oterwise, it conflicts when pool
1823 * selection, MAC address determines it as an external pkt
1824 * which should go to network, while vlan tag determine it as
1825 * a vm2vm pkt should forward to another vm. Hardware confuse
1826 * such a ambiguous situation, so pkt will lost.
1828 vlan_tag = external_pkt_default_vlan_tag;
1829 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1830 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1831 __rte_mbuf_raw_free(mbuf);
/* Clone metadata from the dummy mbuf; either copy the payload or
 * adopt the guest buffer directly (zero-copy path). */
1836 mbuf->nb_segs = m->nb_segs;
1837 mbuf->next = m->next;
1838 mbuf->data_len = m->data_len + offset;
1839 mbuf->pkt_len = mbuf->data_len;
1840 if (unlikely(need_copy)) {
1841 /* Copy the packet contents to the mbuf. */
1842 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1843 rte_pktmbuf_mtod(m, void *),
1846 mbuf->data_off = m->data_off;
1847 mbuf->buf_physaddr = m->buf_physaddr;
1848 mbuf->buf_addr = m->buf_addr;
/* HW VLAN insertion; l2/l3 lengths are needed by the NIC offload. */
1850 mbuf->ol_flags = PKT_TX_VLAN_PKT;
1851 mbuf->vlan_tci = vlan_tag;
1852 mbuf->l2_len = sizeof(struct ether_hdr);
1853 mbuf->l3_len = sizeof(struct ipv4_hdr);
1854 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1856 tx_q->m_table[len] = mbuf;
1859 LOG_DEBUG(VHOST_DATA,
1860 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1863 (mbuf->next == NULL) ? "null" : "non-null");
1866 dev_statistics[dev->device_fh].tx_total++;
1867 dev_statistics[dev->device_fh].tx++;
/* On a full burst: transmit, free what the NIC rejected, then
 * reclaim completed mbufs/descriptors. */
1870 if (unlikely(len == MAX_PKT_BURST)) {
1871 m_table = (struct rte_mbuf **)tx_q->m_table;
1872 ret = rte_eth_tx_burst(ports[0],
1873 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1876 * Free any buffers not handled by TX and update
1879 if (unlikely(ret < len)) {
1881 rte_pktmbuf_free(m_table[ret]);
1882 } while (++ret < len);
1886 txmbuf_clean_zcp(dev, vpool);
1895 * This function TX all available packets in virtio TX queue for one
1896 * virtio-net device. If it is first packet, it learns MAC address and
1899 static inline void __attribute__((always_inline))
1900 virtio_dev_tx_zcp(struct virtio_net *dev)
/* Zero-copy guest TX: walk the guest TX avail ring, build a dummy
 * stack mbuf describing each guest buffer, and hand it to
 * virtio_tx_route_zcp() for transmission without copying. */
1903 struct vhost_virtqueue *vq;
1904 struct vring_desc *desc;
1905 uint64_t buff_addr = 0, phys_addr;
1906 uint32_t head[MAX_PKT_BURST];
1908 uint16_t free_entries, packet_success = 0;
1910 uint8_t need_copy = 0;
1912 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1914 vq = dev->virtqueue[VIRTIO_TXQ];
/* Volatile read: snapshot the guest's current avail index. */
1915 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1917 /* If there are no available buffers then return. */
1918 if (vq->last_used_idx_res == avail_idx)
1921 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1923 /* Prefetch available ring to retrieve head indexes. */
1924 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1926 /* Get the number of free entries in the ring */
1927 free_entries = (avail_idx - vq->last_used_idx_res);
1929 /* Limit to MAX_PKT_BURST. */
1931 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1933 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1934 dev->device_fh, free_entries);
1936 /* Retrieve all of the head indexes first to avoid caching issues. */
1937 for (i = 0; i < free_entries; i++)
1939 = vq->avail->ring[(vq->last_used_idx_res + i)
/* Reserve the whole batch up front (single consumer). */
1942 vq->last_used_idx_res += free_entries;
1944 /* Prefetch descriptor index. */
1945 rte_prefetch0(&vq->desc[head[packet_success]]);
1946 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1948 while (packet_success < free_entries) {
1949 desc = &vq->desc[head[packet_success]];
1951 /* Discard first buffer as it is the virtio header */
1952 desc = &vq->desc[desc->next];
1954 /* Buffer address translation. */
1955 buff_addr = gpa_to_vva(dev, desc->addr);
1956 /* Need check extra VLAN_HLEN size for inserting VLAN tag */
1957 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1960 if (likely(packet_success < (free_entries - 1)))
1961 /* Prefetch descriptor index. */
1962 rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1964 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1965 RTE_LOG(ERR, VHOST_DATA,
1966 "(%"PRIu64") Invalid frame buffer address found"
1967 "when TX packets!\n",
1973 /* Prefetch buffer address. */
1974 rte_prefetch0((void *)(uintptr_t)buff_addr);
1977 * Setup dummy mbuf. This is copied to a real mbuf if
1978 * transmitted out the physical port.
1980 m.data_len = desc->len;
1984 m.buf_addr = (void *)(uintptr_t)buff_addr;
1985 m.buf_physaddr = phys_addr;
1988 * Check if the frame buffer address from guest crosses
1989 * sub-region or not.
1991 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1992 RTE_LOG(ERR, VHOST_DATA,
1993 "(%"PRIu64") Frame buffer address cross "
1994 "sub-regioin found when attaching TX frame "
1995 "buffer address!\n",
2001 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2004 * If this is the first received packet we need to learn
2005 * the MAC and setup VMDQ
2007 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2008 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2010 * Discard frame if device is scheduled for
2011 * removal or a duplicate MAC address is found.
2013 packet_success += free_entries;
2014 vq->last_used_idx += packet_success;
2019 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2025 * This function is called by each data core. It handles all RX/TX registered
2026 * with the core. For TX the specific lcore linked list is used. For RX, MAC
2027 * addresses are compared with all devices in the main linked list.
2030 switch_worker_zcp(__attribute__((unused)) void *arg)
/* Zero-copy data-plane loop, one instance per slave lcore:
 *  - on a drain-timer tick, flush each device's TX table and reclaim
 *    completed mbufs;
 *  - per device: attach free mbufs to guest RX buffers, do port RX
 *    -> virtio_dev_rx_zcp(), then virtio_dev_tx_zcp() for guest TX. */
2032 struct virtio_net *dev = NULL;
2033 struct vhost_dev *vdev = NULL;
2034 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2035 struct virtio_net_data_ll *dev_ll;
2036 struct mbuf_table *tx_q;
2037 volatile struct lcore_ll_info *lcore_ll;
/* BURST_TX_DRAIN_US expressed in TSC cycles (rounded up). */
2038 const uint64_t drain_tsc
2039 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2040 * BURST_TX_DRAIN_US;
2041 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2043 const uint16_t lcore_id = rte_lcore_id();
2044 uint16_t count_in_ring, rx_count = 0;
2046 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2048 lcore_ll = lcore_info[lcore_id].lcore_ll;
2052 cur_tsc = rte_rdtsc();
2054 /* TX burst queue drain */
2055 diff_tsc = cur_tsc - prev_tsc;
2056 if (unlikely(diff_tsc > drain_tsc)) {
2058 * Get mbuf from vpool.pool and detach mbuf and
2059 * put back into vpool.ring.
2061 dev_ll = lcore_ll->ll_root_used;
2062 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2063 /* Get virtio device ID */
2064 vdev = dev_ll->vdev;
2067 if (likely(!vdev->remove)) {
2068 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2070 LOG_DEBUG(VHOST_DATA,
2071 "TX queue drained after timeout"
2072 " with burst size %u\n",
2076 * Tx any packets in the queue
2078 ret = rte_eth_tx_burst(
2080 (uint16_t)tx_q->txq_id,
2081 (struct rte_mbuf **)
2083 (uint16_t)tx_q->len);
/* Free whatever the NIC did not accept. */
2084 if (unlikely(ret < tx_q->len)) {
2087 tx_q->m_table[ret]);
2088 } while (++ret < tx_q->len);
/* Reclaim completed TX mbufs and their descriptors. */
2092 txmbuf_clean_zcp(dev,
2093 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2096 dev_ll = dev_ll->next;
2101 rte_prefetch0(lcore_ll->ll_root_used);
2104 * Inform the configuration core that we have exited the linked
2105 * list and that no devices are in use if requested.
2107 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2108 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2110 /* Process devices */
2111 dev_ll = lcore_ll->ll_root_used;
2113 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2114 vdev = dev_ll->vdev;
2116 if (unlikely(vdev->remove)) {
2117 dev_ll = dev_ll->next;
2119 vdev->ready = DEVICE_SAFE_REMOVE;
2123 if (likely(vdev->ready == DEVICE_RX)) {
2124 uint32_t index = vdev->vmdq_rx_q;
2127 = rte_ring_count(vpool_array[index].ring);
2128 uint16_t free_entries
2129 = (uint16_t)get_available_ring_num_zcp(dev);
2132 * Attach all mbufs in vpool.ring and put back
/* Bounded by free guest entries, free mbufs, and MAX_PKT_BURST. */
2136 i < RTE_MIN(free_entries,
2137 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2139 attach_rxmbuf_zcp(dev);
2141 /* Handle guest RX */
2142 rx_count = rte_eth_rx_burst(ports[0],
2143 vdev->vmdq_rx_q, pkts_burst,
2147 ret_count = virtio_dev_rx_zcp(dev,
2148 pkts_burst, rx_count);
2150 dev_statistics[dev->device_fh].rx_total
2152 dev_statistics[dev->device_fh].rx
/* Recycle any mbufs the guest could not accept. */
2155 while (likely(rx_count)) {
2158 pkts_burst[rx_count]);
2159 rte_ring_sp_enqueue(
2160 vpool_array[index].ring,
2161 (void *)pkts_burst[rx_count]);
2166 if (likely(!vdev->remove))
2167 /* Handle guest TX */
2168 virtio_dev_tx_zcp(dev);
2170 /* Move to the next device in the list */
2171 dev_ll = dev_ll->next;
2180 * Add an entry to a used linked list. A free entry must first be found
2181 * in the free linked list using get_data_ll_free_entry();
2184 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2185 struct virtio_net_data_ll *ll_dev)
/* Append ll_dev at the tail of the used list. */
2187 struct virtio_net_data_ll *ll = *ll_root_addr;
2189 /* Set next as NULL and use a compiler barrier to avoid reordering. */
/* The barrier makes 'next = NULL' visible before the entry is linked,
 * so a concurrent lockless reader never follows a stale pointer. */
2190 ll_dev->next = NULL;
2191 rte_compiler_barrier();
2193 /* If ll == NULL then this is the first device. */
2195 /* Increment to the tail of the linked list. */
2196 while ((ll->next != NULL) )
2201 *ll_root_addr = ll_dev;
2206 * Remove an entry from a used linked list. The entry must then be added to
2207 * the free linked list using put_data_ll_free_entry().
2210 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2211 struct virtio_net_data_ll *ll_dev,
2212 struct virtio_net_data_ll *ll_dev_last)
/* Unlink ll_dev from the used list.  ll_dev_last is its predecessor
 * (NULL when ll_dev is the head).  The caller later synchronizes with
 * data cores via the dev_removal_flag handshake before reuse. */
2214 struct virtio_net_data_ll *ll = *ll_root_addr;
2216 if (unlikely((ll == NULL) || (ll_dev == NULL)))
/* Head removal: advance the root pointer. */
2220 *ll_root_addr = ll_dev->next;
2222 if (likely(ll_dev_last != NULL))
2223 ll_dev_last->next = ll_dev->next;
2225 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2229 * Find and return an entry from the free linked list.
2231 static struct virtio_net_data_ll *
2232 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
/* Pop the head of the free list; returns NULL when the list is empty. */
2234 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2235 struct virtio_net_data_ll *ll_dev;
2237 if (ll_free == NULL)
2241 *ll_root_addr = ll_free->next;
2247 * Place an entry back on to the free linked list.
2250 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2251 struct virtio_net_data_ll *ll_dev)
/* Push ll_dev onto the head of the free list. */
2253 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2258 ll_dev->next = ll_free;
2259 *ll_root_addr = ll_dev;
2263 * Creates a linked list of a given size.
2265 static struct virtio_net_data_ll *
2266 alloc_data_ll(uint32_t size)
2268 struct virtio_net_data_ll *ll_new;
2271 /* Malloc and then chain the linked list. */
2272 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2273 if (ll_new == NULL) {
2274 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2278 for (i = 0; i < size - 1; i++) {
2279 ll_new[i].vdev = NULL;
2280 ll_new[i].next = &ll_new[i+1];
2282 ll_new[i].next = NULL;
2288 * Create the main linked list along with each individual cores linked list. A used and a free list
2289 * are created to manage entries.
/* Build each slave lcore's free list; sizes are chosen so that all
 * devices divide (rounding up) across the switching cores. */
2296 RTE_LCORE_FOREACH_SLAVE(lcore) {
2297 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2298 if (lcore_info[lcore].lcore_ll == NULL) {
2299 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2303 lcore_info[lcore].lcore_ll->device_num = 0;
2304 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2305 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
/* Round up when devices don't divide evenly across cores. */
2306 if (num_devices % num_switching_cores)
2307 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2309 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2312 /* Allocate devices up to a maximum of MAX_DEVICES. */
2313 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2319 * Remove a device from the specific data core linked list and from the main linked list. Synchonization
2320 * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2321 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2324 destroy_device (volatile struct virtio_net *dev)
/* vhost destroy callback: stop the device, wait for the data core to
 * quiesce, unlink it from the lcore and main lists using the
 * dev_removal_flag handshake, then (zero-copy) stop its queues and
 * recycle attached mbufs. */
2326 struct virtio_net_data_ll *ll_lcore_dev_cur;
2327 struct virtio_net_data_ll *ll_main_dev_cur;
2328 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2329 struct virtio_net_data_ll *ll_main_dev_last = NULL;
2330 struct vhost_dev *vdev;
2333 dev->flags &= ~VIRTIO_DEV_RUNNING;
2335 vdev = (struct vhost_dev *)dev->priv;
2336 /*set the remove flag. */
/* Spin until the data core stops touching the device and marks it
 * DEVICE_SAFE_REMOVE. */
2338 while(vdev->ready != DEVICE_SAFE_REMOVE) {
2342 /* Search for entry to be removed from lcore ll */
2343 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2344 while (ll_lcore_dev_cur != NULL) {
2345 if (ll_lcore_dev_cur->vdev == vdev) {
2348 ll_lcore_dev_last = ll_lcore_dev_cur;
2349 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2353 if (ll_lcore_dev_cur == NULL) {
2354 RTE_LOG(ERR, VHOST_CONFIG,
2355 "(%"PRIu64") Failed to find the dev to be destroy.\n",
2360 /* Search for entry to be removed from main ll */
2361 ll_main_dev_cur = ll_root_used;
2362 ll_main_dev_last = NULL;
2363 while (ll_main_dev_cur != NULL) {
2364 if (ll_main_dev_cur->vdev == vdev) {
2367 ll_main_dev_last = ll_main_dev_cur;
2368 ll_main_dev_cur = ll_main_dev_cur->next;
2372 /* Remove entries from the lcore and main ll. */
2373 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2374 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2376 /* Set the dev_removal_flag on each lcore. */
2377 RTE_LCORE_FOREACH_SLAVE(lcore) {
2378 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2382 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2383 * they can no longer access the device removed from the linked lists and that the devices
2384 * are no longer in use.
2386 RTE_LCORE_FOREACH_SLAVE(lcore) {
2387 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2392 /* Add the entries back to the lcore and main free ll.*/
2393 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2394 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2396 /* Decrement number of device on the lcore. */
2397 lcore_info[vdev->coreid].lcore_ll->device_num--;
2399 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
/* NOTE(review): the teardown below appears to be a zero-copy-only
 * branch (its guard is not visible here) -- confirm against the
 * full source. */
2402 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2404 /* Stop the RX queue. */
2405 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2406 LOG_DEBUG(VHOST_CONFIG,
2407 "(%"PRIu64") In destroy_device: Failed to stop "
2413 LOG_DEBUG(VHOST_CONFIG,
2414 "(%"PRIu64") in destroy_device: Start put mbuf in "
2415 "mempool back to ring for RX queue: %d\n",
2416 dev->device_fh, vdev->vmdq_rx_q);
2418 mbuf_destroy_zcp(vpool);
2420 /* Stop the TX queue. */
2421 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2422 LOG_DEBUG(VHOST_CONFIG,
2423 "(%"PRIu64") In destroy_device: Failed to "
2424 "stop tx queue:%d\n",
2425 dev->device_fh, vdev->vmdq_rx_q);
/* TX vpools live at index vmdq_rx_q + MAX_QUEUES. */
2428 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2430 LOG_DEBUG(VHOST_CONFIG,
2431 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2432 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2433 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2436 mbuf_destroy_zcp(vpool);
2437 rte_free(vdev->regions_hpa);
2444 * Calculate the number of physically contiguous regions within one
2445 * particular region whose vhost virtual address range is contiguous.
2446 * The particular region starts at vva_start, with size 'size' in the argument.
/*
 * check_hpa_regions() - walk a virtually-contiguous region one page at
 * a time and count how many extra physically-contiguous sub-regions it
 * breaks into (each discontinuity in host physical address bumps the
 * count; `nregions` is presumably returned -- the return statement is
 * not visible in this extract).
 *
 * NOTE(review): lossy extract; some interior lines are missing.
 */
2449 check_hpa_regions(uint64_t vva_start, uint64_t size)
2451 uint32_t i, nregions = 0, page_size = getpagesize();
2452 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
/* Warn when the start address is not page-aligned. */
2453 if (vva_start % page_size) {
2454 LOG_DEBUG(VHOST_CONFIG,
2455 "in check_countinous: vva start(%p) mod page_size(%d) "
2457 (void *)(uintptr_t)vva_start, page_size);
/* Warn when the size is not a whole number of pages. */
2460 if (size % page_size) {
2461 LOG_DEBUG(VHOST_CONFIG,
2462 "in check_countinous: "
2463 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
/* Compare the physical address of each page with the next one;
 * a gap means a new physically-contiguous sub-region starts. */
2467 for (i = 0; i < size - page_size; i = i + page_size) {
2469 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2470 next_phys_addr = rte_mem_virt2phy(
2471 (void *)(uintptr_t)(vva_start + i + page_size));
2472 if ((cur_phys_addr + page_size) != next_phys_addr) {
2474 LOG_DEBUG(VHOST_CONFIG,
2475 "in check_continuous: hva addr:(%p) is not "
2476 "continuous with hva addr:(%p), diff:%d\n",
2477 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2478 (void *)(uintptr_t)(vva_start + (uint64_t)i
2479 + page_size), page_size);
2480 LOG_DEBUG(VHOST_CONFIG,
2481 "in check_continuous: hpa addr:(%p) is not "
2482 "continuous with hpa addr:(%p), "
2483 "diff:(%"PRIu64")\n",
2484 (void *)(uintptr_t)cur_phys_addr,
2485 (void *)(uintptr_t)next_phys_addr,
2486 (next_phys_addr-cur_phys_addr));
2493 * Divide each region whose vhost virtual address range is contiguous into
2494 * sub-regions, making sure the physical addresses within each sub-region
2495 * are contiguous, and fill the offset (to GPA), size, etc. information of
2496 * each sub-region into regions_hpa.
/*
 * fill_hpa_memory_regions() - split every guest memory region into
 * sub-regions that are physically contiguous on the host and record
 * each sub-region's guest-physical start/end, host-physical offset and
 * size into mem_region_hpa.  Returns the number of sub-regions filled
 * (regionidx_hpa), which the caller compares against the count
 * precomputed by check_hpa_regions().
 *
 * NOTE(review): lossy extract; loop headers / some statements are
 * partially missing, comments describe only the visible lines.
 */
2499 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2501 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2502 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
/* Nothing to fill without an output table. */
2504 if (mem_region_hpa == NULL)
2507 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
/* Host virtual address of the region = GPA + per-region offset. */
2508 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2509 virtio_memory->regions[regionidx].address_offset;
2510 mem_region_hpa[regionidx_hpa].guest_phys_address
2511 = virtio_memory->regions[regionidx].guest_phys_address;
/* Offset to add to a GPA to obtain the host physical address. */
2512 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2513 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2514 mem_region_hpa[regionidx_hpa].guest_phys_address;
2515 LOG_DEBUG(VHOST_CONFIG,
2516 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2519 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2520 LOG_DEBUG(VHOST_CONFIG,
2521 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2524 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
/* Page-by-page scan for physical discontinuities inside the region. */
2526 i < virtio_memory->regions[regionidx].memory_size -
2529 cur_phys_addr = rte_mem_virt2phy(
2530 (void *)(uintptr_t)(vva_start + i));
2531 next_phys_addr = rte_mem_virt2phy(
2532 (void *)(uintptr_t)(vva_start +
/* Discontinuity found: close the current sub-region here. */
2534 if ((cur_phys_addr + page_size) != next_phys_addr) {
2535 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2536 mem_region_hpa[regionidx_hpa].guest_phys_address +
2538 mem_region_hpa[regionidx_hpa].memory_size
2540 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2541 "phys addr end [%d]:(%p)\n",
2544 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2545 LOG_DEBUG(VHOST_CONFIG,
2546 "in fill_hpa_regions: guest phys addr "
2550 (mem_region_hpa[regionidx_hpa].memory_size));
/* Open the next sub-region where the previous one ended. */
2551 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2552 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2554 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2556 mem_region_hpa[regionidx_hpa].guest_phys_address;
2557 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2558 " phys addr start[%d]:(%p)\n",
2561 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2562 LOG_DEBUG(VHOST_CONFIG,
2563 "in fill_hpa_regions: host phys addr "
2567 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
/* Close the final sub-region of this guest region. */
2573 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2574 = mem_region_hpa[regionidx_hpa].guest_phys_address
2576 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2577 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2578 "[%d]:(%p)\n", regionidx_hpa,
2580 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2581 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2582 "[%d]:(%p)\n", regionidx_hpa,
2584 (mem_region_hpa[regionidx_hpa].memory_size));
/* Total number of sub-regions written to mem_region_hpa. */
2587 return regionidx_hpa;
2591 * A new device is added to a data core. First the device is added to the main linked list
2592 * and then allocated to a specific data core.
/*
 * new_device() - vhost callback invoked when a device becomes ready.
 *
 * Visible steps: allocate a per-device vhost_dev context, build the
 * host-physical-address (HPA) region table from the guest memory map,
 * link the device into the main list, assign its VMDq RX queue,
 * (zero-copy path) attach mbufs and start the RX/TX queues, pick the
 * least-loaded slave lcore, link the device into that lcore's list,
 * reset statistics, disable guest notifications, and finally mark the
 * device VIRTIO_DEV_RUNNING.
 *
 * NOTE(review): this chunk is a lossy extract -- braces, error-path
 * returns and some statements are missing; comments describe only the
 * visible lines.
 */
2595 new_device (struct virtio_net *dev)
2597 struct virtio_net_data_ll *ll_dev;
2598 int lcore, core_add = 0;
2599 uint32_t device_num_min = num_devices;
2600 struct vhost_dev *vdev;
/* Zeroed per-device context, cache-line aligned. */
2603 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2605 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
/* Start from one HPA region per guest region, then add one extra per
 * physical discontinuity found inside each region. */
2613 vdev->nregions_hpa = dev->mem->nregions;
2614 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2616 += check_hpa_regions(
2617 dev->mem->regions[regionidx].guest_phys_address
2618 + dev->mem->regions[regionidx].address_offset,
2619 dev->mem->regions[regionidx].memory_size);
2623 vdev->regions_hpa = rte_calloc("vhost hpa region",
2625 sizeof(struct virtio_memory_regions_hpa),
2626 RTE_CACHE_LINE_SIZE);
2627 if (vdev->regions_hpa == NULL) {
2628 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
/* The filled count must match the precomputed count exactly. */
2634 if (fill_hpa_memory_regions(
2635 vdev->regions_hpa, dev->mem
2636 ) != vdev->nregions_hpa) {
2638 RTE_LOG(ERR, VHOST_CONFIG,
2639 "hpa memory regions number mismatch: "
2640 "[%d]\n", vdev->nregions_hpa);
2641 rte_free(vdev->regions_hpa);
2648 /* Add device to main ll */
2649 ll_dev = get_data_ll_free_entry(&ll_root_free);
2650 if (ll_dev == NULL) {
2651 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2652 "of %d devices per core has been reached\n",
2653 dev->device_fh, num_devices);
2654 if (vdev->regions_hpa)
2655 rte_free(vdev->regions_hpa);
2659 ll_dev->vdev = vdev;
2660 add_data_ll_entry(&ll_root_used, ll_dev);
/* VMDq RX queue assignment: one queue pool slot per device. */
2662 = dev->device_fh * queues_per_pool + vmdq_queue_base;
/* Zero-copy setup (presumably gated on `zero_copy` -- the condition
 * line is not visible in this extract). */
2665 uint32_t index = vdev->vmdq_rx_q;
2666 uint32_t count_in_ring, i;
2667 struct mbuf_table *tx_q;
2669 count_in_ring = rte_ring_count(vpool_array[index].ring);
2671 LOG_DEBUG(VHOST_CONFIG,
2672 "(%"PRIu64") in new_device: mbuf count in mempool "
2673 "before attach is: %d\n",
2675 rte_mempool_count(vpool_array[index].pool));
2676 LOG_DEBUG(VHOST_CONFIG,
2677 "(%"PRIu64") in new_device: mbuf count in ring "
2678 "before attach is : %d\n",
2679 dev->device_fh, count_in_ring);
2682 * Attach all mbufs in vpool.ring and put back into vpool.pool.
2684 for (i = 0; i < count_in_ring; i++)
2685 attach_rxmbuf_zcp(dev);
2687 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2688 "mempool after attach is: %d\n",
2690 rte_mempool_count(vpool_array[index].pool));
2691 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2692 "ring after attach is : %d\n",
2694 rte_ring_count(vpool_array[index].ring));
/* The zero-copy TX queue mirrors the device's VMDq RX queue id. */
2696 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2697 tx_q->txq_id = vdev->vmdq_rx_q;
/* On TX queue start failure, drain the vpool and undo the setup. */
2699 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2700 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2702 LOG_DEBUG(VHOST_CONFIG,
2703 "(%"PRIu64") In new_device: Failed to start "
2705 dev->device_fh, vdev->vmdq_rx_q);
2707 mbuf_destroy_zcp(vpool);
2708 rte_free(vdev->regions_hpa);
/* On RX queue start failure, also stop the already-started TX queue. */
2713 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2714 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2716 LOG_DEBUG(VHOST_CONFIG,
2717 "(%"PRIu64") In new_device: Failed to start "
2719 dev->device_fh, vdev->vmdq_rx_q);
2721 /* Stop the TX queue. */
2722 if (rte_eth_dev_tx_queue_stop(ports[0],
2723 vdev->vmdq_rx_q) != 0) {
2724 LOG_DEBUG(VHOST_CONFIG,
2725 "(%"PRIu64") In new_device: Failed to "
2726 "stop tx queue:%d\n",
2727 dev->device_fh, vdev->vmdq_rx_q);
2730 mbuf_destroy_zcp(vpool);
2731 rte_free(vdev->regions_hpa);
2738 /*reset ready flag*/
2739 vdev->ready = DEVICE_MAC_LEARNING;
2742 /* Find a suitable lcore to add the device. */
2743 RTE_LCORE_FOREACH_SLAVE(lcore) {
2744 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2745 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2749 /* Add device to lcore ll */
2750 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2751 if (ll_dev == NULL) {
2752 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2753 vdev->ready = DEVICE_SAFE_REMOVE;
2754 destroy_device(dev);
2755 rte_free(vdev->regions_hpa);
2759 ll_dev->vdev = vdev;
2760 vdev->coreid = core_add;
2762 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2764 /* Initialize device stats */
2765 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2767 /* Disable notifications. */
2768 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2769 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2770 lcore_info[vdev->coreid].lcore_ll->device_num++;
/* Device is now live; data cores may start forwarding for it. */
2771 dev->flags |= VIRTIO_DEV_RUNNING;
2773 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2779 * These callbacks allow devices to be added to the data core when configuration
2780 * has been fully completed.
/* Callback table registered with the vhost library (see
 * rte_vhost_driver_callback_register() in main): new_device runs when
 * a guest device becomes ready, destroy_device when it is removed. */
2782 static const struct virtio_net_device_ops virtio_net_device_ops =
2784 .new_device = new_device,
2785 .destroy_device = destroy_device,
2789 * This is a thread that will wake up after a period to print stats if the user has
/*
 * print_stats body (the function signature line is not visible in this
 * extract).  Periodically clears the terminal with ANSI escape codes
 * and prints per-device TX/RX totals, drops and successes for every
 * device on the main linked list.  RX counters are read atomically in
 * the non-zero-copy path because data cores update them concurrently.
 */
2795 struct virtio_net_data_ll *dev_ll;
2796 uint64_t tx_dropped, rx_dropped;
2797 uint64_t tx, tx_total, rx, rx_total;
/* ANSI escape sequences: clear screen, then cursor to row 1 col 1. */
2799 const char clr[] = { 27, '[', '2', 'J', '\0' };
2800 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
/* Sleep interval doubles as the stats period (seconds). */
2803 sleep(enable_stats);
2805 /* Clear screen and move to top left */
2806 printf("%s%s", clr, top_left);
2808 printf("\nDevice statistics ====================================");
2810 dev_ll = ll_root_used;
2811 while (dev_ll != NULL) {
2812 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2813 tx_total = dev_statistics[device_fh].tx_total;
2814 tx = dev_statistics[device_fh].tx;
2815 tx_dropped = tx_total - tx;
/* Non-zero-copy: RX counters are shared with data cores, read atomically. */
2816 if (zero_copy == 0) {
2817 rx_total = rte_atomic64_read(
2818 &dev_statistics[device_fh].rx_total_atomic);
2819 rx = rte_atomic64_read(
2820 &dev_statistics[device_fh].rx_atomic);
2822 rx_total = dev_statistics[device_fh].rx_total;
2823 rx = dev_statistics[device_fh].rx;
2825 rx_dropped = rx_total - rx;
2827 printf("\nStatistics for device %"PRIu32" ------------------------------"
2828 "\nTX total: %"PRIu64""
2829 "\nTX dropped: %"PRIu64""
2830 "\nTX successful: %"PRIu64""
2831 "\nRX total: %"PRIu64""
2832 "\nRX dropped: %"PRIu64""
2833 "\nRX successful: %"PRIu64"",
2842 dev_ll = dev_ll->next;
2844 printf("\n======================================================\n");
/*
 * setup_mempool_tbl() - create one mbuf mempool plus its companion
 * ring for a zero-copy queue slot in vpool_array[index].
 *
 * The ring is sized to the next power of two >= nb_mbuf + 1 and is
 * single-producer/single-consumer.  Both creation failures are fatal
 * (rte_exit).  buf_size is set to VIRTIO_DESCRIPTOR_LEN_ZCP to account
 * for headroom.
 */
2849 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2850 char *ring_name, uint32_t nb_mbuf)
2852 vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2853 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2854 if (vpool_array[index].pool != NULL) {
2855 vpool_array[index].ring
2856 = rte_ring_create(ring_name,
2857 rte_align32pow2(nb_mbuf + 1),
2858 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2859 if (likely(vpool_array[index].ring != NULL)) {
2860 LOG_DEBUG(VHOST_CONFIG,
2861 "in setup_mempool_tbl: mbuf count in "
2863 rte_mempool_count(vpool_array[index].pool));
2864 LOG_DEBUG(VHOST_CONFIG,
2865 "in setup_mempool_tbl: mbuf count in "
2867 rte_ring_count(vpool_array[index].ring));
2869 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2873 /* Need consider head room. */
2874 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2876 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2880 /* When we receive an INT signal, unregister the vhost driver */
/*
 * sigint_handler() - SIGINT handler: unregister the vhost driver so
 * its character device / socket (dev_basename) is cleaned up on exit.
 */
2882 sigint_handler(__rte_unused int signum)
2884 /* Unregister vhost driver. */
2885 int ret = rte_vhost_driver_unregister((char *)&dev_basename);
/* Abort on unregister failure (the check of `ret` is on a line not
 * visible in this extract). */
2887 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2892 * Main function, does initialisation and calls the per-lcore functions. The CUSE
2893 * device is also registered here to handle the IOCTLs.
/*
 * main() - application entry point.
 *
 * Visible flow: install SIGINT handler -> EAL init -> app argument
 * parsing -> enumerate enabled lcores and ports -> create mbuf
 * pools (one shared pool normally, or per-queue pool+ring pairs in
 * zero-copy mode) -> initialize ports and linked lists -> optionally
 * spawn the stats thread -> launch the per-lcore switching workers ->
 * register the vhost driver and callbacks -> enter the vhost session
 * loop (rte_vhost_driver_session_start does not return).
 *
 * NOTE(review): lossy extract; error-check branches and some closing
 * braces are missing, comments describe only the visible lines.
 */
2896 main(int argc, char *argv[])
2898 struct rte_mempool *mbuf_pool = NULL;
2899 unsigned lcore_id, core_id = 0;
2900 unsigned nb_ports, valid_num_ports;
2904 static pthread_t tid;
2905 char thread_name[RTE_MAX_THREAD_NAME_LEN];
/* Install handler early so Ctrl-C always unregisters the driver. */
2907 signal(SIGINT, sigint_handler);
2910 ret = rte_eal_init(argc, argv);
2912 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2916 /* parse app arguments */
2917 ret = us_vhost_parse_args(argc, argv);
2919 rte_exit(EXIT_FAILURE, "Invalid argument\n");
/* Build a dense table of the enabled lcore ids. */
2921 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2922 if (rte_lcore_is_enabled(lcore_id))
2923 lcore_ids[core_id ++] = lcore_id;
2925 if (rte_lcore_count() > RTE_MAX_LCORE)
2926 rte_exit(EXIT_FAILURE,"Not enough cores\n");
2928 /* Set the number of switching cores available (all but the master). */
2929 num_switching_cores = rte_lcore_count()-1;
2931 /* Get the number of physical ports. */
2932 nb_ports = rte_eth_dev_count();
2933 if (nb_ports > RTE_MAX_ETHPORTS)
2934 nb_ports = RTE_MAX_ETHPORTS;
2937 * Update the global var NUM_PORTS and global array PORTS
2938 * and get value of var VALID_NUM_PORTS according to system ports number
2940 valid_num_ports = check_ports_num(nb_ports);
2942 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2943 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2944 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
/* Normal (copy) mode: one shared mbuf pool for every queue. */
2948 if (zero_copy == 0) {
2949 /* Create the mbuf pool. */
2950 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
2951 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
2952 0, MBUF_DATA_SIZE, rte_socket_id());
2953 if (mbuf_pool == NULL)
2954 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2956 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2957 vpool_array[queue_id].pool = mbuf_pool;
2959 if (vm2vm_mode == VM2VM_HARDWARE) {
2960 /* Enable VT loop back to let L2 switch to do it. */
2961 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2962 LOG_DEBUG(VHOST_CONFIG,
2963 "Enable loop back for L2 switch in vmdq.\n");
/* Zero-copy mode: a dedicated pool+ring per RX queue and per TX
 * queue (TX slots are offset by MAX_QUEUES in vpool_array). */
2967 char pool_name[RTE_MEMPOOL_NAMESIZE];
2968 char ring_name[RTE_MEMPOOL_NAMESIZE];
2970 nb_mbuf = num_rx_descriptor
2971 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2972 + num_switching_cores * MAX_PKT_BURST;
2974 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2975 snprintf(pool_name, sizeof(pool_name),
2976 "rxmbuf_pool_%u", queue_id);
2977 snprintf(ring_name, sizeof(ring_name),
2978 "rxmbuf_ring_%u", queue_id);
2979 setup_mempool_tbl(rte_socket_id(), queue_id,
2980 pool_name, ring_name, nb_mbuf);
2983 nb_mbuf = num_tx_descriptor
2984 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2985 + num_switching_cores * MAX_PKT_BURST;
2987 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2988 snprintf(pool_name, sizeof(pool_name),
2989 "txmbuf_pool_%u", queue_id);
2990 snprintf(ring_name, sizeof(ring_name),
2991 "txmbuf_ring_%u", queue_id);
2992 setup_mempool_tbl(rte_socket_id(),
2993 (queue_id + MAX_QUEUES),
2994 pool_name, ring_name, nb_mbuf);
2997 if (vm2vm_mode == VM2VM_HARDWARE) {
2998 /* Enable VT loop back to let L2 switch to do it. */
2999 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3000 LOG_DEBUG(VHOST_CONFIG,
3001 "Enable loop back for L2 switch in vmdq.\n");
3004 /* Set log level. */
3005 rte_set_log_level(LOG_LEVEL);
3007 /* initialize all ports */
3008 for (portid = 0; portid < nb_ports; portid++) {
3009 /* skip ports that are not enabled */
3010 if ((enabled_port_mask & (1 << portid)) == 0) {
3011 RTE_LOG(INFO, VHOST_PORT,
3012 "Skipping disabled port %d\n", portid);
3015 if (port_init(portid) != 0)
3016 rte_exit(EXIT_FAILURE,
3017 "Cannot initialize network ports\n");
3020 /* Initialise all linked lists. */
3021 if (init_data_ll() == -1)
3022 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3024 /* Initialize device stats */
3025 memset(&dev_statistics, 0, sizeof(dev_statistics));
3027 /* Enable stats if the user option is set. */
3029 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3031 rte_exit(EXIT_FAILURE,
3032 "Cannot create print-stats thread\n");
3034 /* Set thread_name for aid in debugging. */
3035 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3036 ret = rte_thread_setname(tid, thread_name);
3038 RTE_LOG(ERR, VHOST_CONFIG,
3039 "Cannot set print-stats name\n");
3042 /* Launch all data cores. */
3043 if (zero_copy == 0) {
3044 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3045 rte_eal_remote_launch(switch_worker,
3046 mbuf_pool, lcore_id);
/* Zero-copy: pre-move every free mbuf from each pool into its ring
 * before launching the zero-copy workers. */
3049 uint32_t count_in_mempool, index, i;
3050 for (index = 0; index < 2*MAX_QUEUES; index++) {
3051 /* For all RX and TX queues. */
3053 = rte_mempool_count(vpool_array[index].pool);
3056 * Transfer all un-attached mbufs from vpool.pool
3059 for (i = 0; i < count_in_mempool; i++) {
3060 struct rte_mbuf *mbuf
3061 = __rte_mbuf_raw_alloc(
3062 vpool_array[index].pool);
3063 rte_ring_sp_enqueue(vpool_array[index].ring,
3067 LOG_DEBUG(VHOST_CONFIG,
3068 "in main: mbuf count in mempool at initial "
3069 "is: %d\n", count_in_mempool);
3070 LOG_DEBUG(VHOST_CONFIG,
3071 "in main: mbuf count in ring at initial is :"
3073 rte_ring_count(vpool_array[index].ring));
3076 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3077 rte_eal_remote_launch(switch_worker_zcp, NULL,
/* Mergeable RX buffers are not supported by this example
 * (presumably only in zero-copy mode -- the guarding condition is
 * not visible in this extract). */
3082 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3084 /* Register vhost(cuse or user) driver to handle vhost messages. */
3085 ret = rte_vhost_driver_register((char *)&dev_basename);
3087 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3089 rte_vhost_driver_callback_register(&virtio_net_device_ops);
3091 /* Start CUSE session. */
3092 rte_vhost_driver_session_start();