/*
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"
#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) + \
    (num_switching_cores * MAX_PKT_BURST) + \
    (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
    ((num_switching_cores + 1) * MBUF_CACHE_SIZE))
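/*
 * Illustrative sizing (not from the original source): with a single
 * switching core, NUM_MBUFS_PER_PORT works out to
 * 128*1024 + 1*32 + 1*512 + 2*128 = 131,872 mbufs per port.
 */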
#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers, and
 * vhost uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP RTE_MBUF_DEFAULT_DATAROOM
#define MBUF_DATA_SIZE_ZCP RTE_MBUF_DEFAULT_BUF_SIZE
#define MBUF_CACHE_SIZE_ZCP 0
#define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15 /* How long we wait between retries on RX */
#define BURST_RX_RETRIES 4 /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600
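/* 0x2600 = 9728 bytes, comfortably above a typical 9000-byte jumbo MTU. */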
/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512
/*
 * These two macros need refining for the legacy and the DPDK-based front
 * ends: take the maximum number of available vring descriptors/entries
 * from the guest, subtract MAX_PKT_BURST, and then round down to a power
 * of 2.
 *
 * For the legacy front end there are 128 descriptors,
 * half for the virtio headers, the other half for the mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */
/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
    + sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x) - 1) & (x)) == 0)
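/*
 * Note that POWEROF2(0) also evaluates to true, so a zero descriptor
 * count has to be rejected separately by the caller if required.
 */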
#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
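/*
 * The mask selects the low 48 bits (6 bytes) of a 64-bit load, i.e. one
 * Ethernet address; ether_addr_cmp() below relies on it to compare two
 * MAC addresses with a single XOR.
 */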
/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
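/* A struct vring_desc is 16 bytes, so a 64-byte cache line holds 4 of them. */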
#define MBUF_EXT_MEM(mb) (rte_mbuf_from_indirect(mb) != (mb))

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;
/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Do vlan strip on host, enabled by default */
static uint32_t vlan_strip = 1;

/* number of RX/TX ring descriptors to use */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096
struct vpool {
    struct rte_mempool *pool;
    struct rte_ring *ring;
    uint32_t buf_size;
} vpool_array[MAX_QUEUES + MAX_QUEUES];
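/*
 * The first MAX_QUEUES entries of vpool_array serve the zero copy RX path
 * (indexed by the device's VMDQ RX queue); the second MAX_QUEUES entries
 * serve the zero copy TX path, see attach_rxmbuf_zcp() and
 * virtio_tx_route_zcp().
 */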
/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
    VM2VM_DISABLED = 0,
    VM2VM_SOFTWARE = 1,
    VM2VM_HARDWARE = 2,
    VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
/* The type of host physical address translated from guest physical address. */
typedef enum {
    PHYS_ADDR_CONTINUOUS = 0,
    PHYS_ADDR_CROSS_SUBREG = 1,
    PHYS_ADDR_INVALID = 2,
    PHYS_ADDR_LAST
} hpa_type;
/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
    .rxmode = {
        .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
        .split_hdr_size = 0,
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        /*
         * This is necessary for 1G NICs such as the I350; it fixes a bug
         * where IPv4 forwarding in the guest could not forward packets
         * from one virtio device to another.
         */
        .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
    },

    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
    .rx_adv_conf = {
        /*
         * should be overridden separately in code with
         * appropriate values
         */
        .vmdq_rx_conf = {
            .nb_queue_pools = ETH_8_POOLS,
            .enable_default_pool = 0,
            .default_pool = 0,
            .nb_pool_maps = 0,
            .pool_map = {{0, 0},},
        },
    },
};
static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified on the command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
    1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
    1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
    1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
    1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
    1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
    1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
    1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
    1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];
/* Used for queueing bursts of TX packets. */
struct mbuf_table {
    unsigned len;
    unsigned txq_id;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];
/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
    unsigned char h_dest[ETH_ALEN];
    unsigned char h_source[ETH_ALEN];
    __be16 h_vlan_proto;
    __be16 h_vlan_TCI;
    __be16 h_vlan_encapsulated_proto;
};

/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18
/* Per-device statistics struct */
struct device_statistics {
    uint64_t tx_total;
    rte_atomic64_t rx_total_atomic;
    uint64_t rx_total;
    uint64_t tx;
    rte_atomic64_t rx_atomic;
    uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
    struct rte_eth_vmdq_rx_conf conf;
    struct rte_eth_vmdq_rx_conf *def_conf =
        &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
    unsigned i;

    memset(&conf, 0, sizeof(conf));
    conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
    conf.nb_pool_maps = num_devices;
    conf.enable_loop_back = def_conf->enable_loop_back;
    conf.rx_mode = def_conf->rx_mode;

    for (i = 0; i < conf.nb_pool_maps; i++) {
        conf.pool_map[i].vlan_id = vlan_tags[i];
        conf.pool_map[i].pools = (1UL << i);
    }

    (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
    (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
        sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));

    return 0;
}
/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
    if (num_devices > max_nb_devices) {
        RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
        return -1;
    }
    return 0;
}
/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as parameter.
 */
static inline int
port_init(uint8_t port)
{
    struct rte_eth_dev_info dev_info;
    struct rte_eth_conf port_conf;
    struct rte_eth_rxconf *rxconf;
    struct rte_eth_txconf *txconf;
    int16_t rx_rings, tx_rings;
    uint16_t rx_ring_size, tx_ring_size;
    int retval;
    uint16_t q;

    /* The max pool number from dev_info is used to validate the pool number specified on the command line */
    rte_eth_dev_info_get(port, &dev_info);

    if (dev_info.max_rx_queues > MAX_QUEUES) {
        rte_exit(EXIT_FAILURE,
            "please define MAX_QUEUES no less than %u in %s\n",
            dev_info.max_rx_queues, __FILE__);
    }

    rxconf = &dev_info.default_rxconf;
    txconf = &dev_info.default_txconf;
    rxconf->rx_drop_en = 1;

    /* Enable vlan offload */
    txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

    /*
     * Zero copy defers queue RX/TX start to the time when the guest
     * finishes its startup and packet buffers from that guest are
     * available.
     */
    if (zero_copy) {
        rxconf->rx_deferred_start = 1;
        rxconf->rx_drop_en = 0;
        txconf->tx_deferred_start = 1;
    }

    /* Configure the number of supported virtio devices based on VMDQ limits */
    num_devices = dev_info.max_vmdq_pools;

    if (zero_copy) {
        rx_ring_size = num_rx_descriptor;
        tx_ring_size = num_tx_descriptor;
        tx_rings = dev_info.max_tx_queues;
    } else {
        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
        tx_rings = (uint16_t)rte_lcore_count();
    }

    retval = validate_num_devices(MAX_DEVICES);
    if (retval < 0)
        return retval;

    /* Get port configuration. */
    retval = get_eth_conf(&port_conf, num_devices);
    if (retval < 0)
        return retval;
    /* NIC queues are divided into pf queues and vmdq queues. */
    num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
    queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
    num_vmdq_queues = num_devices * queues_per_pool;
    num_queues = num_pf_queues + num_vmdq_queues;
    vmdq_queue_base = dev_info.vmdq_queue_base;
    vmdq_pool_base = dev_info.vmdq_pool_base;
    printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
        num_pf_queues, num_devices, queues_per_pool);
    if (port >= rte_eth_dev_count())
        return -1;

    if (enable_tx_csum == 0)
        rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

    if (enable_tso == 0) {
        rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
        rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
    }

    rx_rings = (uint16_t)dev_info.max_rx_queues;
    /* Configure ethernet device. */
    retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
    if (retval != 0)
        return retval;

    /* Setup the queues. */
    for (q = 0; q < rx_rings; q++) {
        retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                    rte_eth_dev_socket_id(port),
                    rxconf,
                    vpool_array[q].pool);
        if (retval < 0)
            return retval;
    }
    for (q = 0; q < tx_rings; q++) {
        retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                    rte_eth_dev_socket_id(port),
                    txconf);
        if (retval < 0)
            return retval;
    }

    /* Start the device. */
    retval = rte_eth_dev_start(port);
    if (retval < 0) {
        RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
        return retval;
    }

    if (promiscuous)
        rte_eth_promiscuous_enable(port);

    rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
    RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
    RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
        (unsigned)port,
        vmdq_ports_eth_addr[port].addr_bytes[0],
        vmdq_ports_eth_addr[port].addr_bytes[1],
        vmdq_ports_eth_addr[port].addr_bytes[2],
        vmdq_ports_eth_addr[port].addr_bytes[3],
        vmdq_ports_eth_addr[port].addr_bytes[4],
        vmdq_ports_eth_addr[port].addr_bytes[5]);

    return 0;
}
/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
    /* Reject names that do not fit in the buffer (including the NUL). */
    if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
        return -1;
    else
        snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

    return 0;
}
/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
    char *end = NULL;
    unsigned long pm;

    errno = 0;

    /* parse hexadecimal string */
    pm = strtoul(portmask, &end, 16);
    if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
        return -1;

    if (pm == 0)
        return -1;

    return pm;
}
/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
    char *end = NULL;
    unsigned long num;

    errno = 0;

    /* parse unsigned int string */
    num = strtoul(q_arg, &end, 10);
    if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
        return -1;

    if (num > max_valid_value)
        return -1;

    return num;
}
/*
 * Display usage.
 */
static void
us_vhost_usage(const char *prgname)
{
    RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
    "    --vm2vm [0|1|2]\n"
    "    --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
    "    --dev-basename <name>\n"
    "    -p PORTMASK: Set mask for ports to be used by application\n"
    "    --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
    "    --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
    "    --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only when RX retries are enabled\n"
    "    --rx-retry-num [0-N]: the number of retries on RX. Takes effect only when RX retries are enabled\n"
    "    --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
    "    --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
    "    --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
    "    --dev-basename: The basename to be used for the character device\n"
    "    --zero-copy [0|1]: disable(default)/enable RX/TX "
    "zero copy\n"
    "    --rx-desc-num [0-N]: the number of descriptors on RX, "
    "used only when zero copy is enabled\n"
    "    --tx-desc-num [0-N]: the number of descriptors on TX, "
    "used only when zero copy is enabled\n"
    "    --tx-csum [0|1]: disable/enable TX checksum offload\n"
    "    --tso [0|1]: disable/enable TCP segmentation offload\n",
        prgname);
}
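/*
 * Example invocation (illustrative only; the binary name and the EAL
 * core/memory options depend on the build and the target system):
 *
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --dev-basename vhost-net --stats 2
 */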
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
    int opt, ret;
    int option_index;
    unsigned i;
    const char *prgname = argv[0];
    static struct option long_option[] = {
        {"vm2vm", required_argument, NULL, 0},
        {"rx-retry", required_argument, NULL, 0},
        {"rx-retry-delay", required_argument, NULL, 0},
        {"rx-retry-num", required_argument, NULL, 0},
        {"mergeable", required_argument, NULL, 0},
        {"vlan-strip", required_argument, NULL, 0},
        {"stats", required_argument, NULL, 0},
        {"dev-basename", required_argument, NULL, 0},
        {"zero-copy", required_argument, NULL, 0},
        {"rx-desc-num", required_argument, NULL, 0},
        {"tx-desc-num", required_argument, NULL, 0},
        {"tx-csum", required_argument, NULL, 0},
        {"tso", required_argument, NULL, 0},
        {NULL, 0, 0, 0},
    };
    /* Parse command line */
    while ((opt = getopt_long(argc, argv, "p:P",
            long_option, &option_index)) != EOF) {
        switch (opt) {
        /* Portmask */
        case 'p':
            enabled_port_mask = parse_portmask(optarg);
            if (enabled_port_mask == 0) {
                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                us_vhost_usage(prgname);
                return -1;
            }
            break;

        case 'P':
            promiscuous = 1;
            vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
                ETH_VMDQ_ACCEPT_BROADCAST |
                ETH_VMDQ_ACCEPT_MULTICAST;
            rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
            break;

        case 0:
            /* Enable/disable vm2vm comms. */
            if (!strncmp(long_option[option_index].name, "vm2vm",
                MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for "
                        "vm2vm [0|1|2]\n");
                    us_vhost_usage(prgname);
                    return -1;
                }
                vm2vm_mode = (vm2vm_type)ret;
            }
            /* Enable/disable retries on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    enable_retry = ret;
                }
            }
            /* Enable/disable TX checksum offload. */
            if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else
                    enable_tx_csum = ret;
            }
            /* Enable/disable TSO offload. */
            if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else
                    enable_tso = ret;
            }
            /* Specify the retry delay time (in microseconds) on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    burst_rx_delay_time = ret;
                }
            }
            /* Specify the number of retries on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    burst_rx_retry_num = ret;
                }
            }
            /* Enable/disable RX mergeable buffers. */
            if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    mergeable = !!ret;
                    if (ret) {
                        vmdq_conf_default.rxmode.jumbo_frame = 1;
                        vmdq_conf_default.rxmode.max_rx_pkt_len
                            = JUMBO_FRAME_MAX_SIZE;
                    }
                }
            }
            /* Enable/disable RX VLAN strip on host. */
            if (!strncmp(long_option[option_index].name,
                "vlan-strip", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for VLAN strip [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    vlan_strip = !!ret;
                    vmdq_conf_default.rxmode.hw_vlan_strip =
                        vlan_strip;
                }
            }
            /* Enable/disable stats. */
            if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    enable_stats = ret;
                }
            }
            /* Set character device basename. */
            if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                if (us_vhost_parse_basename(optarg) == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (max %d characters)\n", MAX_BASENAME_SZ);
                    us_vhost_usage(prgname);
                    return -1;
                }
            }
            /* Enable/disable RX/TX zero copy. */
            if (!strncmp(long_option[option_index].name,
                "zero-copy", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument"
                        " for zero-copy [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else
                    zero_copy = ret;
            }
            /* Specify the descriptor number on RX. */
            if (!strncmp(long_option[option_index].name,
                "rx-desc-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, MAX_RING_DESC);
                if ((ret == -1) || (!POWEROF2(ret))) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for rx-desc-num [0-N], "
                        "power of 2 required.\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    num_rx_descriptor = ret;
                }
            }
            /* Specify the descriptor number on TX. */
            if (!strncmp(long_option[option_index].name,
                "tx-desc-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, MAX_RING_DESC);
                if ((ret == -1) || (!POWEROF2(ret))) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for tx-desc-num [0-N], "
                        "power of 2 required.\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    num_tx_descriptor = ret;
                }
            }

            break;
        /* Invalid option - print options. */
        default:
            us_vhost_usage(prgname);
            return -1;
        }
    }

    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
        if (enabled_port_mask & (1 << i))
            ports[num_ports++] = (uint8_t)i;
    }

    if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
        RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
            "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
        return -1;
    }
    if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
        RTE_LOG(INFO, VHOST_PORT,
            "Vhost zero copy doesn't support software vm2vm; "
            "please specify 'vm2vm 2' to use hardware vm2vm.\n");
        return -1;
    }

    if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
        RTE_LOG(INFO, VHOST_PORT,
            "Vhost zero copy doesn't support jumbo frames; "
            "please specify '--mergeable 0' to disable the "
            "mergeable feature.\n");
        return -1;
    }

    return 0;
}
/*
 * Update the global variable num_ports and the global array ports according
 * to the number of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
    unsigned valid_num_ports = num_ports;
    unsigned portid;

    if (num_ports > nb_ports) {
        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number (%u) exceeds total system port number (%u)\n",
            num_ports, nb_ports);
        num_ports = nb_ports;
    }

    for (portid = 0; portid < num_ports; portid++) {
        if (ports[portid] >= nb_ports) {
            RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID (%u) exceeds max system port ID (%u)\n",
                ports[portid], (nb_ports - 1));
            ports[portid] = INVALID_PORT_ID;
            valid_num_ports--;
        }
    }
    return valid_num_ports;
}
/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
    char *pkt_addr = (char *)(addr); \
    unsigned int index; \
    char packet[MAX_PRINT_BUFF]; \
    \
    if ((header)) \
        snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
    else \
        snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
    for (index = 0; index < (size); index++) { \
        snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
            "%02hhx ", pkt_addr[index]); \
    } \
    snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
    \
    LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
    uint32_t buf_len, hpa_type *addr_type)
{
    struct virtio_memory_regions_hpa *region;
    uint32_t regionidx;
    uint64_t vhost_pa = 0;

    *addr_type = PHYS_ADDR_INVALID;

    for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
        region = &vdev->regions_hpa[regionidx];
        if ((guest_pa >= region->guest_phys_address) &&
            (guest_pa <= region->guest_phys_address_end)) {
            vhost_pa = region->host_phys_addr_offset + guest_pa;
            if (likely((guest_pa + buf_len - 1)
                <= region->guest_phys_address_end))
                *addr_type = PHYS_ADDR_CONTINUOUS;
            else
                *addr_type = PHYS_ADDR_CROSS_SUBREG;
            break;
        }
    }

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
        vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
        (void *)(uintptr_t)vhost_pa);

    return vhost_pa;
}
/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
    return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
}
/*
 * This function learns the MAC address of the device and registers it along
 * with a vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
    struct ether_hdr *pkt_hdr;
    struct virtio_net_data_ll *dev_ll;
    struct virtio_net *dev = vdev->dev;
    int i, ret;

    /* Learn MAC address of guest device from packet */
    pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

    dev_ll = ll_root_used;

    while (dev_ll != NULL) {
        if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
            RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
            return -1;
        }
        dev_ll = dev_ll->next;
    }

    for (i = 0; i < ETHER_ADDR_LEN; i++)
        vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

    /* vlan_tag currently uses the device_id. */
    vdev->vlan_tag = vlan_tags[dev->device_fh];

    /* Print out VMDQ registration info. */
    RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
        dev->device_fh,
        vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
        vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
        vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
        vdev->vlan_tag);

    /* Register the MAC address. */
    ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
        (uint32_t)dev->device_fh + vmdq_pool_base);
    if (ret)
        RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
            dev->device_fh);

    /* Enable stripping of the vlan tag as we handle routing. */
    if (vlan_strip)
        rte_eth_dev_set_vlan_strip_on_queue(ports[0],
            (uint16_t)vdev->vmdq_rx_q, 1);

    /* Set device as ready for RX. */
    vdev->ready = DEVICE_RX;

    return 0;
}
/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
    unsigned i = 0;
    unsigned rx_count;
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

    if (vdev->ready == DEVICE_RX) {
        /* clear MAC and VLAN settings */
        rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
        for (i = 0; i < 6; i++)
            vdev->mac_address.addr_bytes[i] = 0;

        vdev->vlan_tag = 0;

        /* Clear out the receive buffers */
        rx_count = rte_eth_rx_burst(ports[0],
            (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

        while (rx_count) {
            for (i = 0; i < rx_count; i++)
                rte_pktmbuf_free(pkts_burst[i]);

            rx_count = rte_eth_rx_burst(ports[0],
                (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
        }

        vdev->ready = DEVICE_MAC_LEARNING;
    }
}
/*
 * Check if the packet destination MAC address is for a local device. If so,
 * put the packet on that device's RX queue. If not, return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
    struct virtio_net_data_ll *dev_ll;
    struct ether_hdr *pkt_hdr;
    uint64_t ret = 0;
    struct virtio_net *dev = vdev->dev;
    struct virtio_net *tdev; /* destination virtio device */

    pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

    /* get the used devices list */
    dev_ll = ll_root_used;

    while (dev_ll != NULL) {
        if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
            &dev_ll->vdev->mac_address)) {

            /* Drop the packet if the TX packet is destined for the TX device. */
            if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
                    dev->device_fh);
                return 0;
            }
            tdev = dev_ll->vdev->dev;

            LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

            if (unlikely(dev_ll->vdev->remove)) {
                /* drop the packet if the device is marked for removal */
                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
            } else {
                /* send the packet to the local virtio device */
                ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
                if (enable_stats) {
                    rte_atomic64_add(
                        &dev_statistics[tdev->device_fh].rx_total_atomic,
                        1);
                    rte_atomic64_add(
                        &dev_statistics[tdev->device_fh].rx_atomic,
                        ret);
                    dev_statistics[dev->device_fh].tx_total++;
                    dev_statistics[dev->device_fh].tx += ret;
                }
            }

            return 0;
        }
        dev_ll = dev_ll->next;
    }

    return -1;
}
/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * get its vlan tag and the length offset to restore.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
    uint32_t *offset, uint16_t *vlan_tag)
{
    struct virtio_net_data_ll *dev_ll = ll_root_used;
    struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

    while (dev_ll != NULL) {
        if ((dev_ll->vdev->ready == DEVICE_RX)
            && ether_addr_cmp(&(pkt_hdr->d_addr),
                &dev_ll->vdev->mac_address)) {
            /*
             * Drop the packet if the TX packet is
             * destined for the TX device.
             */
            if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                LOG_DEBUG(VHOST_DATA,
                    "(%"PRIu64") TX: Source and destination"
                    " MAC addresses are the same. Dropping "
                    "packet.\n",
                    dev_ll->vdev->dev->device_fh);
                return -1;
            }

            /*
             * HW vlan strip reduces the packet length by the
             * length of the vlan tag, so the packet length must
             * be restored by adding the tag length back.
             */
            *offset = VLAN_HLEN;
            *vlan_tag =
                vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

            LOG_DEBUG(VHOST_DATA,
                "(%"PRIu64") TX: pkt to local VM device id:"
                "(%"PRIu64") vlan tag: %d.\n",
                dev->device_fh, dev_ll->vdev->dev->device_fh,
                (int)*vlan_tag);

            break;
        }
        dev_ll = dev_ll->next;
    }
    return 0;
}
static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
    if (ol_flags & PKT_TX_IPV4)
        return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
    else /* assume ethertype == ETHER_TYPE_IPv6 */
        return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}
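/*
 * Note: when requesting TCP checksum offload or TSO, the PMD expects the
 * TCP checksum field to be pre-filled with the pseudo-header checksum;
 * virtio_tx_offload() below uses get_psd_sum() to set that up.
 */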
static void virtio_tx_offload(struct rte_mbuf *m)
{
    void *l3_hdr;
    struct ipv4_hdr *ipv4_hdr = NULL;
    struct tcp_hdr *tcp_hdr = NULL;
    struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

    l3_hdr = (char *)eth_hdr + m->l2_len;

    if (m->ol_flags & PKT_TX_IPV4) {
        ipv4_hdr = l3_hdr;
        ipv4_hdr->hdr_checksum = 0;
        m->ol_flags |= PKT_TX_IP_CKSUM;
    }

    tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
    tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}
/*
 * This function routes the TX packet to the correct interface. This may be
 * a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
    struct mbuf_table *tx_q;
    struct rte_mbuf **m_table;
    unsigned len, ret, offset = 0;
    const uint16_t lcore_id = rte_lcore_id();
    struct virtio_net *dev = vdev->dev;
    struct ether_hdr *nh;

    /* check if destination is a local VM */
    if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
        rte_pktmbuf_free(m);
        return;
    }

    if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
        if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
            rte_pktmbuf_free(m);
            return;
        }
    }

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

    /* Add packet to the port tx queue */
    tx_q = &lcore_tx_queue[lcore_id];
    len = tx_q->len;

    nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
    if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
        /* Guest has inserted the vlan tag. */
        struct vlan_hdr *vh = (struct vlan_hdr *)(nh + 1);
        uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
        if ((vm2vm_mode == VM2VM_HARDWARE) &&
            (vh->vlan_tci != vlan_tag_be))
            vh->vlan_tci = vlan_tag_be;
    } else {
        m->ol_flags |= PKT_TX_VLAN_PKT;

        /*
         * Find the right seg to adjust the data len when offset is
         * bigger than the tail room size.
         */
        if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
            if (likely(offset <= rte_pktmbuf_tailroom(m)))
                m->data_len += offset;
            else {
                struct rte_mbuf *seg = m;

                while ((seg->next != NULL) &&
                    (offset > rte_pktmbuf_tailroom(seg)))
                    seg = seg->next;

                seg->data_len += offset;
            }
            m->pkt_len += offset;
        }

        m->vlan_tci = vlan_tag;
    }

    if (m->ol_flags & PKT_TX_TCP_SEG)
        virtio_tx_offload(m);

    tx_q->m_table[len] = m;
    len++;
    if (enable_stats) {
        dev_statistics[dev->device_fh].tx_total++;
        dev_statistics[dev->device_fh].tx++;
    }

    if (unlikely(len == MAX_PKT_BURST)) {
        m_table = (struct rte_mbuf **)tx_q->m_table;
        ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
        /* Free any buffers not handled by TX and update the port stats. */
        if (unlikely(ret < len)) {
            do {
                rte_pktmbuf_free(m_table[ret]);
            } while (++ret < len);
        }

        len = 0;
    }

    tx_q->len = len;
    return;
}
/*
 * This function is called by each data core. It handles all RX/TX registered
 * with the core. For TX the specific lcore linked list is used. For RX, MAC
 * addresses are compared with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
    struct rte_mempool *mbuf_pool = arg;
    struct virtio_net *dev = NULL;
    struct vhost_dev *vdev = NULL;
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    struct virtio_net_data_ll *dev_ll;
    struct mbuf_table *tx_q;
    volatile struct lcore_ll_info *lcore_ll;
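    /*
     * drain_tsc below converts BURST_TX_DRAIN_US (~100 us) into TSC
     * cycles, rounding the cycles-per-microsecond value up.
     */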
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
    uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
    unsigned ret, i;
    const uint16_t lcore_id = rte_lcore_id();
    const uint16_t num_cores = (uint16_t)rte_lcore_count();
    uint16_t rx_count = 0;
    uint16_t tx_count;
    uint32_t retry = 0;

    RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
    lcore_ll = lcore_info[lcore_id].lcore_ll;
    prev_tsc = 0;

    tx_q = &lcore_tx_queue[lcore_id];
    for (i = 0; i < num_cores; i++) {
        if (lcore_ids[i] == lcore_id) {
            tx_q->txq_id = i;
            break;
        }
    }

    while (1) {
        cur_tsc = rte_rdtsc();

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {

            if (tx_q->len) {
                LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

                /* Tx any packets in the queue */
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                    (struct rte_mbuf **)tx_q->m_table,
                    (uint16_t)tx_q->len);
                if (unlikely(ret < tx_q->len)) {
                    do {
                        rte_pktmbuf_free(tx_q->m_table[ret]);
                    } while (++ret < tx_q->len);
                }

                tx_q->len = 0;
            }

            prev_tsc = cur_tsc;
        }
        rte_prefetch0(lcore_ll->ll_root_used);

        /*
         * Inform the configuration core that we have exited the linked
         * list, and that no devices are in use, if requested.
         */
        if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
            lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
        /*
         * Process devices
         */
        dev_ll = lcore_ll->ll_root_used;

        while (dev_ll != NULL) {
            /* get virtio device ID */
            vdev = dev_ll->vdev;
            dev = vdev->dev;

            if (unlikely(vdev->remove)) {
                dev_ll = dev_ll->next;
                unlink_vmdq(vdev);
                vdev->ready = DEVICE_SAFE_REMOVE;
                continue;
            }
            if (likely(vdev->ready == DEVICE_RX)) {
                /* Handle guest RX */
                rx_count = rte_eth_rx_burst(ports[0],
                    vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                if (rx_count) {
                    /*
                     * If retry is enabled and the queue is full then we
                     * wait and retry to avoid packet loss. Note that
                     * MAX_PKT_BURST must be less than the virtio queue
                     * size.
                     */
                    if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
                        for (retry = 0; retry < burst_rx_retry_num; retry++) {
                            rte_delay_us(burst_rx_delay_time);
                            if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
                                break;
                        }
                    }
                    ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
                    if (enable_stats) {
                        rte_atomic64_add(
                            &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
                            rx_count);
                        rte_atomic64_add(
                            &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
                    }
                    while (likely(rx_count)) {
                        rx_count--;
                        rte_pktmbuf_free(pkts_burst[rx_count]);
                    }
                }
            }

            if (likely(!vdev->remove)) {
                /* Handle guest TX */
                tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
                /* If this is the first received packet we need to learn the MAC and setup VMDQ */
                if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
                    if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
                        while (tx_count)
                            rte_pktmbuf_free(pkts_burst[--tx_count]);
                    }
                }
                for (i = 0; i < tx_count; ++i)
                    virtio_tx_route(vdev, pkts_burst[i], (uint16_t)dev->device_fh);
            }

            /* move to the next device in the list */
            dev_ll = dev_ll->next;
        }
    }

    return 0;
}
/*
 * This function gets the number of available ring entries for zero copy RX.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
    uint16_t avail_idx;
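    /*
     * The avail index and the reserved last-used index are free-running
     * uint16_t values; their difference is correct even across wrap-around.
     */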
    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    return (uint32_t)(avail_idx - vq->last_used_idx_res);
}
/*
 * This function gets the available ring indexes for zero copy RX; it retries
 * up to 'burst_rx_retry_num' times until it gets enough ring entries.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
    uint16_t *res_base_idx, uint32_t count)
{
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
    uint16_t avail_idx;
    uint32_t retry = 0;
    uint16_t free_entries;

    *res_base_idx = vq->last_used_idx_res;
    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    free_entries = (avail_idx - *res_base_idx);

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
        "avail idx: %d, "
        "res base idx:%d, free entries:%d\n",
        dev->device_fh, avail_idx, *res_base_idx,
        free_entries);

    /*
     * If retry is enabled and the queue is full then we wait
     * and retry to avoid packet loss.
     */
    if (enable_retry && unlikely(count > free_entries)) {
        for (retry = 0; retry < burst_rx_retry_num; retry++) {
            rte_delay_us(burst_rx_delay_time);
            avail_idx = *((volatile uint16_t *)&vq->avail->idx);
            free_entries = (avail_idx - *res_base_idx);
            if (count <= free_entries)
                break;
        }
    }

    /* check that we have enough buffers */
    if (unlikely(count > free_entries))
        count = free_entries;

    if (unlikely(count == 0)) {
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") Fail in get_available_ring_index_zcp: "
            "avail idx: %d, res base idx:%d, free entries:%d\n",
            dev->device_fh, avail_idx,
            *res_base_idx, free_entries);
        return 0;
    }

    vq->last_used_idx_res = *res_base_idx + count;

    return count;
}
/*
 * This function puts a descriptor back on the used ring.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
    uint16_t res_cur_idx = vq->last_used_idx;
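    /* vq->size is a power of two, so (vq->size - 1) masks an index into the ring. */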
    vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
    vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;

    rte_compiler_barrier();
    *(volatile uint16_t *)&vq->used->idx += 1;
    vq->last_used_idx += 1;

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write(vq->callfd, (eventfd_t)1);
}
/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and attaches them together. It needs to
 * adjust the offset for buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put at the wrong location
 * in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
    uint16_t res_base_idx, desc_idx;
    uint64_t buff_addr, phys_addr;
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    void *obj = NULL;
    struct rte_mbuf *mbuf;
    struct vpool *vpool;
    hpa_type addr_type;
    struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

    vpool = &vpool_array[vdev->vmdq_rx_q];
    vq = dev->virtqueue[VIRTIO_RXQ];

    do {
        if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
            1) != 1))
            return;
        desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

        desc = &vq->desc[desc_idx];
        if (desc->flags & VRING_DESC_F_NEXT) {
            desc = &vq->desc[desc->next];
            buff_addr = gpa_to_vva(dev, desc->addr);
            phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
                &addr_type);
        } else {
            buff_addr = gpa_to_vva(dev,
                desc->addr + vq->vhost_hlen);
            phys_addr = gpa_to_hpa(vdev,
                desc->addr + vq->vhost_hlen,
                desc->len, &addr_type);
        }

        if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
            RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
                " address found when attaching RX frame buffer"
                " address!\n", dev->device_fh);
            put_desc_to_used_list_zcp(vq, desc_idx);
            continue;
        }

        /*
         * Check if the frame buffer address from guest crosses
         * sub-region or not.
         */
        if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
            RTE_LOG(ERR, VHOST_DATA,
                "(%"PRIu64") Frame buffer address cross "
                "sub-region found when attaching RX frame "
                "buffer address!\n",
                dev->device_fh);
            put_desc_to_used_list_zcp(vq, desc_idx);
            continue;
        }
    } while (unlikely(phys_addr == 0));

    rte_ring_sc_dequeue(vpool->ring, &obj);
    mbuf = obj;
    if (unlikely(mbuf == NULL)) {
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") in attach_rxmbuf_zcp: "
            "ring_sc_dequeue fail.\n",
            dev->device_fh);
        put_desc_to_used_list_zcp(vq, desc_idx);
        return;
    }

    if (unlikely(vpool->buf_size > desc->len)) {
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
            "length(%d) of descriptor idx: %d less than room "
            "size required: %d\n",
            dev->device_fh, desc->len, desc_idx, vpool->buf_size);
        put_desc_to_used_list_zcp(vq, desc_idx);
        rte_ring_sp_enqueue(vpool->ring, obj);
        return;
    }

    mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
    mbuf->data_off = RTE_PKTMBUF_HEADROOM;
    mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
    mbuf->data_len = desc->len;
    MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
        "descriptor idx:%d\n",
        dev->device_fh, res_base_idx, desc_idx);

    __rte_mbuf_raw_free(mbuf);

    return;
}
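/*
 * Note: the __rte_mbuf_raw_free() at the end of attach_rxmbuf_zcp() is
 * deliberate: it returns the mbuf, now pointing at the guest-supplied
 * buffer, to vpool->pool, from which the PMD RX path allocates it and
 * DMAs incoming frames directly into guest memory.
 */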
/*
 * Detach an attached packet mbuf:
 *  - restore the original mbuf address and length values;
 *  - reset pktmbuf data and data_len to their default values.
 * All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
    const struct rte_mempool *mp = m->pool;
    void *buf = rte_mbuf_to_baddr(m);
    uint32_t buf_ofs;
    uint32_t buf_len = mp->elt_size - sizeof(*m);
    m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

    m->buf_addr = buf;
    m->buf_len = (uint16_t)buf_len;

    buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
        RTE_PKTMBUF_HEADROOM : m->buf_len;
    m->data_off = buf_ofs;

    m->data_len = 0;
}
/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them, and puts them back into
 * vpool->ring. It also updates the used index and kicks the guest if
 * necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
    struct rte_mbuf *mbuf;
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
    uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
    uint32_t index = 0;
    uint32_t mbuf_count = rte_mempool_count(vpool->pool);

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
        "clean is: %d\n",
        dev->device_fh, mbuf_count);
    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
        "clean is : %d\n",
        dev->device_fh, rte_ring_count(vpool->ring));

    for (index = 0; index < mbuf_count; index++) {
        mbuf = __rte_mbuf_raw_alloc(vpool->pool);
        if (likely(MBUF_EXT_MEM(mbuf)))
            pktmbuf_detach_zcp(mbuf);
        rte_ring_sp_enqueue(vpool->ring, mbuf);

        /* Update used index buffer information. */
        vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
        vq->used->ring[used_idx].len = 0;

        used_idx = (used_idx + 1) & (vq->size - 1);
    }

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
        "clean is: %d\n",
        dev->device_fh, rte_mempool_count(vpool->pool));
    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
        "clean is : %d\n",
        dev->device_fh, rte_ring_count(vpool->ring));
    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: before updated "
        "vq->last_used_idx:%d\n",
        dev->device_fh, vq->last_used_idx);

    vq->last_used_idx += mbuf_count;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: after updated "
        "vq->last_used_idx:%d\n",
        dev->device_fh, vq->last_used_idx);

    rte_compiler_barrier();

    *(volatile uint16_t *)&vq->used->idx += mbuf_count;

    /* Kick guest if required. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write(vq->callfd, (eventfd_t)1);

    return 0;
}
/*
 * This function is called when a virtio device is destroyed.
 * It fetches mbufs from vpool->pool, detaches them, and puts them back
 * into vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
    struct rte_mbuf *mbuf = NULL;
    uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

    LOG_DEBUG(VHOST_CONFIG,
        "in mbuf_destroy_zcp: mbuf count in mempool before "
        "mbuf_destroy_zcp is: %d\n",
        mbuf_count);
    LOG_DEBUG(VHOST_CONFIG,
        "in mbuf_destroy_zcp: mbuf count in ring before "
        "mbuf_destroy_zcp is : %d\n",
        rte_ring_count(vpool->ring));

    for (index = 0; index < mbuf_count; index++) {
        mbuf = __rte_mbuf_raw_alloc(vpool->pool);
        if (likely(mbuf != NULL)) {
            if (likely(MBUF_EXT_MEM(mbuf)))
                pktmbuf_detach_zcp(mbuf);
            rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
        }
    }

    LOG_DEBUG(VHOST_CONFIG,
        "in mbuf_destroy_zcp: mbuf count in mempool after "
        "mbuf_destroy_zcp is: %d\n",
        rte_mempool_count(vpool->pool));
    LOG_DEBUG(VHOST_CONFIG,
        "in mbuf_destroy_zcp: mbuf count in ring after "
        "mbuf_destroy_zcp is : %d\n",
        rte_ring_count(vpool->ring));
}
/*
 * This function updates the used ring with the descriptor information of
 * the received packets and updates the usage counters (zero copy RX).
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
    uint32_t count)
{
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    struct rte_mbuf *buff;
    /* The virtio_hdr is initialised to 0. */
    struct virtio_net_hdr_mrg_rxbuf virtio_hdr
        = {{0, 0, 0, 0, 0, 0}, 0};
    uint64_t buff_hdr_addr = 0;
    uint32_t head[MAX_PKT_BURST], packet_len = 0;
    uint32_t head_idx, packet_success = 0;
    uint16_t res_cur_idx;

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

    if (count == 0)
        return 0;

    vq = dev->virtqueue[VIRTIO_RXQ];
    count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

    res_cur_idx = vq->last_used_idx;
    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
        dev->device_fh, res_cur_idx, res_cur_idx + count);

    /* Retrieve all of the head indexes first to avoid caching issues. */
    for (head_idx = 0; head_idx < count; head_idx++)
        head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

    /* Prefetch descriptor index. */
    rte_prefetch0(&vq->desc[head[packet_success]]);

    while (packet_success != count) {
        /* Get descriptor from available ring */
        desc = &vq->desc[head[packet_success]];

        buff = pkts[packet_success];
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") in dev_rx_zcp: update the used idx for "
            "pkt[%d] descriptor idx: %d\n",
            dev->device_fh, packet_success,
            MBUF_HEADROOM_UINT32(buff));

        PRINT_PACKET(dev,
            (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
            + RTE_PKTMBUF_HEADROOM),
            rte_pktmbuf_data_len(buff), 0);

        /* Buffer address translation for virtio header. */
        buff_hdr_addr = gpa_to_vva(dev, desc->addr);
        packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

        /*
         * If the descriptors are chained the header and data are
         * placed in separate buffers.
         */
        if (desc->flags & VRING_DESC_F_NEXT) {
            desc->len = vq->vhost_hlen;
            desc = &vq->desc[desc->next];
            desc->len = rte_pktmbuf_data_len(buff);
        } else {
            desc->len = packet_len;
        }

        /* Update used ring with desc information */
        vq->used->ring[res_cur_idx & (vq->size - 1)].id
            = head[packet_success];
        vq->used->ring[res_cur_idx & (vq->size - 1)].len
            = packet_len;
        res_cur_idx++;
        packet_success++;

        /* A header is required per buffer. */
        rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
            (const void *)&virtio_hdr, vq->vhost_hlen);

        PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

        if (likely(packet_success < count)) {
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success]]);
        }
    }

    rte_compiler_barrier();

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in dev_rx_zcp: before update used idx: "
        "vq.last_used_idx: %d, vq->used->idx: %d\n",
        dev->device_fh, vq->last_used_idx, vq->used->idx);

    *(volatile uint16_t *)&vq->used->idx += count;
    vq->last_used_idx += count;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in dev_rx_zcp: after update used idx: "
        "vq.last_used_idx: %d, vq->used->idx: %d\n",
        dev->device_fh, vq->last_used_idx, vq->used->idx);

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write(vq->callfd, (eventfd_t)1);

    return count;
}
/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
    uint32_t desc_idx, uint8_t need_copy)
{
    struct mbuf_table *tx_q;
    struct rte_mbuf **m_table;
    void *obj = NULL;
    struct rte_mbuf *mbuf;
    unsigned len, ret, offset = 0;
    struct vpool *vpool;
    uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
    uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

    /* Add packet to the port tx queue */
    tx_q = &tx_queue_zcp[vmdq_rx_q];
    len = tx_q->len;

    /* Allocate an mbuf and populate the structure. */
    vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
    rte_ring_sc_dequeue(vpool->ring, &obj);
    mbuf = obj;
    if (unlikely(mbuf == NULL)) {
        struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
        RTE_LOG(ERR, VHOST_DATA,
            "(%"PRIu64") Failed to allocate memory for mbuf.\n",
            dev->device_fh);
        put_desc_to_used_list_zcp(vq, desc_idx);
        return;
    }

    if (vm2vm_mode == VM2VM_HARDWARE) {
        /*
         * Avoid using a vlan tag from any vm for an external pkt, such
         * as vlan_tags[dev->device_fh]; otherwise, it conflicts with
         * pool selection: the MAC address identifies it as an external
         * pkt that should go to the network, while the vlan tag
         * identifies it as a vm2vm pkt that should be forwarded to
         * another vm. Hardware cannot resolve such an ambiguous
         * situation, so the pkt would be lost.
         */
        vlan_tag = external_pkt_default_vlan_tag;
        if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
            MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
            __rte_mbuf_raw_free(mbuf);
            return;
        }
    }

    mbuf->nb_segs = m->nb_segs;
    mbuf->next = m->next;
    mbuf->data_len = m->data_len + offset;
    mbuf->pkt_len = mbuf->data_len;
    if (unlikely(need_copy)) {
        /* Copy the packet contents to the mbuf. */
        rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
            rte_pktmbuf_mtod(m, void *),
            m->data_len);
    } else {
        mbuf->data_off = m->data_off;
        mbuf->buf_physaddr = m->buf_physaddr;
        mbuf->buf_addr = m->buf_addr;
    }
    mbuf->ol_flags |= PKT_TX_VLAN_PKT;
    mbuf->vlan_tci = vlan_tag;
    mbuf->l2_len = sizeof(struct ether_hdr);
    mbuf->l3_len = sizeof(struct ipv4_hdr);
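    /*
     * The fixed l2_len/l3_len above appear to assume an untagged Ethernet
     * header followed by an IPv4 header; offload-capable PMDs consult these
     * fields when offloads are requested via ol_flags.
     */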
    MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

    tx_q->m_table[len] = mbuf;
    len++;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
        dev->device_fh,
        mbuf->nb_segs,
        (mbuf->next == NULL) ? "null" : "non-null");

    if (enable_stats) {
        dev_statistics[dev->device_fh].tx_total++;
        dev_statistics[dev->device_fh].tx++;
    }

    if (unlikely(len == MAX_PKT_BURST)) {
        m_table = (struct rte_mbuf **)tx_q->m_table;
        ret = rte_eth_tx_burst(ports[0],
            (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);

        /*
         * Free any buffers not handled by TX and update
         * the port stats.
         */
        if (unlikely(ret < len)) {
            do {
                rte_pktmbuf_free(m_table[ret]);
            } while (++ret < len);
        }

        len = 0;
        txmbuf_clean_zcp(dev, vpool);
    }

    tx_q->len = len;

    return;
}
/*
 * This function TXes all available packets in the virtio TX queue for one
 * virtio-net device. If it is the first packet, it learns the MAC address
 * and sets up VMDQ.
 */
static inline void __attribute__((always_inline))
virtio_dev_tx_zcp(struct virtio_net *dev)
{
    struct rte_mbuf m;
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    uint64_t buff_addr = 0, phys_addr;
    uint32_t head[MAX_PKT_BURST];
    uint32_t i;
    uint16_t free_entries, packet_success = 0;
    uint16_t avail_idx;
    uint8_t need_copy = 0;
    hpa_type addr_type;
    struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

    vq = dev->virtqueue[VIRTIO_TXQ];
    avail_idx = *((volatile uint16_t *)&vq->avail->idx);

    /* If there are no available buffers then return. */
    if (vq->last_used_idx_res == avail_idx)
        return;

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);

    /* Prefetch available ring to retrieve head indexes. */
    rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);

    /* Get the number of free entries in the ring */
    free_entries = (avail_idx - vq->last_used_idx_res);

    /* Limit to MAX_PKT_BURST. */
    free_entries
        = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
        dev->device_fh, free_entries);

    /* Retrieve all of the head indexes first to avoid caching issues. */
    for (i = 0; i < free_entries; i++)
        head[i]
            = vq->avail->ring[(vq->last_used_idx_res + i)
            & (vq->size - 1)];

    vq->last_used_idx_res += free_entries;

    /* Prefetch descriptor index. */
    rte_prefetch0(&vq->desc[head[packet_success]]);
    rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

    while (packet_success < free_entries) {
        desc = &vq->desc[head[packet_success]];

        /* Discard first buffer as it is the virtio header */
        desc = &vq->desc[desc->next];

        /* Buffer address translation. */
        buff_addr = gpa_to_vva(dev, desc->addr);
        /* Need to check an extra VLAN_HLEN bytes for inserting the VLAN tag */
        phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
            &addr_type);

        if (likely(packet_success < (free_entries - 1)))
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success + 1]]);

        if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
            RTE_LOG(ERR, VHOST_DATA,
                "(%"PRIu64") Invalid frame buffer address found "
                "when TX packets!\n",
                dev->device_fh);
            packet_success++;
            continue;
        }

        /* Prefetch buffer address. */
        rte_prefetch0((void *)(uintptr_t)buff_addr);

        /*
         * Setup dummy mbuf. This is copied to a real mbuf if
         * transmitted out of the physical port.
         */
        m.data_len = desc->len;
        m.nb_segs = 1;
        m.next = NULL;
        m.data_off = 0;
        m.buf_addr = (void *)(uintptr_t)buff_addr;
        m.buf_physaddr = phys_addr;

        /*
         * Check if the frame buffer address from guest crosses
         * sub-region or not.
         */
        if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
            RTE_LOG(ERR, VHOST_DATA,
                "(%"PRIu64") Frame buffer address cross "
                "sub-region found when attaching TX frame "
                "buffer address!\n",
                dev->device_fh);
            need_copy = 1;
        } else {
            need_copy = 0;
        }

        PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

        /*
         * If this is the first received packet we need to learn
         * the MAC and setup VMDQ.
         */
        if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
            if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
                /*
                 * Discard frame if device is scheduled for
                 * removal or a duplicate MAC address is found.
                 */
                packet_success += free_entries;
                vq->last_used_idx += packet_success;
                break;
            }
        }

        virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
        packet_success++;
    }
}
2085 * This function is called by each data core. It handles all RX/TX registered
2086 * with the core. For TX the specific lcore linked list is used. For RX, MAC
2087 * addresses are compared with all devices in the main linked list.
2090 switch_worker_zcp(__attribute__((unused)) void *arg)
2092 struct virtio_net *dev = NULL;
2093 struct vhost_dev *vdev = NULL;
2094 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2095 struct virtio_net_data_ll *dev_ll;
2096 struct mbuf_table *tx_q;
2097 volatile struct lcore_ll_info *lcore_ll;
2098 const uint64_t drain_tsc
2099 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2100 * BURST_TX_DRAIN_US;
2101 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2103 const uint16_t lcore_id = rte_lcore_id();
2104 uint16_t count_in_ring, rx_count = 0;
2106 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2108 lcore_ll = lcore_info[lcore_id].lcore_ll;
2112 cur_tsc = rte_rdtsc();
2114 /* TX burst queue drain */
2115 diff_tsc = cur_tsc - prev_tsc;
2116 if (unlikely(diff_tsc > drain_tsc)) {
2118 * Get mbufs from vpool.pool, detach them, and
2119 * put them back into vpool.ring.
2121 dev_ll = lcore_ll->ll_root_used;
2122 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2123 /* Get virtio device ID */
2124 vdev = dev_ll->vdev;
2127 if (likely(!vdev->remove)) {
2128 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2130 LOG_DEBUG(VHOST_DATA,
2131 "TX queue drained after timeout"
2132 " with burst size %u\n",
2136 * Tx any packets in the queue
2138 ret = rte_eth_tx_burst(
2140 (uint16_t)tx_q->txq_id,
2141 (struct rte_mbuf **)
2143 (uint16_t)tx_q->len);
2144 if (unlikely(ret < tx_q->len)) {
2147 tx_q->m_table[ret]);
2148 } while (++ret < tx_q->len);
2152 txmbuf_clean_zcp(dev,
2153 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2156 dev_ll = dev_ll->next;
2161 rte_prefetch0(lcore_ll->ll_root_used);
2164 * Inform the configuration core that we have exited the linked
2165 * list and that no devices are in use if requested.
2167 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2168 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2170 /* Process devices */
2171 dev_ll = lcore_ll->ll_root_used;
2173 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2174 vdev = dev_ll->vdev;
2176 if (unlikely(vdev->remove)) {
2177 dev_ll = dev_ll->next;
2179 vdev->ready = DEVICE_SAFE_REMOVE;
2183 if (likely(vdev->ready == DEVICE_RX)) {
2184 uint32_t index = vdev->vmdq_rx_q;
2186 count_in_ring
2187 = rte_ring_count(vpool_array[index].ring);
2188 uint16_t free_entries
2189 = (uint16_t)get_available_ring_num_zcp(dev);
2192 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2196 i < RTE_MIN(free_entries,
2197 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2199 attach_rxmbuf_zcp(dev);
2201 /* Handle guest RX */
2202 rx_count = rte_eth_rx_burst(ports[0],
2203 vdev->vmdq_rx_q, pkts_burst,
2207 ret_count = virtio_dev_rx_zcp(dev,
2208 pkts_burst, rx_count);
2210 dev_statistics[dev->device_fh].rx_total
2212 dev_statistics[dev->device_fh].rx
2215 while (likely(rx_count)) {
2218 pkts_burst[rx_count]);
2219 rte_ring_sp_enqueue(
2220 vpool_array[index].ring,
2221 (void *)pkts_burst[rx_count]);
2226 if (likely(!vdev->remove))
2227 /* Handle guest TX */
2228 virtio_dev_tx_zcp(dev);
2230 /* Move to the next device in the list */
2231 dev_ll = dev_ll->next;
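
/*
 * Illustrative sketch (not part of the sample): the drain_tsc computation
 * in switch_worker_zcp() converts a microsecond interval into TSC cycles,
 * rounding the cycles-per-microsecond figure up so a short interval never
 * truncates to zero cycles. The helper name is hypothetical.
 */
static inline uint64_t __attribute__((unused))
example_us_to_tsc(uint64_t us)
{
	return (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * us;
}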
2240 * Add an entry to a used linked list. A free entry must first be found
2241 * in the free linked list using get_data_ll_free_entry();
2244 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2245 struct virtio_net_data_ll *ll_dev)
2247 struct virtio_net_data_ll *ll = *ll_root_addr;
2249 /* Set next as NULL and use a compiler barrier to avoid reordering. */
2250 ll_dev->next = NULL;
2251 rte_compiler_barrier();
2253 /* If ll == NULL then this is the first device. */
2255 /* Increment to the tail of the linked list. */
2256 while (ll->next != NULL)
2257 ll = ll->next;
2261 *ll_root_addr = ll_dev;
2266 * Remove an entry from a used linked list. The entry must then be added to
2267 * the free linked list using put_data_ll_free_entry().
2270 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2271 struct virtio_net_data_ll *ll_dev,
2272 struct virtio_net_data_ll *ll_dev_last)
2274 struct virtio_net_data_ll *ll = *ll_root_addr;
2276 if (unlikely((ll == NULL) || (ll_dev == NULL)))
2280 *ll_root_addr = ll_dev->next;
2282 if (likely(ll_dev_last != NULL))
2283 ll_dev_last->next = ll_dev->next;
2285 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2289 * Find and return an entry from the free linked list.
2291 static struct virtio_net_data_ll *
2292 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2294 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2295 struct virtio_net_data_ll *ll_dev;
2297 if (ll_free == NULL)
2301 *ll_root_addr = ll_free->next;
2307 * Place an entry back on to the free linked list.
2310 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2311 struct virtio_net_data_ll *ll_dev)
2313 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2318 ll_dev->next = ll_free;
2319 *ll_root_addr = ll_dev;
2323 * Creates a linked list of a given size.
2325 static struct virtio_net_data_ll *
2326 alloc_data_ll(uint32_t size)
2328 struct virtio_net_data_ll *ll_new;
2331 /* Malloc and then chain the linked list. */
2332 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2333 if (ll_new == NULL) {
2334 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2338 for (i = 0; i < size - 1; i++) {
2339 ll_new[i].vdev = NULL;
2340 ll_new[i].next = &ll_new[i+1];
2342 ll_new[i].next = NULL;
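
/*
 * Illustrative sketch (not part of the sample): the intended lifecycle of
 * the linked-list helpers above. An entry moves from the free list to the
 * used list when a device is added, and back again on removal. The
 * function name is hypothetical, and the NULL predecessor passed to
 * rm_data_ll_entry() assumes the entry sits at the head of the used list.
 */
static void __attribute__((unused))
example_ll_lifecycle(struct vhost_dev *vdev)
{
	struct virtio_net_data_ll *entry;

	entry = get_data_ll_free_entry(&ll_root_free);
	if (entry == NULL)
		return; /* Free list exhausted: device limit reached. */

	entry->vdev = vdev;
	add_data_ll_entry(&ll_root_used, entry);

	/* ... later, on removal: */
	rm_data_ll_entry(&ll_root_used, entry, NULL);
	put_data_ll_free_entry(&ll_root_free, entry);
}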
2348 * Create the main linked list along with each individual core's linked list. A used and a free list
2349 * are created to manage entries.
2356 RTE_LCORE_FOREACH_SLAVE(lcore) {
2357 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2358 if (lcore_info[lcore].lcore_ll == NULL) {
2359 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2363 lcore_info[lcore].lcore_ll->device_num = 0;
2364 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2365 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2366 if (num_devices % num_switching_cores)
2367 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2369 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2372 /* Allocate devices up to a maximum of MAX_DEVICES. */
2373 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
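
/*
 * Illustrative sketch (not part of the sample): the per-core free list
 * above is sized to ceil(num_devices / num_switching_cores), which the
 * remainder test expresses. The same value can be computed directly; the
 * helper name is hypothetical.
 */
static inline uint32_t __attribute__((unused))
example_entries_per_core(uint32_t devices, uint32_t cores)
{
	return (devices + cores - 1) / cores;
}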
2379 * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2380 * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2381 * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
2384 destroy_device (volatile struct virtio_net *dev)
2386 struct virtio_net_data_ll *ll_lcore_dev_cur;
2387 struct virtio_net_data_ll *ll_main_dev_cur;
2388 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2389 struct virtio_net_data_ll *ll_main_dev_last = NULL;
2390 struct vhost_dev *vdev;
2393 dev->flags &= ~VIRTIO_DEV_RUNNING;
2395 vdev = (struct vhost_dev *)dev->priv;
2396 /* Set the remove flag. */
2398 while(vdev->ready != DEVICE_SAFE_REMOVE) {
2402 /* Search for entry to be removed from lcore ll */
2403 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2404 while (ll_lcore_dev_cur != NULL) {
2405 if (ll_lcore_dev_cur->vdev == vdev) {
2408 ll_lcore_dev_last = ll_lcore_dev_cur;
2409 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2413 if (ll_lcore_dev_cur == NULL) {
2414 RTE_LOG(ERR, VHOST_CONFIG,
2415 "(%"PRIu64") Failed to find the dev to be destroy.\n",
2420 /* Search for entry to be removed from main ll */
2421 ll_main_dev_cur = ll_root_used;
2422 ll_main_dev_last = NULL;
2423 while (ll_main_dev_cur != NULL) {
2424 if (ll_main_dev_cur->vdev == vdev) {
2427 ll_main_dev_last = ll_main_dev_cur;
2428 ll_main_dev_cur = ll_main_dev_cur->next;
2432 /* Remove entries from the lcore and main ll. */
2433 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2434 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2436 /* Set the dev_removal_flag on each lcore. */
2437 RTE_LCORE_FOREACH_SLAVE(lcore) {
2438 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2442 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2443 * the cores can no longer access the device removed from the linked lists and that
2444 * it is no longer in use.
2446 RTE_LCORE_FOREACH_SLAVE(lcore) {
2447 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2452 /* Add the entries back to the lcore and main free ll.*/
2453 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2454 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2456 /* Decrement the number of devices on the lcore. */
2457 lcore_info[vdev->coreid].lcore_ll->device_num--;
2459 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2462 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2464 /* Stop the RX queue. */
2465 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2466 LOG_DEBUG(VHOST_CONFIG,
2467 "(%"PRIu64") In destroy_device: Failed to stop "
2473 LOG_DEBUG(VHOST_CONFIG,
2474 "(%"PRIu64") in destroy_device: Start put mbuf in "
2475 "mempool back to ring for RX queue: %d\n",
2476 dev->device_fh, vdev->vmdq_rx_q);
2478 mbuf_destroy_zcp(vpool);
2480 /* Stop the TX queue. */
2481 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2482 LOG_DEBUG(VHOST_CONFIG,
2483 "(%"PRIu64") In destroy_device: Failed to "
2484 "stop tx queue:%d\n",
2485 dev->device_fh, vdev->vmdq_rx_q);
2488 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2490 LOG_DEBUG(VHOST_CONFIG,
2491 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2492 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2493 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2496 mbuf_destroy_zcp(vpool);
2497 rte_free(vdev->regions_hpa);
2504 * Calculate the number of physically contiguous sub-regions within one
2505 * region whose vhost virtual address range is contiguous. The region
2506 * starts at vva_start and is 'size' bytes long.
2509 check_hpa_regions(uint64_t vva_start, uint64_t size)
2511 uint32_t i, nregions = 0, page_size = getpagesize();
2512 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2513 if (vva_start % page_size) {
2514 LOG_DEBUG(VHOST_CONFIG,
2515 "in check_countinous: vva start(%p) mod page_size(%d) "
2517 (void *)(uintptr_t)vva_start, page_size);
2520 if (size % page_size) {
2521 LOG_DEBUG(VHOST_CONFIG,
2522 "in check_countinous: "
2523 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2527 for (i = 0; i < size - page_size; i = i + page_size) {
2529 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2530 next_phys_addr = rte_mem_virt2phy(
2531 (void *)(uintptr_t)(vva_start + i + page_size));
2532 if ((cur_phys_addr + page_size) != next_phys_addr) {
2534 LOG_DEBUG(VHOST_CONFIG,
2535 "in check_continuous: hva addr:(%p) is not "
2536 "continuous with hva addr:(%p), diff:%d\n",
2537 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2538 (void *)(uintptr_t)(vva_start + (uint64_t)i
2539 + page_size), page_size);
2540 LOG_DEBUG(VHOST_CONFIG,
2541 "in check_continuous: hpa addr:(%p) is not "
2542 "continuous with hpa addr:(%p), "
2543 "diff:(%"PRIu64")\n",
2544 (void *)(uintptr_t)cur_phys_addr,
2545 (void *)(uintptr_t)next_phys_addr,
2546 (next_phys_addr-cur_phys_addr));
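
/*
 * Illustrative sketch (not part of the sample): the contiguity test in
 * check_hpa_regions() reduces to comparing the physical addresses of two
 * virtually adjacent pages. The helper name is hypothetical.
 */
static inline int __attribute__((unused))
example_pages_phys_contiguous(uint64_t vva, uint32_t page_size)
{
	uint64_t cur = rte_mem_virt2phy((void *)(uintptr_t)vva);
	uint64_t next = rte_mem_virt2phy((void *)(uintptr_t)(vva + page_size));

	/* Physically contiguous iff the next page follows immediately. */
	return (cur + page_size) == next;
}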
2553 * Divide each region whose vhost virtual address range is contiguous into
2554 * sub-regions such that the physical addresses within each sub-region are
2555 * contiguous, and fill the offset (to GPA), size, etc. information of each
2556 * sub-region into regions_hpa.
2559 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2561 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2562 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2564 if (mem_region_hpa == NULL)
2567 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2568 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2569 virtio_memory->regions[regionidx].address_offset;
2570 mem_region_hpa[regionidx_hpa].guest_phys_address
2571 = virtio_memory->regions[regionidx].guest_phys_address;
2572 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2573 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2574 mem_region_hpa[regionidx_hpa].guest_phys_address;
2575 LOG_DEBUG(VHOST_CONFIG,
2576 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2579 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2580 LOG_DEBUG(VHOST_CONFIG,
2581 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2584 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2586 i < virtio_memory->regions[regionidx].memory_size -
2589 cur_phys_addr = rte_mem_virt2phy(
2590 (void *)(uintptr_t)(vva_start + i));
2591 next_phys_addr = rte_mem_virt2phy(
2592 (void *)(uintptr_t)(vva_start +
2594 if ((cur_phys_addr + page_size) != next_phys_addr) {
2595 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2596 mem_region_hpa[regionidx_hpa].guest_phys_address +
2598 mem_region_hpa[regionidx_hpa].memory_size
2600 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2601 "phys addr end [%d]:(%p)\n",
2604 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2605 LOG_DEBUG(VHOST_CONFIG,
2606 "in fill_hpa_regions: guest phys addr "
2610 (mem_region_hpa[regionidx_hpa].memory_size));
2611 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2612 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2614 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2616 mem_region_hpa[regionidx_hpa].guest_phys_address;
2617 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2618 " phys addr start[%d]:(%p)\n",
2621 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2622 LOG_DEBUG(VHOST_CONFIG,
2623 "in fill_hpa_regions: host phys addr "
2627 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2633 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2634 = mem_region_hpa[regionidx_hpa].guest_phys_address
2636 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2637 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2638 "[%d]:(%p)\n", regionidx_hpa,
2640 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2641 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2642 "[%d]:(%p)\n", regionidx_hpa,
2644 (mem_region_hpa[regionidx_hpa].memory_size));
2647 return regionidx_hpa;
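
/*
 * Illustrative sketch (not part of the sample): once regions_hpa has been
 * filled, translating a guest physical address that falls inside a
 * sub-region is a bounds check plus the per-sub-region offset, mirroring
 * what the data-path gpa_to_hpa() helper does. The function name is
 * hypothetical.
 */
static inline uint64_t __attribute__((unused))
example_gpa_to_hpa(struct vhost_dev *vdev, uint64_t gpa)
{
	uint32_t i;

	for (i = 0; i < vdev->nregions_hpa; i++) {
		if (gpa >= vdev->regions_hpa[i].guest_phys_address &&
			gpa < vdev->regions_hpa[i].guest_phys_address_end)
			return gpa +
				vdev->regions_hpa[i].host_phys_addr_offset;
	}

	return 0; /* No matching sub-region found. */
}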
2651 * A new device is added to a data core. First the device is added to the main linked list
2652 * and then allocated to a specific data core.
2655 new_device (struct virtio_net *dev)
2657 struct virtio_net_data_ll *ll_dev;
2658 int lcore, core_add = 0;
2659 uint32_t device_num_min = num_devices;
2660 struct vhost_dev *vdev;
2663 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2665 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2673 vdev->nregions_hpa = dev->mem->nregions;
2674 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2675 vdev->nregions_hpa
2676 += check_hpa_regions(
2677 dev->mem->regions[regionidx].guest_phys_address
2678 + dev->mem->regions[regionidx].address_offset,
2679 dev->mem->regions[regionidx].memory_size);
2683 vdev->regions_hpa = rte_calloc("vhost hpa region",
2685 sizeof(struct virtio_memory_regions_hpa),
2686 RTE_CACHE_LINE_SIZE);
2687 if (vdev->regions_hpa == NULL) {
2688 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2694 if (fill_hpa_memory_regions(
2695 vdev->regions_hpa, dev->mem
2696 ) != vdev->nregions_hpa) {
2698 RTE_LOG(ERR, VHOST_CONFIG,
2699 "hpa memory regions number mismatch: "
2700 "[%d]\n", vdev->nregions_hpa);
2701 rte_free(vdev->regions_hpa);
2708 /* Add device to main ll */
2709 ll_dev = get_data_ll_free_entry(&ll_root_free);
2710 if (ll_dev == NULL) {
2711 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2712 "of %d devices per core has been reached\n",
2713 dev->device_fh, num_devices);
2714 if (vdev->regions_hpa)
2715 rte_free(vdev->regions_hpa);
2719 ll_dev->vdev = vdev;
2720 add_data_ll_entry(&ll_root_used, ll_dev);
2721 vdev->vmdq_rx_q
2722 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2725 uint32_t index = vdev->vmdq_rx_q;
2726 uint32_t count_in_ring, i;
2727 struct mbuf_table *tx_q;
2729 count_in_ring = rte_ring_count(vpool_array[index].ring);
2731 LOG_DEBUG(VHOST_CONFIG,
2732 "(%"PRIu64") in new_device: mbuf count in mempool "
2733 "before attach is: %d\n",
2735 rte_mempool_count(vpool_array[index].pool));
2736 LOG_DEBUG(VHOST_CONFIG,
2737 "(%"PRIu64") in new_device: mbuf count in ring "
2738 "before attach is : %d\n",
2739 dev->device_fh, count_in_ring);
2742 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2744 for (i = 0; i < count_in_ring; i++)
2745 attach_rxmbuf_zcp(dev);
2747 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2748 "mempool after attach is: %d\n",
2750 rte_mempool_count(vpool_array[index].pool));
2751 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2752 "ring after attach is : %d\n",
2754 rte_ring_count(vpool_array[index].ring));
2756 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2757 tx_q->txq_id = vdev->vmdq_rx_q;
2759 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2760 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2762 LOG_DEBUG(VHOST_CONFIG,
2763 "(%"PRIu64") In new_device: Failed to start "
2765 dev->device_fh, vdev->vmdq_rx_q);
2767 mbuf_destroy_zcp(vpool);
2768 rte_free(vdev->regions_hpa);
2773 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2774 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2776 LOG_DEBUG(VHOST_CONFIG,
2777 "(%"PRIu64") In new_device: Failed to start "
2779 dev->device_fh, vdev->vmdq_rx_q);
2781 /* Stop the TX queue. */
2782 if (rte_eth_dev_tx_queue_stop(ports[0],
2783 vdev->vmdq_rx_q) != 0) {
2784 LOG_DEBUG(VHOST_CONFIG,
2785 "(%"PRIu64") In new_device: Failed to "
2786 "stop tx queue:%d\n",
2787 dev->device_fh, vdev->vmdq_rx_q);
2790 mbuf_destroy_zcp(vpool);
2791 rte_free(vdev->regions_hpa);
2798 /* Reset the ready flag. */
2799 vdev->ready = DEVICE_MAC_LEARNING;
2802 /* Find a suitable lcore to add the device. */
2803 RTE_LCORE_FOREACH_SLAVE(lcore) {
2804 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2805 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2809 /* Add device to lcore ll */
2810 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2811 if (ll_dev == NULL) {
2812 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2813 vdev->ready = DEVICE_SAFE_REMOVE;
2814 destroy_device(dev);
2815 rte_free(vdev->regions_hpa);
2819 ll_dev->vdev = vdev;
2820 vdev->coreid = core_add;
2822 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2824 /* Initialize device stats */
2825 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2827 /* Disable notifications. */
2828 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2829 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2830 lcore_info[vdev->coreid].lcore_ll->device_num++;
2831 dev->flags |= VIRTIO_DEV_RUNNING;
2833 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2839 * These callbacks allow devices to be added to the data core when configuration
2840 * has fully completed.
2842 static const struct virtio_net_device_ops virtio_net_device_ops =
2844 .new_device = new_device,
2845 .destroy_device = destroy_device,
2849 * This is a thread that will wake up after a period to print stats if the user has enabled them.
2855 struct virtio_net_data_ll *dev_ll;
2856 uint64_t tx_dropped, rx_dropped;
2857 uint64_t tx, tx_total, rx, rx_total;
2859 const char clr[] = { 27, '[', '2', 'J', '\0' };
2860 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2863 sleep(enable_stats);
2865 /* Clear screen and move to top left */
2866 printf("%s%s", clr, top_left);
2868 printf("\nDevice statistics ====================================");
2870 dev_ll = ll_root_used;
2871 while (dev_ll != NULL) {
2872 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2873 tx_total = dev_statistics[device_fh].tx_total;
2874 tx = dev_statistics[device_fh].tx;
2875 tx_dropped = tx_total - tx;
2876 if (zero_copy == 0) {
2877 rx_total = rte_atomic64_read(
2878 &dev_statistics[device_fh].rx_total_atomic);
2879 rx = rte_atomic64_read(
2880 &dev_statistics[device_fh].rx_atomic);
2882 rx_total = dev_statistics[device_fh].rx_total;
2883 rx = dev_statistics[device_fh].rx;
2885 rx_dropped = rx_total - rx;
2887 printf("\nStatistics for device %"PRIu32" ------------------------------"
2888 "\nTX total: %"PRIu64""
2889 "\nTX dropped: %"PRIu64""
2890 "\nTX successful: %"PRIu64""
2891 "\nRX total: %"PRIu64""
2892 "\nRX dropped: %"PRIu64""
2893 "\nRX successful: %"PRIu64"",
2902 dev_ll = dev_ll->next;
2904 printf("\n======================================================\n");
2909 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2910 char *ring_name, uint32_t nb_mbuf)
2912 vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2913 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2914 if (vpool_array[index].pool != NULL) {
2915 vpool_array[index].ring
2916 = rte_ring_create(ring_name,
2917 rte_align32pow2(nb_mbuf + 1),
2918 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2919 if (likely(vpool_array[index].ring != NULL)) {
2920 LOG_DEBUG(VHOST_CONFIG,
2921 "in setup_mempool_tbl: mbuf count in "
2923 rte_mempool_count(vpool_array[index].pool));
2924 LOG_DEBUG(VHOST_CONFIG,
2925 "in setup_mempool_tbl: mbuf count in "
2927 rte_ring_count(vpool_array[index].ring));
2929 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2933 /* Need to consider headroom. */
2934 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2936 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
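
/*
 * Illustrative sketch (not part of the sample): rte_ring sizes must be
 * powers of two and one slot always stays empty, so a ring that must hold
 * nb_mbuf entries is created with rte_align32pow2(nb_mbuf + 1) slots,
 * exactly as setup_mempool_tbl() does above. The helper name is
 * hypothetical.
 */
static inline uint32_t __attribute__((unused))
example_ring_size(uint32_t nb_mbuf)
{
	return rte_align32pow2(nb_mbuf + 1);
}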
2940 /* When we receive an INT signal, unregister the vhost driver. */
2942 sigint_handler(__rte_unused int signum)
2944 /* Unregister vhost driver. */
2945 int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2947 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2952 * Main function: performs initialization and calls the per-lcore functions. The CUSE
2953 * device is also registered here to handle the IOCTLs.
2956 main(int argc, char *argv[])
2958 struct rte_mempool *mbuf_pool = NULL;
2959 unsigned lcore_id, core_id = 0;
2960 unsigned nb_ports, valid_num_ports;
2964 static pthread_t tid;
2965 char thread_name[RTE_MAX_THREAD_NAME_LEN];
2967 signal(SIGINT, sigint_handler);
2970 ret = rte_eal_init(argc, argv);
2972 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2976 /* parse app arguments */
2977 ret = us_vhost_parse_args(argc, argv);
2979 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2981 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2982 if (rte_lcore_is_enabled(lcore_id))
2983 lcore_ids[core_id++] = lcore_id;
2985 if (rte_lcore_count() > RTE_MAX_LCORE)
2986 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2988 /* Set the number of switching cores available. */
2989 num_switching_cores = rte_lcore_count()-1;
2991 /* Get the number of physical ports. */
2992 nb_ports = rte_eth_dev_count();
2993 if (nb_ports > RTE_MAX_ETHPORTS)
2994 nb_ports = RTE_MAX_ETHPORTS;
2997 * Update the global variable num_ports and the global array ports,
2998 * and derive valid_num_ports from the number of ports in the system.
3000 valid_num_ports = check_ports_num(nb_ports);
3002 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
3003 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
3004 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
3008 if (zero_copy == 0) {
3009 /* Create the mbuf pool. */
3010 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
3011 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
3012 0, MBUF_DATA_SIZE, rte_socket_id());
3013 if (mbuf_pool == NULL)
3014 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3016 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3017 vpool_array[queue_id].pool = mbuf_pool;
3019 if (vm2vm_mode == VM2VM_HARDWARE) {
3020 /* Enable VT loop back to let L2 switch to do it. */
3021 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3022 LOG_DEBUG(VHOST_CONFIG,
3023 "Enable loop back for L2 switch in vmdq.\n");
3027 char pool_name[RTE_MEMPOOL_NAMESIZE];
3028 char ring_name[RTE_MEMPOOL_NAMESIZE];
3030 nb_mbuf = num_rx_descriptor
3031 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3032 + num_switching_cores * MAX_PKT_BURST;
3034 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3035 snprintf(pool_name, sizeof(pool_name),
3036 "rxmbuf_pool_%u", queue_id);
3037 snprintf(ring_name, sizeof(ring_name),
3038 "rxmbuf_ring_%u", queue_id);
3039 setup_mempool_tbl(rte_socket_id(), queue_id,
3040 pool_name, ring_name, nb_mbuf);
3043 nb_mbuf = num_tx_descriptor
3044 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3045 + num_switching_cores * MAX_PKT_BURST;
3047 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3048 snprintf(pool_name, sizeof(pool_name),
3049 "txmbuf_pool_%u", queue_id);
3050 snprintf(ring_name, sizeof(ring_name),
3051 "txmbuf_ring_%u", queue_id);
3052 setup_mempool_tbl(rte_socket_id(),
3053 (queue_id + MAX_QUEUES),
3054 pool_name, ring_name, nb_mbuf);
3057 if (vm2vm_mode == VM2VM_HARDWARE) {
3058 /* Enable VT loop back to let L2 switch to do it. */
3059 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3060 LOG_DEBUG(VHOST_CONFIG,
3061 "Enable loop back for L2 switch in vmdq.\n");
3064 /* Set log level. */
3065 rte_set_log_level(LOG_LEVEL);
3067 /* initialize all ports */
3068 for (portid = 0; portid < nb_ports; portid++) {
3069 /* skip ports that are not enabled */
3070 if ((enabled_port_mask & (1 << portid)) == 0) {
3071 RTE_LOG(INFO, VHOST_PORT,
3072 "Skipping disabled port %d\n", portid);
3075 if (port_init(portid) != 0)
3076 rte_exit(EXIT_FAILURE,
3077 "Cannot initialize network ports\n");
3080 /* Initialize all linked lists. */
3081 if (init_data_ll() == -1)
3082 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3084 /* Initialize device stats */
3085 memset(&dev_statistics, 0, sizeof(dev_statistics));
3087 /* Enable stats if the user option is set. */
3089 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3091 rte_exit(EXIT_FAILURE,
3092 "Cannot create print-stats thread\n");
3094 /* Set thread_name for aid in debugging. */
3095 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3096 ret = rte_thread_setname(tid, thread_name);
3098 RTE_LOG(ERR, VHOST_CONFIG,
3099 "Cannot set print-stats name\n");
3102 /* Launch all data cores. */
3103 if (zero_copy == 0) {
3104 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3105 rte_eal_remote_launch(switch_worker,
3106 mbuf_pool, lcore_id);
3109 uint32_t count_in_mempool, index, i;
3110 for (index = 0; index < 2*MAX_QUEUES; index++) {
3111 /* For all RX and TX queues. */
3113 = rte_mempool_count(vpool_array[index].pool);
3116 * Transfer all unattached mbufs from vpool.pool
3119 for (i = 0; i < count_in_mempool; i++) {
3120 struct rte_mbuf *mbuf
3121 = __rte_mbuf_raw_alloc(
3122 vpool_array[index].pool);
3123 rte_ring_sp_enqueue(vpool_array[index].ring,
3127 LOG_DEBUG(VHOST_CONFIG,
3128 "in main: mbuf count in mempool at initial "
3129 "is: %d\n", count_in_mempool);
3130 LOG_DEBUG(VHOST_CONFIG,
3131 "in main: mbuf count in ring at initial is :"
3133 rte_ring_count(vpool_array[index].ring));
3136 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3137 rte_eal_remote_launch(switch_worker_zcp, NULL,
3142 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3144 /* Register the vhost (CUSE or user) driver to handle vhost messages. */
3145 ret = rte_vhost_driver_register((char *)&dev_basename);
3147 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3149 rte_vhost_driver_callback_register(&virtio_net_device_ops);
3151 /* Start CUSE session. */
3152 rte_vhost_driver_session_start();