vhost: combine select with sleep
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 512
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
65                             (num_switching_cores*MAX_PKT_BURST) + \
66                             (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
67                             (num_switching_cores*MBUF_CACHE_SIZE))
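/*
 * Illustrative sizing (hypothetical values): with MAX_QUEUES = 512,
 * RTE_TEST_RX_DESC_DEFAULT = 1024 and num_switching_cores = 2, this
 * expands to 512*1024 + 2*32 + 2*512 + 2*128 = 525632 mbufs per port.
 */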
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers, and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79         + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81
82 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
84
85 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
87
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX                       1
93 #define DEVICE_SAFE_REMOVE      2
94
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102
103 /*
104  * These two macros need refinement for the legacy and DPDK-based front ends:
105  * take the max vring avail descriptors/entries from the guest, subtract
106  * MAX_PKT_BURST, then round to a power of 2.
107  */
108 /*
109  * For the legacy front end: 128 descriptors,
110  * half for the virtio header, the other half for the mbuf.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117                 + sizeof(struct rte_mbuf)))
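/*
 * (Assumption based on the zero-copy paths in this file: these 4 bytes are
 * used to stash the vring descriptor index that a zero-copy mbuf maps to,
 * so it can be recovered when the mbuf is freed.)
 */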
118
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
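/*
 * x & (x - 1) clears the lowest set bit, so the result is zero exactly when
 * x has at most one bit set, e.g. 64 & 63 == 0 but 48 & 47 == 32.
 * Note the test also accepts x == 0.
 */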
121
122 #define INVALID_PORT_ID 0xFF
123
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
138
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
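/* E.g. with a 64-byte cache line and the 16-byte struct vring_desc,
 * this evaluates to 4 descriptors per cache line. */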
141
142 #define MBUF_EXT_MEM(mb)   (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
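/* True when the data buffer is not attached to the mbuf itself;
 * RTE_MBUF_FROM_BADDR recovers the owning mbuf from a buffer address.
 * (Used here, as far as this file shows, to tell zero-copy guest buffers
 * from ordinary pool mbufs.) */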
143
144 /* mask of enabled ports */
145 static uint32_t enabled_port_mask = 0;
146
147 /* Promiscuous mode */
148 static uint32_t promiscuous;
149
150 /* Number of switching cores enabled */
151 static uint32_t num_switching_cores = 0;
152
153 /* Number of devices/queues to support */
154 static uint32_t num_queues = 0;
155 static uint32_t num_devices;
156
157 /*
158  * Enable zero copy: packet buffers are DMA'd directly to/from guest memory
159  * via the hw descriptors; disabled by default.
160  */
161 static uint32_t zero_copy;
162 static int mergeable;
163
164 /* Do VLAN strip on the host; enabled by default */
165 static uint32_t vlan_strip = 1;
166
167 /* Number of descriptors to apply */
168 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
169 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
170
171 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
172 #define MAX_RING_DESC 4096
173
174 struct vpool {
175         struct rte_mempool *pool;
176         struct rte_ring *ring;
177         uint32_t buf_size;
178 } vpool_array[MAX_QUEUES+MAX_QUEUES];
179
180 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
181 typedef enum {
182         VM2VM_DISABLED = 0,
183         VM2VM_SOFTWARE = 1,
184         VM2VM_HARDWARE = 2,
185         VM2VM_LAST
186 } vm2vm_type;
187 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
188
189 /* The type of host physical address translated from guest physical address. */
190 typedef enum {
191         PHYS_ADDR_CONTINUOUS = 0,
192         PHYS_ADDR_CROSS_SUBREG = 1,
193         PHYS_ADDR_INVALID = 2,
194         PHYS_ADDR_LAST
195 } hpa_type;
196
197 /* Enable stats. */
198 static uint32_t enable_stats = 0;
199 /* Enable retries on RX. */
200 static uint32_t enable_retry = 1;
201 /* Specify the timeout (in microseconds) between retries on RX. */
202 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
203 /* Specify the number of retries on RX. */
204 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
205
206 /* Character device basename. Can be set by user. */
207 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
208
209 /* Empty VMDQ configuration structure. Filled in programmatically. */
210 static struct rte_eth_conf vmdq_conf_default = {
211         .rxmode = {
212                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
213                 .split_hdr_size = 0,
214                 .header_split   = 0, /**< Header Split disabled */
215                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
216                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
217                 /*
218                  * This is necessary for 1G NICs such as the I350; it fixes
219                  * a bug where IPv4 forwarding in the guest could not
220                  * forward packets from one virtio device to another.
221                  */
222                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
223                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
224                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
225         },
226
227         .txmode = {
228                 .mq_mode = ETH_MQ_TX_NONE,
229         },
230         .rx_adv_conf = {
231                 /*
232                  * should be overridden separately in code with
233                  * appropriate values
234                  */
235                 .vmdq_rx_conf = {
236                         .nb_queue_pools = ETH_8_POOLS,
237                         .enable_default_pool = 0,
238                         .default_pool = 0,
239                         .nb_pool_maps = 0,
240                         .pool_map = {{0, 0},},
241                 },
242         },
243 };
244
245 static unsigned lcore_ids[RTE_MAX_LCORE];
246 static uint8_t ports[RTE_MAX_ETHPORTS];
247 static unsigned num_ports = 0; /**< The number of ports specified in command line */
248 static uint16_t num_pf_queues, num_vmdq_queues;
249 static uint16_t vmdq_pool_base, vmdq_queue_base;
250 static uint16_t queues_per_pool;
251
252 static const uint16_t external_pkt_default_vlan_tag = 2000;
253 const uint16_t vlan_tags[] = {
254         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
255         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
256         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
257         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
258         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
259         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
260         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
261         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
262 };
263
264 /* ethernet addresses of ports */
265 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
266
267 /* heads for the main used and free linked lists for the data path. */
268 static struct virtio_net_data_ll *ll_root_used = NULL;
269 static struct virtio_net_data_ll *ll_root_free = NULL;
270
271 /* Array of data core structures containing information on individual core linked lists. */
272 static struct lcore_info lcore_info[RTE_MAX_LCORE];
273
274 /* Used for queueing bursts of TX packets. */
275 struct mbuf_table {
276         unsigned len;
277         unsigned txq_id;
278         struct rte_mbuf *m_table[MAX_PKT_BURST];
279 };
280
281 /* TX queue for each data core. */
282 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
283
284 /* TX queue for each virtio device for zero copy. */
285 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
286
287 /* Vlan header struct used to insert vlan tags on TX. */
288 struct vlan_ethhdr {
289         unsigned char   h_dest[ETH_ALEN];
290         unsigned char   h_source[ETH_ALEN];
291         __be16          h_vlan_proto;
292         __be16          h_vlan_TCI;
293         __be16          h_vlan_encapsulated_proto;
294 };
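/*
 * 802.1Q refresher: h_vlan_proto holds the VLAN ethertype (0x8100),
 * h_vlan_TCI packs PCP (3 bits), DEI (1 bit) and the 12-bit VLAN ID, and
 * h_vlan_encapsulated_proto is the ethertype of the inner payload.
 */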
295
296 /* IPv4 Header */
297 struct ipv4_hdr {
298         uint8_t  version_ihl;           /**< version and header length */
299         uint8_t  type_of_service;       /**< type of service */
300         uint16_t total_length;          /**< length of packet */
301         uint16_t packet_id;             /**< packet ID */
302         uint16_t fragment_offset;       /**< fragmentation offset */
303         uint8_t  time_to_live;          /**< time to live */
304         uint8_t  next_proto_id;         /**< protocol ID */
305         uint16_t hdr_checksum;          /**< header checksum */
306         uint32_t src_addr;              /**< source address */
307         uint32_t dst_addr;              /**< destination address */
308 } __attribute__((__packed__));
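/*
 * version_ihl packs the IP version in the high nibble and the header length
 * (IHL, in 32-bit words) in the low nibble; e.g. 0x45 means IPv4 with a
 * 20-byte header.
 */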
309
310 /* Header lengths. */
311 #define VLAN_HLEN       4
312 #define VLAN_ETH_HLEN   18
313
314 /* Per-device statistics struct */
315 struct device_statistics {
316         uint64_t tx_total;
317         rte_atomic64_t rx_total_atomic;
318         uint64_t rx_total;
319         uint64_t tx;
320         rte_atomic64_t rx_atomic;
321         uint64_t rx;
322 } __rte_cache_aligned;
323 struct device_statistics dev_statistics[MAX_DEVICES];
324
325 /*
326  * Builds up the correct configuration for VMDQ VLAN pool map
327  * according to the pool & queue limits.
328  */
329 static inline int
330 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
331 {
332         struct rte_eth_vmdq_rx_conf conf;
333         struct rte_eth_vmdq_rx_conf *def_conf =
334                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
335         unsigned i;
336
337         memset(&conf, 0, sizeof(conf));
338         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
339         conf.nb_pool_maps = num_devices;
340         conf.enable_loop_back = def_conf->enable_loop_back;
341         conf.rx_mode = def_conf->rx_mode;
342
343         for (i = 0; i < conf.nb_pool_maps; i++) {
344                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
345                 conf.pool_map[i].pools = (1UL << i);
346         }
347
348         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
349         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
350                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
351         return 0;
352 }
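/*
 * Illustrative result (assuming num_devices == 8): pool_map[i] becomes
 * { vlan_tags[i], 1 << i }, i.e. VLAN 1000 -> pool 0, VLAN 1001 -> pool 1,
 * and so on, one dedicated pool per device.
 */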
353
354 /*
355  * Validate the device number against the max pool number obtained from
356  * dev_info. If the device number is invalid, print an error message and
357  * return -1. Each device must have its own pool.
358  */
359 static inline int
360 validate_num_devices(uint32_t max_nb_devices)
361 {
362         if (num_devices > max_nb_devices) {
363                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
364                 return -1;
365         }
366         return 0;
367 }
368
369 /*
370  * Initialises a given port using global settings, with the RX buffers
371  * coming from the vpool_array mempools.
372  */
373 static inline int
374 port_init(uint8_t port)
375 {
376         struct rte_eth_dev_info dev_info;
377         struct rte_eth_conf port_conf;
378         struct rte_eth_rxconf *rxconf;
379         struct rte_eth_txconf *txconf;
380         int16_t rx_rings, tx_rings;
381         uint16_t rx_ring_size, tx_ring_size;
382         int retval;
383         uint16_t q;
384
385         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
386         rte_eth_dev_info_get(port, &dev_info);
387
388         if (dev_info.max_rx_queues > MAX_QUEUES) {
389                 rte_exit(EXIT_FAILURE,
390                         "please define MAX_QUEUES no less than %u in %s\n",
391                         dev_info.max_rx_queues, __FILE__);
392         }
393
394         rxconf = &dev_info.default_rxconf;
395         txconf = &dev_info.default_txconf;
396         rxconf->rx_drop_en = 1;
397
398         /* Enable vlan offload */
399         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
400
401         /*
402          * Zero copy defers queue RX/TX start to the time when guest
403          * finishes its startup and packet buffers from that guest are
404          * available.
405          */
406         if (zero_copy) {
407                 rxconf->rx_deferred_start = 1;
408                 rxconf->rx_drop_en = 0;
409                 txconf->tx_deferred_start = 1;
410         }
411
412         /* Configure the number of supported virtio devices based on VMDQ limits */
413         num_devices = dev_info.max_vmdq_pools;
414
415         if (zero_copy) {
416                 rx_ring_size = num_rx_descriptor;
417                 tx_ring_size = num_tx_descriptor;
418                 tx_rings = dev_info.max_tx_queues;
419         } else {
420                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
421                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
422                 tx_rings = (uint16_t)rte_lcore_count();
423         }
424
425         retval = validate_num_devices(MAX_DEVICES);
426         if (retval < 0)
427                 return retval;
428
429         /* Get port configuration. */
430         retval = get_eth_conf(&port_conf, num_devices);
431         if (retval < 0)
432                 return retval;
433         /* NIC queues are divided into pf queues and vmdq queues.  */
434         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
435         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
436         num_vmdq_queues = num_devices * queues_per_pool;
437         num_queues = num_pf_queues + num_vmdq_queues;
438         vmdq_queue_base = dev_info.vmdq_queue_base;
439         vmdq_pool_base  = dev_info.vmdq_pool_base;
440         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
441                 num_pf_queues, num_devices, queues_per_pool);
442
443         if (port >= rte_eth_dev_count()) return -1;
444
445         rx_rings = (uint16_t)dev_info.max_rx_queues;
446         /* Configure ethernet device. */
447         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
448         if (retval != 0)
449                 return retval;
450
451         /* Setup the queues. */
452         for (q = 0; q < rx_rings; q ++) {
453                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
454                                                 rte_eth_dev_socket_id(port),
455                                                 rxconf,
456                                                 vpool_array[q].pool);
457                 if (retval < 0)
458                         return retval;
459         }
460         for (q = 0; q < tx_rings; q ++) {
461                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
462                                                 rte_eth_dev_socket_id(port),
463                                                 txconf);
464                 if (retval < 0)
465                         return retval;
466         }
467
468         /* Start the device. */
469         retval  = rte_eth_dev_start(port);
470         if (retval < 0) {
471                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
472                 return retval;
473         }
474
475         if (promiscuous)
476                 rte_eth_promiscuous_enable(port);
477
478         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
479         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
480         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
481                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
482                         (unsigned)port,
483                         vmdq_ports_eth_addr[port].addr_bytes[0],
484                         vmdq_ports_eth_addr[port].addr_bytes[1],
485                         vmdq_ports_eth_addr[port].addr_bytes[2],
486                         vmdq_ports_eth_addr[port].addr_bytes[3],
487                         vmdq_ports_eth_addr[port].addr_bytes[4],
488                         vmdq_ports_eth_addr[port].addr_bytes[5]);
489
490         return 0;
491 }
492
493 /*
494  * Set character device basename.
495  */
496 static int
497 us_vhost_parse_basename(const char *q_arg)
498 {
499         /* copy the basename string; strnlen() never returns more than its
500            cap, so ">=" is required to reject names that would be truncated */
501         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
502                 return -1;
503         else
504                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
505
506         return 0;
507 }
508
509 /*
510  * Parse the portmask provided at run time.
511  */
512 static int
513 parse_portmask(const char *portmask)
514 {
515         char *end = NULL;
516         unsigned long pm;
517
518         errno = 0;
519
520         /* parse hexadecimal string */
521         pm = strtoul(portmask, &end, 16);
522         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
523                 return -1;
524
525         if (pm == 0)
526                 return -1;
527
528         return pm;
529
530 }
531
532 /*
533  * Parse num options at run time.
534  */
535 static int
536 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
537 {
538         char *end = NULL;
539         unsigned long num;
540
541         errno = 0;
542
543         /* parse unsigned int string */
544         num = strtoul(q_arg, &end, 10);
545         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
546                 return -1;
547
548         if (num > max_valid_value)
549                 return -1;
550
551         return num;
552
553 }
554
555 /*
556  * Display usage
557  */
558 static void
559 us_vhost_usage(const char *prgname)
560 {
561         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
562         "               --vm2vm [0|1|2]\n"
563         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
564         "               --dev-basename <name>\n"
565         "               --nb-devices ND\n"
566         "               -p PORTMASK: Set mask for ports to be used by application\n"
567         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
568         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
569         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if rx retries are enabled\n"
570         "               --rx-retry-num [0-N]: the number of retries on rx. This takes effect only if rx retries are enabled\n"
571         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
572         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
573         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
574         "               --dev-basename: The basename to be used for the character device.\n"
575         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
576                         "zero copy\n"
577         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
578                         "used only when zero copy is enabled.\n"
579         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
580                         "used only when zero copy is enabled.\n",
581                prgname);
582 }
583
584 /*
585  * Parse the arguments given in the command line of the application.
586  */
587 static int
588 us_vhost_parse_args(int argc, char **argv)
589 {
590         int opt, ret;
591         int option_index;
592         unsigned i;
593         const char *prgname = argv[0];
594         static struct option long_option[] = {
595                 {"vm2vm", required_argument, NULL, 0},
596                 {"rx-retry", required_argument, NULL, 0},
597                 {"rx-retry-delay", required_argument, NULL, 0},
598                 {"rx-retry-num", required_argument, NULL, 0},
599                 {"mergeable", required_argument, NULL, 0},
600                 {"vlan-strip", required_argument, NULL, 0},
601                 {"stats", required_argument, NULL, 0},
602                 {"dev-basename", required_argument, NULL, 0},
603                 {"zero-copy", required_argument, NULL, 0},
604                 {"rx-desc-num", required_argument, NULL, 0},
605                 {"tx-desc-num", required_argument, NULL, 0},
606                 {NULL, 0, 0, 0},
607         };
608
609         /* Parse command line */
610         while ((opt = getopt_long(argc, argv, "p:P",
611                         long_option, &option_index)) != EOF) {
612                 switch (opt) {
613                 /* Portmask */
614                 case 'p':
615                         enabled_port_mask = parse_portmask(optarg);
616                         if (enabled_port_mask == 0) {
617                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
618                                 us_vhost_usage(prgname);
619                                 return -1;
620                         }
621                         break;
622
623                 case 'P':
624                         promiscuous = 1;
625                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
626                                 ETH_VMDQ_ACCEPT_BROADCAST |
627                                 ETH_VMDQ_ACCEPT_MULTICAST;
628                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
629
630                         break;
631
632                 case 0:
633                         /* Enable/disable vm2vm comms. */
634                         if (!strncmp(long_option[option_index].name, "vm2vm",
635                                 MAX_LONG_OPT_SZ)) {
636                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
637                                 if (ret == -1) {
638                                         RTE_LOG(INFO, VHOST_CONFIG,
639                                                 "Invalid argument for "
640                                                 "vm2vm [0|1|2]\n");
641                                         us_vhost_usage(prgname);
642                                         return -1;
643                                 } else {
644                                         vm2vm_mode = (vm2vm_type)ret;
645                                 }
646                         }
647
648                         /* Enable/disable retries on RX. */
649                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
650                                 ret = parse_num_opt(optarg, 1);
651                                 if (ret == -1) {
652                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
653                                         us_vhost_usage(prgname);
654                                         return -1;
655                                 } else {
656                                         enable_retry = ret;
657                                 }
658                         }
659
660                         /* Specify the retry delay time (in microseconds) on RX. */
661                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
662                                 ret = parse_num_opt(optarg, INT32_MAX);
663                                 if (ret == -1) {
664                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
665                                         us_vhost_usage(prgname);
666                                         return -1;
667                                 } else {
668                                         burst_rx_delay_time = ret;
669                                 }
670                         }
671
672                         /* Specify the number of retries on RX. */
673                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
674                                 ret = parse_num_opt(optarg, INT32_MAX);
675                                 if (ret == -1) {
676                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
677                                         us_vhost_usage(prgname);
678                                         return -1;
679                                 } else {
680                                         burst_rx_retry_num = ret;
681                                 }
682                         }
683
684                         /* Enable/disable RX mergeable buffers. */
685                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
686                                 ret = parse_num_opt(optarg, 1);
687                                 if (ret == -1) {
688                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
689                                         us_vhost_usage(prgname);
690                                         return -1;
691                                 } else {
692                                         mergeable = !!ret;
693                                         if (ret) {
694                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
695                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
696                                                         = JUMBO_FRAME_MAX_SIZE;
697                                         }
698                                 }
699                         }
700
701                         /* Enable/disable RX VLAN strip on host. */
702                         if (!strncmp(long_option[option_index].name,
703                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
704                                 ret = parse_num_opt(optarg, 1);
705                                 if (ret == -1) {
706                                         RTE_LOG(INFO, VHOST_CONFIG,
707                                                 "Invalid argument for VLAN strip [0|1]\n");
708                                         us_vhost_usage(prgname);
709                                         return -1;
710                                 } else {
711                                         vlan_strip = !!ret;
712                                         vmdq_conf_default.rxmode.hw_vlan_strip =
713                                                 vlan_strip;
714                                 }
715                         }
716
717                         /* Enable/disable stats. */
718                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
719                                 ret = parse_num_opt(optarg, INT32_MAX);
720                                 if (ret == -1) {
721                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
722                                         us_vhost_usage(prgname);
723                                         return -1;
724                                 } else {
725                                         enable_stats = ret;
726                                 }
727                         }
728
729                         /* Set character device basename. */
730                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
731                                 if (us_vhost_parse_basename(optarg) == -1) {
732                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
733                                         us_vhost_usage(prgname);
734                                         return -1;
735                                 }
736                         }
737
738                         /* Enable/disable rx/tx zero copy. */
739                         if (!strncmp(long_option[option_index].name,
740                                 "zero-copy", MAX_LONG_OPT_SZ)) {
741                                 ret = parse_num_opt(optarg, 1);
742                                 if (ret == -1) {
743                                         RTE_LOG(INFO, VHOST_CONFIG,
744                                                 "Invalid argument"
745                                                 " for zero-copy [0|1]\n");
746                                         us_vhost_usage(prgname);
747                                         return -1;
748                                 } else
749                                         zero_copy = ret;
750                         }
751
752                         /* Specify the descriptor number on RX. */
753                         if (!strncmp(long_option[option_index].name,
754                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
755                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
756                                 if ((ret == -1) || (!POWEROF2(ret))) {
757                                         RTE_LOG(INFO, VHOST_CONFIG,
758                                         "Invalid argument for rx-desc-num [0-N], "
759                                         "power of 2 required.\n");
760                                         us_vhost_usage(prgname);
761                                         return -1;
762                                 } else {
763                                         num_rx_descriptor = ret;
764                                 }
765                         }
766
767                         /* Specify the descriptor number on TX. */
768                         if (!strncmp(long_option[option_index].name,
769                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
770                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
771                                 if ((ret == -1) || (!POWEROF2(ret))) {
772                                         RTE_LOG(INFO, VHOST_CONFIG,
773                                         "Invalid argument for tx-desc-num [0-N], "
774                                         "power of 2 required.\n");
775                                         us_vhost_usage(prgname);
776                                         return -1;
777                                 } else {
778                                         num_tx_descriptor = ret;
779                                 }
780                         }
781
782                         break;
783
784                         /* Invalid option - print options. */
785                 default:
786                         us_vhost_usage(prgname);
787                         return -1;
788                 }
789         }
790
791         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
792                 if (enabled_port_mask & (1 << i))
793                         ports[num_ports++] = (uint8_t)i;
794         }
795
796         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
797                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
798                         "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
799                 return -1;
800         }
801
802         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
803                 RTE_LOG(INFO, VHOST_PORT,
804                         "Vhost zero copy doesn't support software vm2vm, "
805                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
806                 return -1;
807         }
808
809         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
810                 RTE_LOG(INFO, VHOST_PORT,
811                         "Vhost zero copy doesn't support jumbo frames, "
812                         "please specify '--mergeable 0' to disable the "
813                         "mergeable feature.\n");
814                 return -1;
815         }
816
817         return 0;
818 }
819
820 /*
821  * Update the global variable num_ports and the array ports according to the
822  * number of system ports, and return the number of valid ports.
823  */
824 static unsigned check_ports_num(unsigned nb_ports)
825 {
826         unsigned valid_num_ports = num_ports;
827         unsigned portid;
828
829         if (num_ports > nb_ports) {
830                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
831                         num_ports, nb_ports);
832                 num_ports = nb_ports;
833         }
834
835         for (portid = 0; portid < num_ports; portid ++) {
836                 if (ports[portid] >= nb_ports) {
837                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
838                                 ports[portid], (nb_ports - 1));
839                         ports[portid] = INVALID_PORT_ID;
840                         valid_num_ports--;
841                 }
842         }
843         return valid_num_ports;
844 }
845
846 /*
847  * Macro to print out packet contents. Wrapped in debug define so that the
848  * data path is not affected when debug is disabled.
849  */
850 #ifdef DEBUG
851 #define PRINT_PACKET(device, addr, size, header) do { \
852         char *pkt_addr = (char *)(addr); \
853         unsigned int index; \
854         char packet[MAX_PRINT_BUFF]; \
855 \
856         if ((header)) \
857                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
858         else \
859                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
860         for (index = 0; index < (size); index++) { \
861                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
862                         "%02hhx ", pkt_addr[index]); \
863         } \
864         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
865 \
866         LOG_DEBUG(VHOST_DATA, "%s", packet); \
867 } while(0)
868 #else
869 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
870 #endif
871
872 /*
873  * Function to convert guest physical addresses to vhost physical addresses.
874  * This is used to convert virtio buffer addresses.
875  */
876 static inline uint64_t __attribute__((always_inline))
877 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
878         uint32_t buf_len, hpa_type *addr_type)
879 {
880         struct virtio_memory_regions_hpa *region;
881         uint32_t regionidx;
882         uint64_t vhost_pa = 0;
883
884         *addr_type = PHYS_ADDR_INVALID;
885
886         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
887                 region = &vdev->regions_hpa[regionidx];
888                 if ((guest_pa >= region->guest_phys_address) &&
889                         (guest_pa <= region->guest_phys_address_end)) {
890                         vhost_pa = region->host_phys_addr_offset + guest_pa;
891                         if (likely((guest_pa + buf_len - 1)
892                                 <= region->guest_phys_address_end))
893                                 *addr_type = PHYS_ADDR_CONTINUOUS;
894                         else
895                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
896                         break;
897                 }
898         }
899
900         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
901                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
902                 (void *)(uintptr_t)vhost_pa);
903
904         return vhost_pa;
905 }
906
907 /*
908  * Compares a packet destination MAC address to a device MAC address.
909  */
910 static inline int __attribute__((always_inline))
911 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
912 {
913         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
914 }
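/*
 * Note: this compares the two 6-byte MACs with a single 8-byte load each and
 * masks off the top 2 bytes (MAC_ADDR_CMP keeps the low 48 bits, which on a
 * little-endian CPU are the first 6 bytes in memory), so it reads 2 bytes
 * past each address and relies on them being accessible.
 */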
915
916 /*
917  * This function learns the MAC address of the device and registers it,
918  * along with a VLAN tag, with a VMDQ pool.
919  */
920 static int
921 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
922 {
923         struct ether_hdr *pkt_hdr;
924         struct virtio_net_data_ll *dev_ll;
925         struct virtio_net *dev = vdev->dev;
926         int i, ret;
927
928         /* Learn MAC address of guest device from packet */
929         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
930
931         dev_ll = ll_root_used;
932
933         while (dev_ll != NULL) {
934                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
935                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
936                         return -1;
937                 }
938                 dev_ll = dev_ll->next;
939         }
940
941         for (i = 0; i < ETHER_ADDR_LEN; i++)
942                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
943
944         /* vlan_tag currently uses the device_id. */
945         vdev->vlan_tag = vlan_tags[dev->device_fh];
946
947         /* Print out VMDQ registration info. */
948         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
949                 dev->device_fh,
950                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
951                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
952                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
953                 vdev->vlan_tag);
954
955         /* Register the MAC address. */
956         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
957                                 (uint32_t)dev->device_fh + vmdq_pool_base);
958         if (ret)
959                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
960                                         dev->device_fh);
961
962         /* Enable stripping of the vlan tag as we handle routing. */
963         if (vlan_strip)
964                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
965                         (uint16_t)vdev->vmdq_rx_q, 1);
966
967         /* Set device as ready for RX. */
968         vdev->ready = DEVICE_RX;
969
970         return 0;
971 }
972
973 /*
974  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
975  * queue before disabling RX on the device.
976  */
977 static inline void
978 unlink_vmdq(struct vhost_dev *vdev)
979 {
980         unsigned i = 0;
981         unsigned rx_count;
982         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
983
984         if (vdev->ready == DEVICE_RX) {
985                 /*clear MAC and VLAN settings*/
986                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
987                 for (i = 0; i < 6; i++)
988                         vdev->mac_address.addr_bytes[i] = 0;
989
990                 vdev->vlan_tag = 0;
991
992                 /*Clear out the receive buffers*/
993                 rx_count = rte_eth_rx_burst(ports[0],
994                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
995
996                 while (rx_count) {
997                         for (i = 0; i < rx_count; i++)
998                                 rte_pktmbuf_free(pkts_burst[i]);
999
1000                         rx_count = rte_eth_rx_burst(ports[0],
1001                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1002                 }
1003
1004                 vdev->ready = DEVICE_MAC_LEARNING;
1005         }
1006 }
1007
1008 /*
1009  * Check if the packet destination MAC address is for a local device. If so then put
1010  * the packet on that device's RX queue. If not then return.
1011  */
1012 static inline int __attribute__((always_inline))
1013 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1014 {
1015         struct virtio_net_data_ll *dev_ll;
1016         struct ether_hdr *pkt_hdr;
1017         uint64_t ret = 0;
1018         struct virtio_net *dev = vdev->dev;
1019         struct virtio_net *tdev; /* destination virtio device */
1020
1021         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1022
1023         /*get the used devices list*/
1024         dev_ll = ll_root_used;
1025
1026         while (dev_ll != NULL) {
1027                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1028                                           &dev_ll->vdev->mac_address)) {
1029
1030                         /* Drop the packet if the TX packet is destined for the TX device. */
1031                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1032                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1033                                                         dev->device_fh);
1034                                 return 0;
1035                         }
1036                         tdev = dev_ll->vdev->dev;
1037
1038
1039                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1040
1041                         if (unlikely(dev_ll->vdev->remove)) {
1042                                 /*drop the packet if the device is marked for removal*/
1043                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1044                         } else {
1045                                 /*send the packet to the local virtio device*/
1046                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1047                                 if (enable_stats) {
1048                                         rte_atomic64_add(
1049                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1050                                         1);
1051                                         rte_atomic64_add(
1052                                         &dev_statistics[tdev->device_fh].rx_atomic,
1053                                         ret);
1054                                         dev_statistics[tdev->device_fh].tx_total++;
1055                                         dev_statistics[tdev->device_fh].tx += ret;
1056                                 }
1057                         }
1058
1059                         return 0;
1060                 }
1061                 dev_ll = dev_ll->next;
1062         }
1063
1064         return -1;
1065 }
1066
1067 /*
1068  * Check if the destination MAC of a packet belongs to a local VM,
1069  * and if so get its VLAN tag and offset.
1070  */
1071 static inline int __attribute__((always_inline))
1072 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1073         uint32_t *offset, uint16_t *vlan_tag)
1074 {
1075         struct virtio_net_data_ll *dev_ll = ll_root_used;
1076         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1077
1078         while (dev_ll != NULL) {
1079                 if ((dev_ll->vdev->ready == DEVICE_RX)
1080                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1081                 &dev_ll->vdev->mac_address)) {
1082                         /*
1083                          * Drop the packet if the TX packet is
1084                          * destined for the TX device.
1085                          */
1086                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1087                                 LOG_DEBUG(VHOST_DATA,
1088                                 "(%"PRIu64") TX: Source and destination"
1089                                 " MAC addresses are the same. Dropping "
1090                                 "packet.\n",
1091                                 dev_ll->vdev->dev->device_fh);
1092                                 return -1;
1093                         }
1094
1095                         /*
1096                          * HW VLAN strip reduces the packet length by the
1097                          * length of the VLAN tag, so we need to restore the
1098                          * packet length by adding it back.
1099                          */
1100                         *offset = VLAN_HLEN;
1101                         *vlan_tag =
1102                         (uint16_t)
1103                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1104
1105                         LOG_DEBUG(VHOST_DATA,
1106                         "(%"PRIu64") TX: pkt to local VM device id:"
1107                         "(%"PRIu64") vlan tag: %d.\n",
1108                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1109                         *vlan_tag);
1110
1111                         break;
1112                 }
1113                 dev_ll = dev_ll->next;
1114         }
1115         return 0;
1116 }
1117
1118 /*
1119  * This function routes the TX packet to the correct interface. This may be a local device
1120  * or the physical port.
1121  */
1122 static inline void __attribute__((always_inline))
1123 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1124 {
1125         struct mbuf_table *tx_q;
1126         struct rte_mbuf **m_table;
1127         unsigned len, ret, offset = 0;
1128         const uint16_t lcore_id = rte_lcore_id();
1129         struct virtio_net *dev = vdev->dev;
1130         struct ether_hdr *nh;
1131
1132         /*check if destination is local VM*/
1133         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1134                 rte_pktmbuf_free(m);
1135                 return;
1136         }
1137
1138         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1139                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1140                         rte_pktmbuf_free(m);
1141                         return;
1142                 }
1143         }
1144
1145         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1146
1147         /*Add packet to the port tx queue*/
1148         tx_q = &lcore_tx_queue[lcore_id];
1149         len = tx_q->len;
1150
1151         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1152         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1153                 /* Guest has inserted the vlan tag. */
1154                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1155                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1156                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1157                         (vh->vlan_tci != vlan_tag_be))
1158                         vh->vlan_tci = vlan_tag_be;
1159         } else {
1160                 m->ol_flags = PKT_TX_VLAN_PKT;
1161
1162                 /*
1163                  * Find the right seg to adjust the data len when offset is
1164                  * bigger than tail room size.
1165                  */
1166                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1167                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1168                                 m->data_len += offset;
1169                         else {
1170                                 struct rte_mbuf *seg = m;
1171
1172                                 while ((seg->next != NULL) &&
1173                                         (offset > rte_pktmbuf_tailroom(seg)))
1174                                         seg = seg->next;
1175
1176                                 seg->data_len += offset;
1177                         }
1178                         m->pkt_len += offset;
1179                 }
1180
1181                 m->vlan_tci = vlan_tag;
1182         }
1183
1184         tx_q->m_table[len] = m;
1185         len++;
1186         if (enable_stats) {
1187                 dev_statistics[dev->device_fh].tx_total++;
1188                 dev_statistics[dev->device_fh].tx++;
1189         }
1190
1191         if (unlikely(len == MAX_PKT_BURST)) {
1192                 m_table = (struct rte_mbuf **)tx_q->m_table;
1193                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1194                 /* Free any buffers not handled by TX and update the port stats. */
1195                 if (unlikely(ret < len)) {
1196                         do {
1197                                 rte_pktmbuf_free(m_table[ret]);
1198                         } while (++ret < len);
1199                 }
1200
1201                 len = 0;
1202         }
1203
1204         tx_q->len = len;
1205         return;
1206 }
1207 /*
1208  * This function is called by each data core. It handles all RX/TX registered with the
1209  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1210  * with all devices in the main linked list.
1211  */
1212 static int
1213 switch_worker(void *arg)
1214 {
1215         struct rte_mempool *mbuf_pool = arg;
1216         struct virtio_net *dev = NULL;
1217         struct vhost_dev *vdev = NULL;
1218         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1219         struct virtio_net_data_ll *dev_ll;
1220         struct mbuf_table *tx_q;
1221         volatile struct lcore_ll_info *lcore_ll;
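        /* TX drain period in TSC cycles: cycles-per-microsecond (rounded up)
         * times BURST_TX_DRAIN_US. */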
1222         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1223         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1224         unsigned ret, i;
1225         const uint16_t lcore_id = rte_lcore_id();
1226         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1227         uint16_t rx_count = 0;
1228         uint16_t tx_count;
1229         uint32_t retry = 0;
1230
1231         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1232         lcore_ll = lcore_info[lcore_id].lcore_ll;
1233         prev_tsc = 0;
1234
1235         tx_q = &lcore_tx_queue[lcore_id];
1236         for (i = 0; i < num_cores; i ++) {
1237                 if (lcore_ids[i] == lcore_id) {
1238                         tx_q->txq_id = i;
1239                         break;
1240                 }
1241         }
1242
1243         while(1) {
1244                 cur_tsc = rte_rdtsc();
1245                 /*
1246                  * TX burst queue drain
1247                  */
1248                 diff_tsc = cur_tsc - prev_tsc;
1249                 if (unlikely(diff_tsc > drain_tsc)) {
1250
1251                         if (tx_q->len) {
1252                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1253
1254                                 /*Tx any packets in the queue*/
1255                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1256                                                 (struct rte_mbuf **)tx_q->m_table,
1257                                                 (uint16_t)tx_q->len);
1258                                 if (unlikely(ret < tx_q->len)) {
1259                                         do {
1260                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1261                                         } while (++ret < tx_q->len);
1262                                 }
1263
1264                                 tx_q->len = 0;
1265                         }
1266
1267                         prev_tsc = cur_tsc;
1268
1269                 }
1270
1271                 rte_prefetch0(lcore_ll->ll_root_used);
1272                 /*
1273                  * Inform the configuration core that we have exited the linked list and that no devices are
1274                  * in use if requested.
1275                  */
1276                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1277                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1278
1279                 /*
1280                  * Process devices
1281                  */
1282                 dev_ll = lcore_ll->ll_root_used;
1283
1284                 while (dev_ll != NULL) {
1285                         /*get virtio device ID*/
1286                         vdev = dev_ll->vdev;
1287                         dev = vdev->dev;
1288
1289                         if (unlikely(vdev->remove)) {
1290                                 dev_ll = dev_ll->next;
1291                                 unlink_vmdq(vdev);
1292                                 vdev->ready = DEVICE_SAFE_REMOVE;
1293                                 continue;
1294                         }
1295                         if (likely(vdev->ready == DEVICE_RX)) {
1296                                 /*Handle guest RX*/
1297                                 rx_count = rte_eth_rx_burst(ports[0],
1298                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1299
1300                                 if (rx_count) {
1301                                         /*
1302                                         * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1303                                         * Note that MAX_PKT_BURST must be less than the virtio queue size.
1304                                         */
1305                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1306                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1307                                                         rte_delay_us(burst_rx_delay_time);
1308                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1309                                                                 break;
1310                                                 }
1311                                         }
1312                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1313                                         if (enable_stats) {
1314                                                 rte_atomic64_add(
1315                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1316                                                 rx_count);
1317                                                 rte_atomic64_add(
1318                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1319                                         }
1320                                         while (likely(rx_count)) {
1321                                                 rx_count--;
1322                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1323                                         }
1324
1325                                 }
1326                         }
1327
1328                         if (likely(!vdev->remove)) {
1329                                 /* Handle guest TX*/
1330                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1331                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1332                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1333                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1334                                                 while (tx_count)
1335                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1336                                         }
1337                                 }
1338                                 while (tx_count)
1339                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1340                         }
1341
1342                         /*move to the next device in the list*/
1343                         dev_ll = dev_ll->next;
1344                 }
1345         }
1346
1347         return 0;
1348 }
1349
1350 /*
1351  * This function gets the number of available ring entries for zero copy RX.
1352  * Only one thread will call this function for a particular virtio device,
1353  * so it is designed as a non-thread-safe function.
1354  */
1355 static inline uint32_t __attribute__((always_inline))
1356 get_available_ring_num_zcp(struct virtio_net *dev)
1357 {
1358         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1359         uint16_t avail_idx;
1360
1361         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1362         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1363 }
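/*
 * Editorial note: avail->idx and last_used_idx_res are free-running
 * uint16_t counters, so the subtraction above remains correct across
 * 65535 -> 0 wrap-around thanks to unsigned modular arithmetic, e.g.
 * (uint16_t)(3 - 65533) == 6.
 */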
1364
1365 /*
1366  * This function gets the available ring index for zero copy RX;
1367  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1368  * Only one thread will call this function for a particular virtio device,
1369  * so it is designed as a non-thread-safe function.
1370  */
1371 static inline uint32_t __attribute__((always_inline))
1372 get_available_ring_index_zcp(struct virtio_net *dev,
1373         uint16_t *res_base_idx, uint32_t count)
1374 {
1375         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1376         uint16_t avail_idx;
1377         uint32_t retry = 0;
1378         uint16_t free_entries;
1379
1380         *res_base_idx = vq->last_used_idx_res;
1381         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1382         free_entries = (avail_idx - *res_base_idx);
1383
1384         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1385                         "avail idx: %d, "
1386                         "res base idx:%d, free entries:%d\n",
1387                         dev->device_fh, avail_idx, *res_base_idx,
1388                         free_entries);
1389
1390         /*
1391          * If retry is enabled and the queue is full then we wait
1392          * and retry to avoid packet loss.
1393          */
1394         if (enable_retry && unlikely(count > free_entries)) {
1395                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1396                         rte_delay_us(burst_rx_delay_time);
1397                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1398                         free_entries = (avail_idx - *res_base_idx);
1399                         if (count <= free_entries)
1400                                 break;
1401                 }
1402         }
1403
1404         /*check that we have enough buffers*/
1405         if (unlikely(count > free_entries))
1406                 count = free_entries;
1407
1408         if (unlikely(count == 0)) {
1409                 LOG_DEBUG(VHOST_DATA,
1410                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1411                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1412                         dev->device_fh, avail_idx,
1413                         *res_base_idx, free_entries);
1414                 return 0;
1415         }
1416
1417         vq->last_used_idx_res = *res_base_idx + count;
1418
1419         return count;
1420 }
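/*
 * Typical usage (editorial sketch), mirroring what attach_rxmbuf_zcp()
 * does below: reserve one entry, then translate the reserved ring slot
 * into a descriptor index.
 *
 *     uint16_t base_idx;
 *     if (get_available_ring_index_zcp(dev, &base_idx, 1) == 1)
 *             desc_idx = vq->avail->ring[base_idx & (vq->size - 1)];
 */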
1421
1422 /*
1423  * This function puts a descriptor back on the used list.
1424  */
1425 static inline void __attribute__((always_inline))
1426 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1427 {
1428         uint16_t res_cur_idx = vq->last_used_idx;
1429         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1430         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
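	/*
	 * Ensure the used-ring entry above is written before the used
	 * index below is published; the guest may read the new index at
	 * any time and must then see a valid entry.
	 */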
1431         rte_compiler_barrier();
1432         *(volatile uint16_t *)&vq->used->idx += 1;
1433         vq->last_used_idx += 1;
1434
1435         /* Kick the guest if necessary. */
1436         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1437                 eventfd_write((int)vq->kickfd, 1);
1438 }
1439
1440 /*
1441  * This function gets an available descriptor from the virtio vring and an
1442  * unattached mbuf from vpool->ring, and then attaches them together. It needs
1443  * to adjust the offset for buff_addr and phys_addr according to the PMD
1444  * implementation, otherwise the frame data may be put in the wrong location in the mbuf.
1445  */
1446 static inline void __attribute__((always_inline))
1447 attach_rxmbuf_zcp(struct virtio_net *dev)
1448 {
1449         uint16_t res_base_idx, desc_idx;
1450         uint64_t buff_addr, phys_addr;
1451         struct vhost_virtqueue *vq;
1452         struct vring_desc *desc;
1453         struct rte_mbuf *mbuf = NULL;
1454         struct vpool *vpool;
1455         hpa_type addr_type;
1456         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1457
1458         vpool = &vpool_array[vdev->vmdq_rx_q];
1459         vq = dev->virtqueue[VIRTIO_RXQ];
1460
1461         do {
1462                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1463                                 1) != 1))
1464                         return;
1465                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1466
1467                 desc = &vq->desc[desc_idx];
1468                 if (desc->flags & VRING_DESC_F_NEXT) {
1469                         desc = &vq->desc[desc->next];
1470                         buff_addr = gpa_to_vva(dev, desc->addr);
1471                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1472                                         &addr_type);
1473                 } else {
1474                         buff_addr = gpa_to_vva(dev,
1475                                         desc->addr + vq->vhost_hlen);
1476                         phys_addr = gpa_to_hpa(vdev,
1477                                         desc->addr + vq->vhost_hlen,
1478                                         desc->len, &addr_type);
1479                 }
1480
1481                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1482                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1483                                 " address found when attaching RX frame buffer"
1484                                 " address!\n", dev->device_fh);
1485                         put_desc_to_used_list_zcp(vq, desc_idx);
1486                         continue;
1487                 }
1488
1489                 /*
1490                  * Check if the frame buffer address from guest crosses
1491                  * sub-region or not.
1492                  */
1493                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1494                         RTE_LOG(ERR, VHOST_DATA,
1495                                 "(%"PRIu64") Frame buffer address cross "
1496                                 "sub-regioin found when attaching RX frame "
1497                                 "buffer address!\n",
1498                                 dev->device_fh);
1499                         put_desc_to_used_list_zcp(vq, desc_idx);
1500                         continue;
1501                 }
1502         } while (unlikely(phys_addr == 0));
1503
1504         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1505         if (unlikely(mbuf == NULL)) {
1506                 LOG_DEBUG(VHOST_DATA,
1507                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1508                         "ring_sc_dequeue fail.\n",
1509                         dev->device_fh);
1510                 put_desc_to_used_list_zcp(vq, desc_idx);
1511                 return;
1512         }
1513
1514         if (unlikely(vpool->buf_size > desc->len)) {
1515                 LOG_DEBUG(VHOST_DATA,
1516                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1517                         "length(%d) of descriptor idx: %d less than room "
1518                         "size required: %d\n",
1519                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1520                 put_desc_to_used_list_zcp(vq, desc_idx);
1521                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1522                 return;
1523         }
1524
1525         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1526         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1527         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1528         mbuf->data_len = desc->len;
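	/*
	 * Stash the guest descriptor index in the mbuf headroom so that
	 * virtio_dev_rx_zcp() can later recover which descriptor each
	 * received frame landed in when it updates the used ring.
	 */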
1529         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1530
1531         LOG_DEBUG(VHOST_DATA,
1532                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1533                 "descriptor idx:%d\n",
1534                 dev->device_fh, res_base_idx, desc_idx);
1535
1536         __rte_mbuf_raw_free(mbuf);
1537
1538         return;
1539 }
1540
1541 /*
1542  * Detach an attached packet mbuf -
1543  *  - restore original mbuf address and length values.
1544  *  - reset pktmbuf data and data_len to their default values.
1545  *  All other fields of the given packet mbuf will be left intact.
1546  *
1547  * @param m
1548  *   The attached packet mbuf.
1549  */
1550 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1551 {
1552         const struct rte_mempool *mp = m->pool;
1553         void *buf = RTE_MBUF_TO_BADDR(m);
1554         uint32_t buf_ofs;
1555         uint32_t buf_len = mp->elt_size - sizeof(*m);
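	/*
	 * The data buffer sits immediately after the mbuf header inside
	 * each mempool element, so both its length and its physical
	 * address can be recovered from the element layout alone.
	 */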
1556         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1557
1558         m->buf_addr = buf;
1559         m->buf_len = (uint16_t)buf_len;
1560
1561         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1562                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1563         m->data_off = buf_ofs;
1564
1565         m->data_len = 0;
1566 }
1567
1568 /*
1569  * This function is called after packets have been transmitted. It fetches mbufs
1570  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1571  * the used index and kicks the guest if necessary.
1572  */
1573 static inline uint32_t __attribute__((always_inline))
1574 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1575 {
1576         struct rte_mbuf *mbuf;
1577         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1578         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1579         uint32_t index = 0;
1580         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1581
1582         LOG_DEBUG(VHOST_DATA,
1583                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1584                 "clean is: %d\n",
1585                 dev->device_fh, mbuf_count);
1586         LOG_DEBUG(VHOST_DATA,
1587                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1588                 "clean  is : %d\n",
1589                 dev->device_fh, rte_ring_count(vpool->ring));
1590
1591         for (index = 0; index < mbuf_count; index++) {
1592                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1593                 if (likely(MBUF_EXT_MEM(mbuf)))
1594                         pktmbuf_detach_zcp(mbuf);
1595                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1596
1597                 /* Update used index buffer information. */
1598                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1599                 vq->used->ring[used_idx].len = 0;
1600
1601                 used_idx = (used_idx + 1) & (vq->size - 1);
1602         }
1603
1604         LOG_DEBUG(VHOST_DATA,
1605                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1606                 "clean is: %d\n",
1607                 dev->device_fh, rte_mempool_count(vpool->pool));
1608         LOG_DEBUG(VHOST_DATA,
1609                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1610                 "clean  is : %d\n",
1611                 dev->device_fh, rte_ring_count(vpool->ring));
1612         LOG_DEBUG(VHOST_DATA,
1613                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1614                 "vq->last_used_idx:%d\n",
1615                 dev->device_fh, vq->last_used_idx);
1616
1617         vq->last_used_idx += mbuf_count;
1618
1619         LOG_DEBUG(VHOST_DATA,
1620                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1621                 "vq->last_used_idx:%d\n",
1622                 dev->device_fh, vq->last_used_idx);
1623
1624         rte_compiler_barrier();
1625
1626         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1627
1628         /* Kick guest if required. */
1629         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1630                 eventfd_write((int)vq->kickfd, 1);
1631
1632         return 0;
1633 }
1634
1635 /*
1636  * This function is called when a virtio device is destroyed.
1637  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1638  */
1639 static void mbuf_destroy_zcp(struct vpool *vpool)
1640 {
1641         struct rte_mbuf *mbuf = NULL;
1642         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1643
1644         LOG_DEBUG(VHOST_CONFIG,
1645                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1646                 "mbuf_destroy_zcp is: %d\n",
1647                 mbuf_count);
1648         LOG_DEBUG(VHOST_CONFIG,
1649                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1650                 "mbuf_destroy_zcp  is : %d\n",
1651                 rte_ring_count(vpool->ring));
1652
1653         for (index = 0; index < mbuf_count; index++) {
1654                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1655                 if (likely(mbuf != NULL)) {
1656                         if (likely(MBUF_EXT_MEM(mbuf)))
1657                                 pktmbuf_detach_zcp(mbuf);
1658                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1659                 }
1660         }
1661
1662         LOG_DEBUG(VHOST_CONFIG,
1663                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1664                 "mbuf_destroy_zcp is: %d\n",
1665                 rte_mempool_count(vpool->pool));
1666         LOG_DEBUG(VHOST_CONFIG,
1667                 "in mbuf_destroy_zcp: mbuf count in ring after "
1668                 "mbuf_destroy_zcp is : %d\n",
1669                 rte_ring_count(vpool->ring));
1670 }
1671
1672 /*
1673  * This function enqueues received packets into the guest RX virtqueue and updates the used ring (zero copy).
1674  */
1675 static inline uint32_t __attribute__((always_inline))
1676 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1677         uint32_t count)
1678 {
1679         struct vhost_virtqueue *vq;
1680         struct vring_desc *desc;
1681         struct rte_mbuf *buff;
1682         /* The virtio_hdr is initialised to 0. */
1683         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1684                 = {{0, 0, 0, 0, 0, 0}, 0};
1685         uint64_t buff_hdr_addr = 0;
1686         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1687         uint32_t head_idx, packet_success = 0;
1688         uint16_t res_cur_idx;
1689
1690         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1691
1692         if (count == 0)
1693                 return 0;
1694
1695         vq = dev->virtqueue[VIRTIO_RXQ];
1696         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1697
1698         res_cur_idx = vq->last_used_idx;
1699         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1700                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1701
1702         /* Retrieve all of the head indexes first to avoid caching issues. */
1703         for (head_idx = 0; head_idx < count; head_idx++)
1704                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1705
1706         /*Prefetch descriptor index. */
1707         rte_prefetch0(&vq->desc[head[packet_success]]);
1708
1709         while (packet_success != count) {
1710                 /* Get descriptor from available ring */
1711                 desc = &vq->desc[head[packet_success]];
1712
1713                 buff = pkts[packet_success];
1714                 LOG_DEBUG(VHOST_DATA,
1715                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1716                         "pkt[%d] descriptor idx: %d\n",
1717                         dev->device_fh, packet_success,
1718                         MBUF_HEADROOM_UINT32(buff));
1719
1720                 PRINT_PACKET(dev,
1721                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1722                         + RTE_PKTMBUF_HEADROOM),
1723                         rte_pktmbuf_data_len(buff), 0);
1724
1725                 /* Buffer address translation for virtio header. */
1726                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1727                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1728
1729                 /*
1730                  * If the descriptors are chained the header and data are
1731                  * placed in separate buffers.
1732                  */
1733                 if (desc->flags & VRING_DESC_F_NEXT) {
1734                         desc->len = vq->vhost_hlen;
1735                         desc = &vq->desc[desc->next];
1736                         desc->len = rte_pktmbuf_data_len(buff);
1737                 } else {
1738                         desc->len = packet_len;
1739                 }
1740
1741                 /* Update used ring with desc information */
1742                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1743                         = head[packet_success];
1744                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1745                         = packet_len;
1746                 res_cur_idx++;
1747                 packet_success++;
1748
1749                 /* A header is required per buffer. */
1750                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1751                         (const void *)&virtio_hdr, vq->vhost_hlen);
1752
1753                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1754
1755                 if (likely(packet_success < count)) {
1756                         /* Prefetch descriptor index. */
1757                         rte_prefetch0(&vq->desc[head[packet_success]]);
1758                 }
1759         }
1760
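	/*
	 * Make sure all used-ring entries written in the loop above are
	 * visible before the used index is advanced below.
	 */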
1761         rte_compiler_barrier();
1762
1763         LOG_DEBUG(VHOST_DATA,
1764                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1765                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1766                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1767
1768         *(volatile uint16_t *)&vq->used->idx += count;
1769         vq->last_used_idx += count;
1770
1771         LOG_DEBUG(VHOST_DATA,
1772                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1773                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1774                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1775
1776         /* Kick the guest if necessary. */
1777         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1778                 eventfd_write((int)vq->kickfd, 1);
1779
1780         return count;
1781 }
1782
1783 /*
1784  * This function routes the TX packet to the correct interface.
1785  * This may be a local device or the physical port.
1786  */
1787 static inline void __attribute__((always_inline))
1788 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1789         uint32_t desc_idx, uint8_t need_copy)
1790 {
1791         struct mbuf_table *tx_q;
1792         struct rte_mbuf **m_table;
1793         struct rte_mbuf *mbuf = NULL;
1794         unsigned len, ret, offset = 0;
1795         struct vpool *vpool;
1796         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1797         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1798
1799         /*Add packet to the port tx queue*/
1800         tx_q = &tx_queue_zcp[vmdq_rx_q];
1801         len = tx_q->len;
1802
1803         /* Allocate an mbuf and populate the structure. */
1804         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1805         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1806         if (unlikely(mbuf == NULL)) {
1807                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1808                 RTE_LOG(ERR, VHOST_DATA,
1809                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1810                         dev->device_fh);
1811                 put_desc_to_used_list_zcp(vq, desc_idx);
1812                 return;
1813         }
1814
1815         if (vm2vm_mode == VM2VM_HARDWARE) {
1816                 /* Avoid using a vlan tag from any vm for an external pkt, such
1817                  * as vlan_tags[dev->device_fh]; otherwise it conflicts with pool
1818                  * selection: the MAC address identifies it as an external pkt
1819                  * that should go to the network, while the vlan tag identifies
1820                  * it as a vm2vm pkt that should be forwarded to another vm. The
1821                  * hardware cannot resolve such an ambiguous situation, so the pkt would be lost.
1822                  */
1823                 vlan_tag = external_pkt_default_vlan_tag;
1824                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1825                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1826                         __rte_mbuf_raw_free(mbuf);
1827                         return;
1828                 }
1829         }
1830
1831         mbuf->nb_segs = m->nb_segs;
1832         mbuf->next = m->next;
1833         mbuf->data_len = m->data_len + offset;
1834         mbuf->pkt_len = mbuf->data_len;
1835         if (unlikely(need_copy)) {
1836                 /* Copy the packet contents to the mbuf. */
1837                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1838                         rte_pktmbuf_mtod(m, void *),
1839                         m->data_len);
1840         } else {
1841                 mbuf->data_off = m->data_off;
1842                 mbuf->buf_physaddr = m->buf_physaddr;
1843                 mbuf->buf_addr = m->buf_addr;
1844         }
1845         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1846         mbuf->vlan_tci = vlan_tag;
1847         mbuf->l2_len = sizeof(struct ether_hdr);
1848         mbuf->l3_len = sizeof(struct ipv4_hdr);
1849         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1850
1851         tx_q->m_table[len] = mbuf;
1852         len++;
1853
1854         LOG_DEBUG(VHOST_DATA,
1855                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1856                 dev->device_fh,
1857                 mbuf->nb_segs,
1858                 (mbuf->next == NULL) ? "null" : "non-null");
1859
1860         if (enable_stats) {
1861                 dev_statistics[dev->device_fh].tx_total++;
1862                 dev_statistics[dev->device_fh].tx++;
1863         }
1864
1865         if (unlikely(len == MAX_PKT_BURST)) {
1866                 m_table = (struct rte_mbuf **)tx_q->m_table;
1867                 ret = rte_eth_tx_burst(ports[0],
1868                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1869
1870                 /*
1871                  * Free any buffers not handled by TX and update
1872                  * the port stats.
1873                  */
1874                 if (unlikely(ret < len)) {
1875                         do {
1876                                 rte_pktmbuf_free(m_table[ret]);
1877                         } while (++ret < len);
1878                 }
1879
1880                 len = 0;
1881                 txmbuf_clean_zcp(dev, vpool);
1882         }
1883
1884         tx_q->len = len;
1885
1886         return;
1887 }
1888
1889 /*
1890  * This function transmits all available packets in the virtio TX queue for
1891  * one virtio-net device. If it is the first packet, it learns the MAC
1892  * address and sets up the VMDQ queue.
1893  */
1894 static inline void __attribute__((always_inline))
1895 virtio_dev_tx_zcp(struct virtio_net *dev)
1896 {
1897         struct rte_mbuf m;
1898         struct vhost_virtqueue *vq;
1899         struct vring_desc *desc;
1900         uint64_t buff_addr = 0, phys_addr;
1901         uint32_t head[MAX_PKT_BURST];
1902         uint32_t i;
1903         uint16_t free_entries, packet_success = 0;
1904         uint16_t avail_idx;
1905         uint8_t need_copy = 0;
1906         hpa_type addr_type;
1907         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1908
1909         vq = dev->virtqueue[VIRTIO_TXQ];
1910         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1911
1912         /* If there are no available buffers then return. */
1913         if (vq->last_used_idx_res == avail_idx)
1914                 return;
1915
1916         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1917
1918         /* Prefetch available ring to retrieve head indexes. */
1919         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1920
1921         /* Get the number of free entries in the ring */
1922         free_entries = (avail_idx - vq->last_used_idx_res);
1923
1924         /* Limit to MAX_PKT_BURST. */
1925         free_entries
1926                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1927
1928         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1929                 dev->device_fh, free_entries);
1930
1931         /* Retrieve all of the head indexes first to avoid caching issues. */
1932         for (i = 0; i < free_entries; i++)
1933                 head[i]
1934                         = vq->avail->ring[(vq->last_used_idx_res + i)
1935                         & (vq->size - 1)];
1936
1937         vq->last_used_idx_res += free_entries;
1938
1939         /* Prefetch descriptor index. */
1940         rte_prefetch0(&vq->desc[head[packet_success]]);
1941         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1942
1943         while (packet_success < free_entries) {
1944                 desc = &vq->desc[head[packet_success]];
1945
1946                 /* Discard first buffer as it is the virtio header */
1947                 desc = &vq->desc[desc->next];
1948
1949                 /* Buffer address translation. */
1950                 buff_addr = gpa_to_vva(dev, desc->addr);
1951                 /* Need to check extra VLAN_HLEN size for inserting the VLAN tag */
1952                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1953                         &addr_type);
1954
1955                 if (likely(packet_success < (free_entries - 1)))
1956                         /* Prefetch descriptor index. */
1957                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1958
1959                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1960                         RTE_LOG(ERR, VHOST_DATA,
1961                                 "(%"PRIu64") Invalid frame buffer address found"
1962                                 "when TX packets!\n",
1963                                 dev->device_fh);
1964                         packet_success++;
1965                         continue;
1966                 }
1967
1968                 /* Prefetch buffer address. */
1969                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1970
1971                 /*
1972                  * Setup dummy mbuf. This is copied to a real mbuf if
1973                  * transmitted out the physical port.
1974                  */
1975                 m.data_len = desc->len;
1976                 m.nb_segs = 1;
1977                 m.next = NULL;
1978                 m.data_off = 0;
1979                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1980                 m.buf_physaddr = phys_addr;
1981
1982                 /*
1983                  * Check if the frame buffer address from guest crosses
1984                  * sub-region or not.
1985                  */
1986                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1987                         RTE_LOG(ERR, VHOST_DATA,
1988                                 "(%"PRIu64") Frame buffer address cross "
1989                                 "sub-regioin found when attaching TX frame "
1990                                 "buffer address!\n",
1991                                 dev->device_fh);
1992                         need_copy = 1;
1993                 } else
1994                         need_copy = 0;
1995
1996                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1997
1998                 /*
1999                  * If this is the first received packet we need to learn
2000                  * the MAC and setup VMDQ
2001                  */
2002                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2003                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2004                                 /*
2005                                  * Discard frame if device is scheduled for
2006                                  * removal or a duplicate MAC address is found.
2007                                  */
2008                                 packet_success += free_entries;
2009                                 vq->last_used_idx += packet_success;
2010                                 break;
2011                         }
2012                 }
2013
2014                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2015                 packet_success++;
2016         }
2017 }
2018
2019 /*
2020  * This function is called by each data core. It handles all RX/TX registered
2021  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2022  * addresses are compared with all devices in the main linked list.
2023  */
2024 static int
2025 switch_worker_zcp(__attribute__((unused)) void *arg)
2026 {
2027         struct virtio_net *dev = NULL;
2028         struct vhost_dev  *vdev = NULL;
2029         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2030         struct virtio_net_data_ll *dev_ll;
2031         struct mbuf_table *tx_q;
2032         volatile struct lcore_ll_info *lcore_ll;
2033         const uint64_t drain_tsc
2034                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2035                 * BURST_TX_DRAIN_US;
2036         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2037         unsigned ret;
2038         const uint16_t lcore_id = rte_lcore_id();
2039         uint16_t count_in_ring, rx_count = 0;
2040
2041         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2042
2043         lcore_ll = lcore_info[lcore_id].lcore_ll;
2044         prev_tsc = 0;
2045
2046         while (1) {
2047                 cur_tsc = rte_rdtsc();
2048
2049                 /* TX burst queue drain */
2050                 diff_tsc = cur_tsc - prev_tsc;
2051                 if (unlikely(diff_tsc > drain_tsc)) {
2052                         /*
2053                          * Get mbuf from vpool.pool and detach mbuf and
2054                          * put back into vpool.ring.
2055                          */
2056                         dev_ll = lcore_ll->ll_root_used;
2057                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2058                                 /* Get virtio device ID */
2059                                 vdev = dev_ll->vdev;
2060                                 dev = vdev->dev;
2061
2062                                 if (likely(!vdev->remove)) {
2063                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2064                                         if (tx_q->len) {
2065                                                 LOG_DEBUG(VHOST_DATA,
2066                                                 "TX queue drained after timeout"
2067                                                 " with burst size %u\n",
2068                                                 tx_q->len);
2069
2070                                                 /*
2071                                                  * Tx any packets in the queue
2072                                                  */
2073                                                 ret = rte_eth_tx_burst(
2074                                                         ports[0],
2075                                                         (uint16_t)tx_q->txq_id,
2076                                                         (struct rte_mbuf **)
2077                                                         tx_q->m_table,
2078                                                         (uint16_t)tx_q->len);
2079                                                 if (unlikely(ret < tx_q->len)) {
2080                                                         do {
2081                                                                 rte_pktmbuf_free(
2082                                                                         tx_q->m_table[ret]);
2083                                                         } while (++ret < tx_q->len);
2084                                                 }
2085                                                 tx_q->len = 0;
2086
2087                                                 txmbuf_clean_zcp(dev,
2088                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2089                                         }
2090                                 }
2091                                 dev_ll = dev_ll->next;
2092                         }
2093                         prev_tsc = cur_tsc;
2094                 }
2095
2096                 rte_prefetch0(lcore_ll->ll_root_used);
2097
2098                 /*
2099                  * Inform the configuration core that we have exited the linked
2100                  * list and that no devices are in use if requested.
2101                  */
2102                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2103                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2104
2105                 /* Process devices */
2106                 dev_ll = lcore_ll->ll_root_used;
2107
2108                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2109                         vdev = dev_ll->vdev;
2110                         dev  = vdev->dev;
2111                         if (unlikely(vdev->remove)) {
2112                                 dev_ll = dev_ll->next;
2113                                 unlink_vmdq(vdev);
2114                                 vdev->ready = DEVICE_SAFE_REMOVE;
2115                                 continue;
2116                         }
2117
2118                         if (likely(vdev->ready == DEVICE_RX)) {
2119                                 uint32_t index = vdev->vmdq_rx_q;
2120                                 uint16_t i;
2121                                 count_in_ring =
2122                                         rte_ring_count(vpool_array[index].ring);
2123                                 uint16_t free_entries = (uint16_t)
2124                                         get_available_ring_num_zcp(dev);
2125
2126                                 /*
2127                                  * Attach all mbufs in vpool.ring and put back
2128                                  * into vpool.pool.
2129                                  */
2130                                 for (i = 0;
2131                                         i < RTE_MIN(free_entries,
2132                                         RTE_MIN(count_in_ring, MAX_PKT_BURST));
2133                                         i++)
2134                                         attach_rxmbuf_zcp(dev);
2135
2136                                 /* Handle guest RX */
2137                                 rx_count = rte_eth_rx_burst(ports[0],
2138                                         vdev->vmdq_rx_q, pkts_burst,
2139                                         MAX_PKT_BURST);
2140
2141                                 if (rx_count) {
2142                                         ret_count = virtio_dev_rx_zcp(dev,
2143                                                         pkts_burst, rx_count);
2144                                         if (enable_stats) {
2145                                                 dev_statistics[dev->device_fh].rx_total
2146                                                         += rx_count;
2147                                                 dev_statistics[dev->device_fh].rx
2148                                                         += ret_count;
2149                                         }
2150                                         while (likely(rx_count)) {
2151                                                 rx_count--;
2152                                                 pktmbuf_detach_zcp(
2153                                                         pkts_burst[rx_count]);
2154                                                 rte_ring_sp_enqueue(
2155                                                         vpool_array[index].ring,
2156                                                         (void *)pkts_burst[rx_count]);
2157                                         }
2158                                 }
2159                         }
2160
2161                         if (likely(!vdev->remove))
2162                                 /* Handle guest TX */
2163                                 virtio_dev_tx_zcp(dev);
2164
2165                         /* Move to the next device in the list */
2166                         dev_ll = dev_ll->next;
2167                 }
2168         }
2169
2170         return 0;
2171 }
2172
2173
2174 /*
2175  * Add an entry to a used linked list. A free entry must first be found
2176  * in the free linked list using get_data_ll_free_entry();
2177  */
2178 static void
2179 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2180         struct virtio_net_data_ll *ll_dev)
2181 {
2182         struct virtio_net_data_ll *ll = *ll_root_addr;
2183
2184         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2185         ll_dev->next = NULL;
2186         rte_compiler_barrier();
2187
2188         /* If ll == NULL then this is the first device. */
2189         if (ll) {
2190                 /* Increment to the tail of the linked list. */
2191                 while (ll->next != NULL)
2192                         ll = ll->next;
2193
2194                 ll->next = ll_dev;
2195         } else {
2196                 *ll_root_addr = ll_dev;
2197         }
2198 }
2199
2200 /*
2201  * Remove an entry from a used linked list. The entry must then be added to
2202  * the free linked list using put_data_ll_free_entry().
2203  */
2204 static void
2205 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2206         struct virtio_net_data_ll *ll_dev,
2207         struct virtio_net_data_ll *ll_dev_last)
2208 {
2209         struct virtio_net_data_ll *ll = *ll_root_addr;
2210
2211         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2212                 return;
2213
2214         if (ll_dev == ll)
2215                 *ll_root_addr = ll_dev->next;
2216         else
2217                 if (likely(ll_dev_last != NULL))
2218                         ll_dev_last->next = ll_dev->next;
2219                 else
2220                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2221 }
2222
2223 /*
2224  * Find and return an entry from the free linked list.
2225  */
2226 static struct virtio_net_data_ll *
2227 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2228 {
2229         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2230         struct virtio_net_data_ll *ll_dev;
2231
2232         if (ll_free == NULL)
2233                 return NULL;
2234
2235         ll_dev = ll_free;
2236         *ll_root_addr = ll_free->next;
2237
2238         return ll_dev;
2239 }
2240
2241 /*
2242  * Place an entry back on to the free linked list.
2243  */
2244 static void
2245 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2246         struct virtio_net_data_ll *ll_dev)
2247 {
2248         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2249
2250         if (ll_dev == NULL)
2251                 return;
2252
2253         ll_dev->next = ll_free;
2254         *ll_root_addr = ll_dev;
2255 }
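/*
 * Editorial sketch of the intended flow: a device is moved onto a used
 * list by taking an entry from the corresponding free list, filling it
 * in, and appending it (the list roots shown are illustrative):
 *
 *     struct virtio_net_data_ll *entry =
 *             get_data_ll_free_entry(&ll_root_free);
 *     if (entry != NULL) {
 *             entry->vdev = vdev;
 *             add_data_ll_entry(&ll_root_used, entry);
 *     }
 */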
2256
2257 /*
2258  * Creates a linked list of a given size.
2259  */
2260 static struct virtio_net_data_ll *
2261 alloc_data_ll(uint32_t size)
2262 {
2263         struct virtio_net_data_ll *ll_new;
2264         uint32_t i;
2265
2266         /* Malloc and then chain the linked list. */
2267         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2268         if (ll_new == NULL) {
2269                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2270                 return NULL;
2271         }
2272
2273         for (i = 0; i < size - 1; i++) {
2274                 ll_new[i].vdev = NULL;
2275                 ll_new[i].next = &ll_new[i+1];
2276         }
2277         ll_new[i].next = NULL;
2278
2279         return (ll_new);
2280 }
2281
2282 /*
2283  * Create the main linked list along with each individual core's linked list. A used and a free list
2284  * are created to manage entries.
2285  */
2286 static int
2287 init_data_ll (void)
2288 {
2289         int lcore;
2290
2291         RTE_LCORE_FOREACH_SLAVE(lcore) {
2292                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2293                 if (lcore_info[lcore].lcore_ll == NULL) {
2294                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2295                         return -1;
2296                 }
2297
2298                 lcore_info[lcore].lcore_ll->device_num = 0;
2299                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2300                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
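                /*
                 * Size each core's free list to hold
                 * ceil(num_devices / num_switching_cores) entries so the
                 * devices can be spread evenly across the switching cores.
                 */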
2301                 if (num_devices % num_switching_cores)
2302                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2303                 else
2304                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2305         }
2306
2307         /* Allocate devices up to a maximum of MAX_DEVICES. */
2308         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2309
2310         return 0;
2311 }
2312
2313 /*
2314  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2315  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2316  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2317  */
2318 static void
2319 destroy_device (volatile struct virtio_net *dev)
2320 {
2321         struct virtio_net_data_ll *ll_lcore_dev_cur;
2322         struct virtio_net_data_ll *ll_main_dev_cur;
2323         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2324         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2325         struct vhost_dev *vdev;
2326         int lcore;
2327
2328         dev->flags &= ~VIRTIO_DEV_RUNNING;
2329
2330         vdev = (struct vhost_dev *)dev->priv;
2331         /*set the remove flag. */
2332         vdev->remove = 1;
2333         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2334                 rte_pause();
2335         }
2336
2337         /* Search for entry to be removed from lcore ll */
2338         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2339         while (ll_lcore_dev_cur != NULL) {
2340                 if (ll_lcore_dev_cur->vdev == vdev) {
2341                         break;
2342                 } else {
2343                         ll_lcore_dev_last = ll_lcore_dev_cur;
2344                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2345                 }
2346         }
2347
2348         if (ll_lcore_dev_cur == NULL) {
2349                 RTE_LOG(ERR, VHOST_CONFIG,
2350                         "(%"PRIu64") Failed to find the dev to be destroy.\n",
2351                         dev->device_fh);
2352                 return;
2353         }
2354
2355         /* Search for entry to be removed from main ll */
2356         ll_main_dev_cur = ll_root_used;
2357         ll_main_dev_last = NULL;
2358         while (ll_main_dev_cur != NULL) {
2359                 if (ll_main_dev_cur->vdev == vdev) {
2360                         break;
2361                 } else {
2362                         ll_main_dev_last = ll_main_dev_cur;
2363                         ll_main_dev_cur = ll_main_dev_cur->next;
2364                 }
2365         }
2366
2367         /* Remove entries from the lcore and main ll. */
2368         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2369         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2370
2371         /* Set the dev_removal_flag on each lcore. */
2372         RTE_LCORE_FOREACH_SLAVE(lcore) {
2373                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2374         }
2375
2376         /*
2377          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2378          * they can no longer access the device removed from the linked lists and that the devices
2379          * are no longer in use.
2380          */
2381         RTE_LCORE_FOREACH_SLAVE(lcore) {
2382                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2383                         rte_pause();
2384                 }
2385         }
2386
2387         /* Add the entries back to the lcore and main free ll.*/
2388         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2389         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2390
2391         /* Decrement number of device on the lcore. */
2392         lcore_info[vdev->coreid].lcore_ll->device_num--;
2393
2394         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2395
2396         if (zero_copy) {
2397                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2398
2399                 /* Stop the RX queue. */
2400                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2401                         LOG_DEBUG(VHOST_CONFIG,
2402                                 "(%"PRIu64") In destroy_device: Failed to stop "
2403                                 "rx queue:%d\n",
2404                                 dev->device_fh,
2405                                 vdev->vmdq_rx_q);
2406                 }
2407
2408                 LOG_DEBUG(VHOST_CONFIG,
2409                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2410                         "mempool back to ring for RX queue: %d\n",
2411                         dev->device_fh, vdev->vmdq_rx_q);
2412
2413                 mbuf_destroy_zcp(vpool);
2414
2415                 /* Stop the TX queue. */
2416                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2417                         LOG_DEBUG(VHOST_CONFIG,
2418                                 "(%"PRIu64") In destroy_device: Failed to "
2419                                 "stop tx queue:%d\n",
2420                                 dev->device_fh, vdev->vmdq_rx_q);
2421                 }
2422
2423                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2424
2425                 LOG_DEBUG(VHOST_CONFIG,
2426                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2427                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2428                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2429                         dev->device_fh);
2430
2431                 mbuf_destroy_zcp(vpool);
2432                 rte_free(vdev->regions_hpa);
2433         }
2434         rte_free(vdev);
2435
2436 }
2437
2438 /*
2439  * Calculate the number of physically contiguous sub-regions within one
2440  * particular region whose vhost virtual address range is contiguous. The
2441  * region starts at vva_start, with a size of 'size' given in the argument.
2442  */
2443 static uint32_t
2444 check_hpa_regions(uint64_t vva_start, uint64_t size)
2445 {
2446         uint32_t i, nregions = 0, page_size = getpagesize();
2447         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2448         if (vva_start % page_size) {
2449                 LOG_DEBUG(VHOST_CONFIG,
2450                         "in check_countinous: vva start(%p) mod page_size(%d) "
2451                         "has remainder\n",
2452                         (void *)(uintptr_t)vva_start, page_size);
2453                 return 0;
2454         }
2455         if (size % page_size) {
2456                 LOG_DEBUG(VHOST_CONFIG,
2457                         "in check_countinous: "
2458                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2459                         size, page_size);
2460                 return 0;
2461         }
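	/*
	 * Walk the region page by page, counting every point where the
	 * host physical address of one page is not immediately followed
	 * by that of the next page.
	 */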
2462         for (i = 0; i < size - page_size; i = i + page_size) {
2463                 cur_phys_addr
2464                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2465                 next_phys_addr = rte_mem_virt2phy(
2466                         (void *)(uintptr_t)(vva_start + i + page_size));
2467                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2468                         ++nregions;
2469                         LOG_DEBUG(VHOST_CONFIG,
2470                                 "in check_continuous: hva addr:(%p) is not "
2471                                 "continuous with hva addr:(%p), diff:%d\n",
2472                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2473                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2474                                 + page_size), page_size);
2475                         LOG_DEBUG(VHOST_CONFIG,
2476                                 "in check_continuous: hpa addr:(%p) is not "
2477                                 "continuous with hpa addr:(%p), "
2478                                 "diff:(%"PRIu64")\n",
2479                                 (void *)(uintptr_t)cur_phys_addr,
2480                                 (void *)(uintptr_t)next_phys_addr,
2481                                 (next_phys_addr-cur_phys_addr));
2482                 }
2483         }
2484         return nregions;
2485 }
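/*
 * Editorial note: the count returned above is typically used to size the
 * regions_hpa array before fill_hpa_memory_regions() below subdivides
 * guest memory; each discontinuity adds one sub-region on top of the one
 * region that always exists.
 */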
2486
2487 /*
2488  * Divide each region whose vhost virtual address is continous into a few
2489  * sub-regions, make sure the physical address within each sub-region are
2490  * continous. And fill offset(to GPA) and size etc. information of each
2491  * sub-region into regions_hpa.
2492  */
2493 static uint32_t
2494 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2495 {
2496         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2497         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2498
2499         if (mem_region_hpa == NULL)
2500                 return 0;
2501
2502         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2503                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2504                         virtio_memory->regions[regionidx].address_offset;
2505                 mem_region_hpa[regionidx_hpa].guest_phys_address
2506                         = virtio_memory->regions[regionidx].guest_phys_address;
2507                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2508                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2509                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2510                 LOG_DEBUG(VHOST_CONFIG,
2511                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2512                         regionidx_hpa,
2513                         (void *)(uintptr_t)
2514                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2515                 LOG_DEBUG(VHOST_CONFIG,
2516                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2517                         regionidx_hpa,
2518                         (void *)(uintptr_t)
2519                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2520                 for (i = 0, k = 0;
2521                         i < virtio_memory->regions[regionidx].memory_size -
2522                                 page_size;
2523                         i += page_size) {
2524                         cur_phys_addr = rte_mem_virt2phy(
2525                                         (void *)(uintptr_t)(vva_start + i));
2526                         next_phys_addr = rte_mem_virt2phy(
2527                                         (void *)(uintptr_t)(vva_start +
2528                                         i + page_size));
2529                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2530                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2531                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2532                                         k + page_size;
2533                                 mem_region_hpa[regionidx_hpa].memory_size
2534                                         = k + page_size;
2535                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2536                                         "phys addr end  [%d]:(%p)\n",
2537                                         regionidx_hpa,
2538                                         (void *)(uintptr_t)
2539                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2540                                 LOG_DEBUG(VHOST_CONFIG,
2541                                         "in fill_hpa_regions: guest phys addr "
2542                                         "size [%d]:(%p)\n",
2543                                         regionidx_hpa,
2544                                         (void *)(uintptr_t)
2545                                         (mem_region_hpa[regionidx_hpa].memory_size));
2546                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2547                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2548                                 ++regionidx_hpa;
2549                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2550                                         next_phys_addr -
2551                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2552                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2553                                         " phys addr start[%d]:(%p)\n",
2554                                         regionidx_hpa,
2555                                         (void *)(uintptr_t)
2556                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2557                                 LOG_DEBUG(VHOST_CONFIG,
2558                                         "in fill_hpa_regions: host  phys addr "
2559                                         "start[%d]:(%p)\n",
2560                                         regionidx_hpa,
2561                                         (void *)(uintptr_t)
2562                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2563                                 k = 0;
2564                         } else {
2565                                 k += page_size;
2566                         }
2567                 }
2568                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2569                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2570                         + k + page_size;
2571                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2572                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2573                         "[%d]:(%p)\n", regionidx_hpa,
2574                         (void *)(uintptr_t)
2575                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2576                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2577                         "[%d]:(%p)\n", regionidx_hpa,
2578                         (void *)(uintptr_t)
2579                         (mem_region_hpa[regionidx_hpa].memory_size));
2580                 ++regionidx_hpa;
2581         }
2582         return regionidx_hpa;
2583 }
2584
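/*
 * A minimal consumer sketch (hypothetical helper, not part of the
 * original example): once fill_hpa_memory_regions() has populated the
 * table, a guest physical address can be translated to a host physical
 * address by scanning the sub-regions and adding the per-sub-region
 * offset, since host_phys_addr_offset == HPA - GPA holds for every page
 * of a physically contiguous sub-region.
 */
static inline uint64_t
gpa_to_hpa_sketch(struct virtio_memory_regions_hpa *regions_hpa,
        uint32_t nregions_hpa, uint64_t guest_pa)
{
        uint32_t i;

        for (i = 0; i < nregions_hpa; i++) {
                if (guest_pa >= regions_hpa[i].guest_phys_address &&
                        guest_pa < regions_hpa[i].guest_phys_address_end)
                        return guest_pa +
                                regions_hpa[i].host_phys_addr_offset;
        }
        /* Address not covered by any sub-region. */
        return 0;
}
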
2585 /*
2586  * A new device is added to a data core. First the device is added to the
2587  * main linked list and is then allocated to a specific data core.
2588  */
2589 static int
2590 new_device(struct virtio_net *dev)
2591 {
2592         struct virtio_net_data_ll *ll_dev;
2593         int lcore, core_add = 0;
2594         uint32_t device_num_min = num_devices;
2595         struct vhost_dev *vdev;
2596         uint32_t regionidx;
2597
2598         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2599         if (vdev == NULL) {
2600                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2601                         dev->device_fh);
2602                 return -1;
2603         }
2604         vdev->dev = dev;
2605         dev->priv = vdev;
2606
2607         if (zero_copy) {
2608                 vdev->nregions_hpa = dev->mem->nregions;
2609                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2610                         vdev->nregions_hpa
2611                                 += check_hpa_regions(
2612                                         dev->mem->regions[regionidx].guest_phys_address
2613                                         + dev->mem->regions[regionidx].address_offset,
2614                                         dev->mem->regions[regionidx].memory_size);
2616                 }
2617
2618                 vdev->regions_hpa = rte_calloc("vhost hpa region",
2619                                                vdev->nregions_hpa,
2620                                                sizeof(struct virtio_memory_regions_hpa),
2621                                                RTE_CACHE_LINE_SIZE);
2622                 if (vdev->regions_hpa == NULL) {
2623                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2624                         rte_free(vdev);
2625                         return -1;
2626                 }
2627
2629                 if (fill_hpa_memory_regions(vdev->regions_hpa,
2630                         dev->mem) != vdev->nregions_hpa) {
2633                         RTE_LOG(ERR, VHOST_CONFIG,
2634                                 "hpa memory regions number mismatch: "
2635                                 "[%d]\n", vdev->nregions_hpa);
2636                         rte_free(vdev->regions_hpa);
2637                         rte_free(vdev);
2638                         return -1;
2639                 }
2640         }
2641
2643         /* Add device to main ll */
2644         ll_dev = get_data_ll_free_entry(&ll_root_free);
2645         if (ll_dev == NULL) {
2646                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2647                         "of %d devices per core has been reached\n",
2648                         dev->device_fh, num_devices);
2649                 if (vdev->regions_hpa)
2650                         rte_free(vdev->regions_hpa);
2651                 rte_free(vdev);
2652                 return -1;
2653         }
2654         ll_dev->vdev = vdev;
2655         add_data_ll_entry(&ll_root_used, ll_dev);
2656         vdev->vmdq_rx_q
2657                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2658
2659         if (zero_copy) {
2660                 uint32_t index = vdev->vmdq_rx_q;
2661                 uint32_t count_in_ring, i;
2662                 struct mbuf_table *tx_q;
2663
2664                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2665
2666                 LOG_DEBUG(VHOST_CONFIG,
2667                         "(%"PRIu64") in new_device: mbuf count in mempool "
2668                         "before attach is: %d\n",
2669                         dev->device_fh,
2670                         rte_mempool_count(vpool_array[index].pool));
2671                 LOG_DEBUG(VHOST_CONFIG,
2672                         "(%"PRIu64") in new_device: mbuf count in ring "
2673                         "before attach is: %d\n",
2674                         dev->device_fh, count_in_ring);
2675
2676                 /*
2677                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2678                  */
2679                 for (i = 0; i < count_in_ring; i++)
2680                         attach_rxmbuf_zcp(dev);
2681
2682                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2683                         "mempool after attach is: %d\n",
2684                         dev->device_fh,
2685                         rte_mempool_count(vpool_array[index].pool));
2686                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2687                         "ring after attach is: %d\n",
2688                         dev->device_fh,
2689                         rte_ring_count(vpool_array[index].ring));
2690
2691                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2692                 tx_q->txq_id = vdev->vmdq_rx_q;
2693
2694                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2695                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2696
2697                         LOG_DEBUG(VHOST_CONFIG,
2698                                 "(%"PRIu64") In new_device: Failed to start "
2699                                 "tx queue:%d\n",
2700                                 dev->device_fh, vdev->vmdq_rx_q);
2701
2702                         mbuf_destroy_zcp(vpool);
2703                         rte_free(vdev->regions_hpa);
2704                         rte_free(vdev);
2705                         return -1;
2706                 }
2707
2708                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2709                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2710
2711                         LOG_DEBUG(VHOST_CONFIG,
2712                                 "(%"PRIu64") In new_device: Failed to start "
2713                                 "rx queue:%d\n",
2714                                 dev->device_fh, vdev->vmdq_rx_q);
2715
2716                         /* Stop the TX queue. */
2717                         if (rte_eth_dev_tx_queue_stop(ports[0],
2718                                 vdev->vmdq_rx_q) != 0) {
2719                                 LOG_DEBUG(VHOST_CONFIG,
2720                                         "(%"PRIu64") In new_device: Failed to "
2721                                         "stop tx queue:%d\n",
2722                                         dev->device_fh, vdev->vmdq_rx_q);
2723                         }
2724
2725                         mbuf_destroy_zcp(vpool);
2726                         rte_free(vdev->regions_hpa);
2727                         rte_free(vdev);
2728                         return -1;
2729                 }
2730
2731         }
2732
2733         /*reset ready flag*/
2734         vdev->ready = DEVICE_MAC_LEARNING;
2735         vdev->remove = 0;
2736
2737         /* Find a suitable lcore to add the device. */
2738         RTE_LCORE_FOREACH_SLAVE(lcore) {
2739                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2740                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2741                         core_add = lcore;
2742                 }
2743         }
2744         /* Add device to lcore ll */
2745         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2746         if (ll_dev == NULL) {
2747                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2748                 vdev->ready = DEVICE_SAFE_REMOVE;
2749                 destroy_device(dev);
2750                 if (vdev->regions_hpa)
2751                         rte_free(vdev->regions_hpa);
2752                 rte_free(vdev);
2753                 return -1;
2754         }
2755         ll_dev->vdev = vdev;
2756         vdev->coreid = core_add;
2757
2758         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2759
2760         /* Initialize device stats */
2761         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2762
2763         /* Disable notifications. */
2764         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2765         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2766         lcore_info[vdev->coreid].lcore_ll->device_num++;
2767         dev->flags |= VIRTIO_DEV_RUNNING;
2768
2769         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2770
2771         return 0;
2772 }
2773
2774 /*
2775  * These callbacks allow devices to be added to the data core when
2776  * configuration has fully completed.
2777  */
2778 static const struct virtio_net_device_ops virtio_net_device_ops =
2779 {
2780         .new_device =  new_device,
2781         .destroy_device = destroy_device,
2782 };
2783
2784 /*
2785  * This thread wakes up periodically to print stats if the user has
2786  * enabled them.
2787  */
2788 static void
2789 print_stats(void)
2790 {
2791         struct virtio_net_data_ll *dev_ll;
2792         uint64_t tx_dropped, rx_dropped;
2793         uint64_t tx, tx_total, rx, rx_total;
2794         uint32_t device_fh;
2795         const char clr[] = { 27, '[', '2', 'J', '\0' };
2796         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2797
2798         while (1) {
2799                 sleep(enable_stats);
2800
2801                 /* Clear screen and move to top left */
2802                 printf("%s%s", clr, top_left);
2803
2804                 printf("\nDevice statistics ====================================");
2805
2806                 dev_ll = ll_root_used;
2807                 while (dev_ll != NULL) {
2808                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
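                        /*
                         * Drop counts are derived: the *_total counters
                         * track every packet seen for the device, while
                         * tx/rx track only those successfully delivered.
                         */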
2809                         tx_total = dev_statistics[device_fh].tx_total;
2810                         tx = dev_statistics[device_fh].tx;
2811                         tx_dropped = tx_total - tx;
2812                         if (zero_copy == 0) {
2813                                 rx_total = rte_atomic64_read(
2814                                         &dev_statistics[device_fh].rx_total_atomic);
2815                                 rx = rte_atomic64_read(
2816                                         &dev_statistics[device_fh].rx_atomic);
2817                         } else {
2818                                 rx_total = dev_statistics[device_fh].rx_total;
2819                                 rx = dev_statistics[device_fh].rx;
2820                         }
2821                         rx_dropped = rx_total - rx;
2822
2823                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2824                                         "\nTX total:            %"PRIu64""
2825                                         "\nTX dropped:          %"PRIu64""
2826                                         "\nTX successful:               %"PRIu64""
2827                                         "\nRX total:            %"PRIu64""
2828                                         "\nRX dropped:          %"PRIu64""
2829                                         "\nRX successful:               %"PRIu64"",
2830                                         device_fh,
2831                                         tx_total,
2832                                         tx_dropped,
2833                                         tx,
2834                                         rx_total,
2835                                         rx_dropped,
2836                                         rx);
2837
2838                         dev_ll = dev_ll->next;
2839                 }
2840                 printf("\n======================================================\n");
2841         }
2842 }
2843
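/*
 * Create the mempool and the companion ring used by one zero-copy queue.
 * The ring is sized up to the next power of two above nb_mbuf so it can
 * stage every mbuf in the pool, and buf_size excludes the headroom
 * reserved in front of the frame data.
 */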
2844 static void
2845 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2846         char *ring_name, uint32_t nb_mbuf)
2847 {
2848         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2849         vpool_array[index].pool
2850                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2851                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2852                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2853                 rte_pktmbuf_init, NULL, socket, 0);
2854         if (vpool_array[index].pool != NULL) {
2855                 vpool_array[index].ring
2856                         = rte_ring_create(ring_name,
2857                                 rte_align32pow2(nb_mbuf + 1),
2858                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2859                 if (likely(vpool_array[index].ring != NULL)) {
2860                         LOG_DEBUG(VHOST_CONFIG,
2861                                 "in setup_mempool_tbl: mbuf count in "
2862                                 "mempool is: %d\n",
2863                                 rte_mempool_count(vpool_array[index].pool));
2864                         LOG_DEBUG(VHOST_CONFIG,
2865                                 "in setup_mempool_tbl: mbuf count in "
2866                                 "ring   is: %d\n",
2867                                 rte_ring_count(vpool_array[index].ring));
2868                 } else {
2869                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2870                                 ring_name);
2871                 }
2872
2873                 /* Need to account for the mbuf headroom. */
2874                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2875         } else {
2876                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2877         }
2878 }
2879
2880
2881 /*
2882  * Main function: performs initialisation and launches the per-lcore
2883  * functions. The CUSE device is also registered here to handle the IOCTLs.
2884  */
2885 int
2886 main(int argc, char *argv[])
2887 {
2888         struct rte_mempool *mbuf_pool = NULL;
2889         unsigned lcore_id, core_id = 0;
2890         unsigned nb_ports, valid_num_ports;
2891         int ret;
2892         uint8_t portid;
2893         uint16_t queue_id;
2894         static pthread_t tid;
2895
2896         /* init EAL */
2897         ret = rte_eal_init(argc, argv);
2898         if (ret < 0)
2899                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2900         argc -= ret;
2901         argv += ret;
2902
2903         /* parse app arguments */
2904         ret = us_vhost_parse_args(argc, argv);
2905         if (ret < 0)
2906                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2907
2908         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2909                 if (rte_lcore_is_enabled(lcore_id))
2910                         lcore_ids[core_id++] = lcore_id;
2911
2912         if (rte_lcore_count() > RTE_MAX_LCORE)
2913                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2914
2915         /* Set the number of switching cores available. */
2916         num_switching_cores = rte_lcore_count()-1;
2917
2918         /* Get the number of physical ports. */
2919         nb_ports = rte_eth_dev_count();
2920         if (nb_ports > RTE_MAX_ETHPORTS)
2921                 nb_ports = RTE_MAX_ETHPORTS;
2922
2923         /*
2924          * Update the global variable num_ports and the global array ports[],
2925          * and derive valid_num_ports from the number of ports in the system.
2926          */
2927         valid_num_ports = check_ports_num(nb_ports);
2928
2929         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2930                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2931                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2932                 return -1;
2933         }
2934
2935         if (zero_copy == 0) {
2936                 /* Create the mbuf pool. */
2937                 mbuf_pool = rte_mempool_create(
2938                                 "MBUF_POOL",
2939                                 NUM_MBUFS_PER_PORT
2940                                 * valid_num_ports,
2941                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2942                                 sizeof(struct rte_pktmbuf_pool_private),
2943                                 rte_pktmbuf_pool_init, NULL,
2944                                 rte_pktmbuf_init, NULL,
2945                                 rte_socket_id(), 0);
2946                 if (mbuf_pool == NULL)
2947                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2948
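                /*
                 * In non-zero-copy mode every queue shares the single
                 * mbuf pool; the per-queue rings are not used.
                 */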
2949                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2950                         vpool_array[queue_id].pool = mbuf_pool;
2951
2952                 if (vm2vm_mode == VM2VM_HARDWARE) {
2953                         /* Enable VT loop back to let L2 switch to do it. */
2954                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2955                         LOG_DEBUG(VHOST_CONFIG,
2956                                 "Enable loop back for L2 switch in vmdq.\n");
2957                 }
2958         } else {
2959                 uint32_t nb_mbuf;
2960                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2961                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2962
2963                 nb_mbuf = num_rx_descriptor
2964                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2965                         + num_switching_cores * MAX_PKT_BURST;
2966
2967                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2968                         snprintf(pool_name, sizeof(pool_name),
2969                                 "rxmbuf_pool_%u", queue_id);
2970                         snprintf(ring_name, sizeof(ring_name),
2971                                 "rxmbuf_ring_%u", queue_id);
2972                         setup_mempool_tbl(rte_socket_id(), queue_id,
2973                                 pool_name, ring_name, nb_mbuf);
2974                 }
2975
2976                 nb_mbuf = num_tx_descriptor
2977                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2978                                 + num_switching_cores * MAX_PKT_BURST;
2979
2980                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2981                         snprintf(pool_name, sizeof(pool_name),
2982                                 "txmbuf_pool_%u", queue_id);
2983                         snprintf(ring_name, sizeof(ring_name),
2984                                 "txmbuf_ring_%u", queue_id);
2985                         setup_mempool_tbl(rte_socket_id(),
2986                                 (queue_id + MAX_QUEUES),
2987                                 pool_name, ring_name, nb_mbuf);
2988                 }
2989
2990                 if (vm2vm_mode == VM2VM_HARDWARE) {
2991                         /* Enable VT loop back to let L2 switch to do it. */
2992                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2993                         LOG_DEBUG(VHOST_CONFIG,
2994                                 "Enable loop back for L2 switch in vmdq.\n");
2995                 }
2996         }
2997         /* Set log level. */
2998         rte_set_log_level(LOG_LEVEL);
2999
3000         /* initialize all ports */
3001         for (portid = 0; portid < nb_ports; portid++) {
3002                 /* skip ports that are not enabled */
3003                 if ((enabled_port_mask & (1 << portid)) == 0) {
3004                         RTE_LOG(INFO, VHOST_PORT,
3005                                 "Skipping disabled port %d\n", portid);
3006                         continue;
3007                 }
3008                 if (port_init(portid) != 0)
3009                         rte_exit(EXIT_FAILURE,
3010                                 "Cannot initialize network ports\n");
3011         }
3012
3013         /* Initialise all linked lists. */
3014         if (init_data_ll() == -1)
3015                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3016
3017         /* Initialize device stats */
3018         memset(&dev_statistics, 0, sizeof(dev_statistics));
3019
3020         /* Enable stats if the user option is set. */
3021         if (enable_stats)
3022                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3023
3024         /* Launch all data cores. */
3025         if (zero_copy == 0) {
3026                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3027                         rte_eal_remote_launch(switch_worker,
3028                                 mbuf_pool, lcore_id);
3029                 }
3030         } else {
3031                 uint32_t count_in_mempool, index, i;
3032                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3033                         /* For all RX and TX queues. */
3034                         count_in_mempool
3035                                 = rte_mempool_count(vpool_array[index].pool);
3036
3037                         /*
3038                          * Transfer all unattached mbufs from vpool.pool
3039                          * to vpool.ring.
3040                          */
3041                         for (i = 0; i < count_in_mempool; i++) {
3042                                 struct rte_mbuf *mbuf
3043                                         = __rte_mbuf_raw_alloc(
3044                                                 vpool_array[index].pool);
3045                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3046                                                 (void *)mbuf);
3047                         }
3048
3049                         LOG_DEBUG(VHOST_CONFIG,
3050                                 "in main: mbuf count in mempool at initial "
3051                                 "is: %d\n", count_in_mempool);
3052                         LOG_DEBUG(VHOST_CONFIG,
3053                                 "in main: mbuf count in ring at initial "
3054                                 "is: %d\n",
3055                                 rte_ring_count(vpool_array[index].ring));
3056                 }
3057
3058                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3059                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3060                                 lcore_id);
3061         }
3062
3063         if (mergeable == 0)
3064                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3065
3066         /* Register CUSE device to handle IOCTLs. */
3067         ret = rte_vhost_driver_register((char *)&dev_basename);
3068         if (ret != 0)
3069                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3070
3071         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3072
3073         /* Start CUSE session. */
3074         rte_vhost_driver_session_start();
3075         return 0;
3077 }
3078