examples/vhost: fix hard forward of jumbo frames
examples/vhost/main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 512
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) + \
65                             (num_switching_cores * MAX_PKT_BURST) + \
66                             (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
67                             (num_switching_cores * MBUF_CACHE_SIZE))
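/*
 * The pool must cover: descriptors posted to every RX queue, packets in
 * flight in per-core bursts, packets sitting in TX descriptor rings, and
 * mbufs cached in each switching core's mempool cache.
 */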
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71
72 /*
73  * No frame data buffers allocated from the host are required for the
74  * zero copy implementation; the guest allocates the frame data buffers
75  * and vhost uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
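/* 1518 bytes is the standard maximum Ethernet frame: 1500 payload + 14 header + 4 FCS. */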
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79         + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81
82 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
84
85 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
87
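/* Maximum RX packet length when mergeable buffers turn on jumbo frames: 0x2600 = 9728 bytes. */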
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX                       1
93 #define DEVICE_SAFE_REMOVE      2
94
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102
103 /*
104  * These two macros need refining for the legacy and DPDK-based front ends:
105  * take the max number of available vring descriptors/entries from the
106  * guest, subtract MAX_PKT_BURST, then round to a power of 2.
107  */
108 /*
109  * For the legacy front end there are 128 descriptors:
110  * half for virtio headers, the other half for mbufs.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117                 + sizeof(struct rte_mbuf)))
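/* The zero-copy path stashes per-mbuf bookkeeping (e.g. a descriptor index) in this slot. */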
118
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
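/* (x)-1 flips the lowest set bit and everything below it, so the AND is zero
 * only when at most one bit is set; note that 0 also passes this test. */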
121
122 #define INVALID_PORT_ID 0xFF
123
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
138
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
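/* With 64-byte cache lines and a 16-byte struct vring_desc this evaluates to 4. */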
141
142 /* mask of enabled ports */
143 static uint32_t enabled_port_mask = 0;
144
145 /* Promiscuous mode */
146 static uint32_t promiscuous;
147
148 /* Number of switching cores enabled */
149 static uint32_t num_switching_cores = 0;
150
151 /* Number of devices/queues to support */
152 static uint32_t num_queues = 0;
153 static uint32_t num_devices;
154
155 /*
156  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
157  * descriptors. Disabled by default.
158  */
159 static uint32_t zero_copy;
160 static int mergeable;
161
162 /* Number of descriptors to use */
163 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
164 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
165
166 /* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
167 #define MAX_RING_DESC 4096
168
169 struct vpool {
170         struct rte_mempool *pool;
171         struct rte_ring *ring;
172         uint32_t buf_size;
173 } vpool_array[MAX_QUEUES+MAX_QUEUES];
174
175 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
176 typedef enum {
177         VM2VM_DISABLED = 0,
178         VM2VM_SOFTWARE = 1,
179         VM2VM_HARDWARE = 2,
180         VM2VM_LAST
181 } vm2vm_type;
182 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
183
184 /* The type of host physical address translated from guest physical address. */
185 typedef enum {
186         PHYS_ADDR_CONTINUOUS = 0,
187         PHYS_ADDR_CROSS_SUBREG = 1,
188         PHYS_ADDR_INVALID = 2,
189         PHYS_ADDR_LAST
190 } hpa_type;
191
192 /* Enable stats. */
193 static uint32_t enable_stats = 0;
194 /* Enable retries on RX. */
195 static uint32_t enable_retry = 1;
196 /* Specify timeout (in microseconds) between retries on RX. */
197 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
198 /* Specify the number of retries on RX. */
199 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
200
201 /* Character device basename. Can be set by user. */
202 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
203
204 /* Empty VMDQ configuration structure. Filled in programmatically. */
205 static struct rte_eth_conf vmdq_conf_default = {
206         .rxmode = {
207                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
208                 .split_hdr_size = 0,
209                 .header_split   = 0, /**< Header Split disabled */
210                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
211                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
212                 /*
213                  * This is necessary for 1G NICs such as the I350; it fixes
214                  * a bug where IPv4 forwarding in the guest could not
215                  * forward packets from one virtio device to another.
216                  */
217                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
218                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
219                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
220         },
221
222         .txmode = {
223                 .mq_mode = ETH_MQ_TX_NONE,
224         },
225         .rx_adv_conf = {
226                 /*
227                  * should be overridden separately in code with
228                  * appropriate values
229                  */
230                 .vmdq_rx_conf = {
231                         .nb_queue_pools = ETH_8_POOLS,
232                         .enable_default_pool = 0,
233                         .default_pool = 0,
234                         .nb_pool_maps = 0,
235                         .pool_map = {{0, 0},},
236                 },
237         },
238 };
239
240 static unsigned lcore_ids[RTE_MAX_LCORE];
241 static uint8_t ports[RTE_MAX_ETHPORTS];
242 static unsigned num_ports = 0; /**< The number of ports specified in command line */
243 static uint16_t num_pf_queues, num_vmdq_queues;
244 static uint16_t vmdq_pool_base, vmdq_queue_base;
245 static uint16_t queues_per_pool;
246
247 static const uint16_t external_pkt_default_vlan_tag = 2000;
248 const uint16_t vlan_tags[] = {
249         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
250         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
251         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
252         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
253         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
254         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
255         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
256         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
257 };
258
259 /* ethernet addresses of ports */
260 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
261
262 /* heads for the main used and free linked lists for the data path. */
263 static struct virtio_net_data_ll *ll_root_used = NULL;
264 static struct virtio_net_data_ll *ll_root_free = NULL;
265
266 /* Array of data core structures containing information on individual core linked lists. */
267 static struct lcore_info lcore_info[RTE_MAX_LCORE];
268
269 /* Used for queueing bursts of TX packets. */
270 struct mbuf_table {
271         unsigned len;
272         unsigned txq_id;
273         struct rte_mbuf *m_table[MAX_PKT_BURST];
274 };
275
276 /* TX queue for each data core. */
277 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
278
279 /* TX queue for each virtio device for zero copy. */
280 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
281
282 /* Vlan header struct used to insert vlan tags on TX. */
283 struct vlan_ethhdr {
284         unsigned char   h_dest[ETH_ALEN];
285         unsigned char   h_source[ETH_ALEN];
286         __be16          h_vlan_proto;
287         __be16          h_vlan_TCI;
288         __be16          h_vlan_encapsulated_proto;
289 };
290
291 /* IPv4 Header */
292 struct ipv4_hdr {
293         uint8_t  version_ihl;           /**< version and header length */
294         uint8_t  type_of_service;       /**< type of service */
295         uint16_t total_length;          /**< length of packet */
296         uint16_t packet_id;             /**< packet ID */
297         uint16_t fragment_offset;       /**< fragmentation offset */
298         uint8_t  time_to_live;          /**< time to live */
299         uint8_t  next_proto_id;         /**< protocol ID */
300         uint16_t hdr_checksum;          /**< header checksum */
301         uint32_t src_addr;              /**< source address */
302         uint32_t dst_addr;              /**< destination address */
303 } __attribute__((__packed__));
304
305 /* Header lengths. */
306 #define VLAN_HLEN       4
307 #define VLAN_ETH_HLEN   18
308
309 /* Per-device statistics struct */
310 struct device_statistics {
311         uint64_t tx_total;
312         rte_atomic64_t rx_total_atomic;
313         uint64_t rx_total;
314         uint64_t tx;
315         rte_atomic64_t rx_atomic;
316         uint64_t rx;
317 } __rte_cache_aligned;
318 struct device_statistics dev_statistics[MAX_DEVICES];
319
320 /*
321  * Builds up the correct configuration for VMDQ VLAN pool map
322  * according to the pool & queue limits.
323  */
324 static inline int
325 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
326 {
327         struct rte_eth_vmdq_rx_conf conf;
328         struct rte_eth_vmdq_rx_conf *def_conf =
329                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
330         unsigned i;
331
332         memset(&conf, 0, sizeof(conf));
333         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
334         conf.nb_pool_maps = num_devices;
335         conf.enable_loop_back = def_conf->enable_loop_back;
336         conf.rx_mode = def_conf->rx_mode;
337
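        /*
         * Map one VLAN tag to each pool: with 8 devices, VLAN 1000 maps to
         * pool 0, 1001 to pool 1, ..., 1007 to pool 7.
         */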
338         for (i = 0; i < conf.nb_pool_maps; i++) {
339                 conf.pool_map[i].vlan_id = vlan_tags[i];
340                 conf.pool_map[i].pools = (1UL << i);
341         }
342
343         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
344         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
345                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
346         return 0;
347 }
348
349 /*
350  * Validate the device number against the max pool number obtained from
351  * dev_info. If the device number is invalid, print an error message and
352  * return -1. Each device must have its own pool.
353  */
354 static inline int
355 validate_num_devices(uint32_t max_nb_devices)
356 {
357         if (num_devices > max_nb_devices) {
358                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
359                 return -1;
360         }
361         return 0;
362 }
363
364 /*
365  * Initialises a given port using global settings, with the RX buffers
366  * coming from the mbuf_pool passed as a parameter.
367  */
368 static inline int
369 port_init(uint8_t port)
370 {
371         struct rte_eth_dev_info dev_info;
372         struct rte_eth_conf port_conf;
373         struct rte_eth_rxconf *rxconf;
374         struct rte_eth_txconf *txconf;
375         int16_t rx_rings, tx_rings;
376         uint16_t rx_ring_size, tx_ring_size;
377         int retval;
378         uint16_t q;
379
380         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
381         rte_eth_dev_info_get(port, &dev_info);
382
383         if (dev_info.max_rx_queues > MAX_QUEUES) {
384                 rte_exit(EXIT_FAILURE,
385                         "please define MAX_QUEUES to be no less than %u in %s\n",
386                         dev_info.max_rx_queues, __FILE__);
387         }
388
389         rxconf = &dev_info.default_rxconf;
390         txconf = &dev_info.default_txconf;
391         rxconf->rx_drop_en = 1;
392
393         /*
394          * Zero copy defers queue RX/TX start to the time when guest
395          * finishes its startup and packet buffers from that guest are
396          * available.
397          */
398         if (zero_copy) {
399                 rxconf->rx_deferred_start = 1;
400                 rxconf->rx_drop_en = 0;
401                 txconf->tx_deferred_start = 1;
402         }
403
404         /* Configure the number of supported virtio devices based on VMDQ limits. */
405         num_devices = dev_info.max_vmdq_pools;
406
407         if (zero_copy) {
408                 rx_ring_size = num_rx_descriptor;
409                 tx_ring_size = num_tx_descriptor;
410                 tx_rings = dev_info.max_tx_queues;
411         } else {
412                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
413                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
414                 tx_rings = (uint16_t)rte_lcore_count();
415         }
416
417         retval = validate_num_devices(MAX_DEVICES);
418         if (retval < 0)
419                 return retval;
420
421         /* Get port configuration. */
422         retval = get_eth_conf(&port_conf, num_devices);
423         if (retval < 0)
424                 return retval;
425         /* NIC queues are divided into pf queues and vmdq queues.  */
426         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
427         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
428         num_vmdq_queues = num_devices * queues_per_pool;
429         num_queues = num_pf_queues + num_vmdq_queues;
430         vmdq_queue_base = dev_info.vmdq_queue_base;
431         vmdq_pool_base  = dev_info.vmdq_pool_base;
432         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
433                 num_pf_queues, num_devices, queues_per_pool);
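        /*
         * Illustrative split: a NIC reporting 128 RX queues, 64 VMDQ pools
         * and 128 VMDQ queues yields 2 queues per pool and 0 PF queues.
         */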
434
435         if (port >= rte_eth_dev_count()) return -1;
436
437         rx_rings = (uint16_t)dev_info.max_rx_queues;
438         /* Configure ethernet device. */
439         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
440         if (retval != 0)
441                 return retval;
442
443         /* Setup the queues. */
444         for (q = 0; q < rx_rings; q++) {
445                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
446                                                 rte_eth_dev_socket_id(port),
447                                                 rxconf,
448                                                 vpool_array[q].pool);
449                 if (retval < 0)
450                         return retval;
451         }
452         for (q = 0; q < tx_rings; q++) {
453                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
454                                                 rte_eth_dev_socket_id(port),
455                                                 txconf);
456                 if (retval < 0)
457                         return retval;
458         }
459
460         /* Start the device. */
461         retval = rte_eth_dev_start(port);
462         if (retval < 0) {
463                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
464                 return retval;
465         }
466
467         if (promiscuous)
468                 rte_eth_promiscuous_enable(port);
469
470         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
471         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
472         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
473                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
474                         (unsigned)port,
475                         vmdq_ports_eth_addr[port].addr_bytes[0],
476                         vmdq_ports_eth_addr[port].addr_bytes[1],
477                         vmdq_ports_eth_addr[port].addr_bytes[2],
478                         vmdq_ports_eth_addr[port].addr_bytes[3],
479                         vmdq_ports_eth_addr[port].addr_bytes[4],
480                         vmdq_ports_eth_addr[port].addr_bytes[5]);
481
482         return 0;
483 }
484
485 /*
486  * Set character device basename.
487  */
488 static int
489 us_vhost_parse_basename(const char *q_arg)
490 {
491         /* The basename must fit within the buffer, including the NUL terminator. */
492
493         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
494                 return -1;
495         else
496                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
497
498         return 0;
499 }
500
501 /*
502  * Parse the portmask provided at run time.
503  */
504 static int
505 parse_portmask(const char *portmask)
506 {
507         char *end = NULL;
508         unsigned long pm;
509
510         errno = 0;
511
512         /* parse hexadecimal string */
513         pm = strtoul(portmask, &end, 16);
514         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
515                 return -1;
516
517         if (pm == 0)
518                 return -1;
519
520         return pm;
521
522 }
523
524 /*
525  * Parse num options at run time.
526  */
527 static int
528 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
529 {
530         char *end = NULL;
531         unsigned long num;
532
533         errno = 0;
534
535         /* parse unsigned int string */
536         num = strtoul(q_arg, &end, 10);
537         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
538                 return -1;
539
540         if (num > max_valid_value)
541                 return -1;
542
543         return num;
544
545 }
546
547 /*
548  * Display usage
549  */
550 static void
551 us_vhost_usage(const char *prgname)
552 {
553         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
554         "               --vm2vm [0|1|2]\n"
555         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
556         "               --dev-basename <name>\n"
557         "               --nb-devices ND\n"
558         "               -p PORTMASK: Set mask for ports to be used by application\n"
559         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
560         "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if the destination queue is full\n"
561         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Effective only if RX retries are enabled\n"
562         "               --rx-retry-num [0-N]: the number of retries on RX. Effective only if RX retries are enabled\n"
563         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
564         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
565         "               --dev-basename: The basename to be used for the character device.\n"
566         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
567                         "zero copy\n"
568         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
569                         "used only when zero copy is enabled.\n"
570         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
571                         "used only when zero copy is enabled.\n",
572                prgname);
573 }
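/*
 * Illustrative invocation (the binary name and EAL flags depend on the
 * build and platform):
 *   ./build/vhost-switch -c 0x3 -n 4 -- -p 0x1 --vm2vm 1 --stats 2
 */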
574
575 /*
576  * Parse the arguments given in the command line of the application.
577  */
578 static int
579 us_vhost_parse_args(int argc, char **argv)
580 {
581         int opt, ret;
582         int option_index;
583         unsigned i;
584         const char *prgname = argv[0];
585         static struct option long_option[] = {
586                 {"vm2vm", required_argument, NULL, 0},
587                 {"rx-retry", required_argument, NULL, 0},
588                 {"rx-retry-delay", required_argument, NULL, 0},
589                 {"rx-retry-num", required_argument, NULL, 0},
590                 {"mergeable", required_argument, NULL, 0},
591                 {"stats", required_argument, NULL, 0},
592                 {"dev-basename", required_argument, NULL, 0},
593                 {"zero-copy", required_argument, NULL, 0},
594                 {"rx-desc-num", required_argument, NULL, 0},
595                 {"tx-desc-num", required_argument, NULL, 0},
596                 {NULL, 0, 0, 0},
597         };
598
599         /* Parse command line */
600         while ((opt = getopt_long(argc, argv, "p:P",
601                         long_option, &option_index)) != EOF) {
602                 switch (opt) {
603                 /* Portmask */
604                 case 'p':
605                         enabled_port_mask = parse_portmask(optarg);
606                         if (enabled_port_mask == (uint32_t)-1) {
607                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608                                 us_vhost_usage(prgname);
609                                 return -1;
610                         }
611                         break;
612
613                 case 'P':
614                         promiscuous = 1;
615                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
616                                 ETH_VMDQ_ACCEPT_BROADCAST |
617                                 ETH_VMDQ_ACCEPT_MULTICAST;
618                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
619
620                         break;
621
622                 case 0:
623                         /* Enable/disable vm2vm comms. */
624                         if (!strncmp(long_option[option_index].name, "vm2vm",
625                                 MAX_LONG_OPT_SZ)) {
626                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
627                                 if (ret == -1) {
628                                         RTE_LOG(INFO, VHOST_CONFIG,
629                                                 "Invalid argument for "
630                                                 "vm2vm [0|1|2]\n");
631                                         us_vhost_usage(prgname);
632                                         return -1;
633                                 } else {
634                                         vm2vm_mode = (vm2vm_type)ret;
635                                 }
636                         }
637
638                         /* Enable/disable retries on RX. */
639                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
640                                 ret = parse_num_opt(optarg, 1);
641                                 if (ret == -1) {
642                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
643                                         us_vhost_usage(prgname);
644                                         return -1;
645                                 } else {
646                                         enable_retry = ret;
647                                 }
648                         }
649
650                         /* Specify the retries delay time (in useconds) on RX. */
651                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
652                                 ret = parse_num_opt(optarg, INT32_MAX);
653                                 if (ret == -1) {
654                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
655                                         us_vhost_usage(prgname);
656                                         return -1;
657                                 } else {
658                                         burst_rx_delay_time = ret;
659                                 }
660                         }
661
662                         /* Specify the retries number on RX. */
663                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
664                                 ret = parse_num_opt(optarg, INT32_MAX);
665                                 if (ret == -1) {
666                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
667                                         us_vhost_usage(prgname);
668                                         return -1;
669                                 } else {
670                                         burst_rx_retry_num = ret;
671                                 }
672                         }
673
674                         /* Enable/disable RX mergeable buffers. */
675                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
676                                 ret = parse_num_opt(optarg, 1);
677                                 if (ret == -1) {
678                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
679                                         us_vhost_usage(prgname);
680                                         return -1;
681                                 } else {
682                                         mergeable = !!ret;
683                                         if (ret) {
684                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
685                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
686                                                         = JUMBO_FRAME_MAX_SIZE;
687                                         }
688                                 }
689                         }
690
691                         /* Enable/disable stats. */
692                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
693                                 ret = parse_num_opt(optarg, INT32_MAX);
694                                 if (ret == -1) {
695                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
696                                         us_vhost_usage(prgname);
697                                         return -1;
698                                 } else {
699                                         enable_stats = ret;
700                                 }
701                         }
702
703                         /* Set character device basename. */
704                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
705                                 if (us_vhost_parse_basename(optarg) == -1) {
706                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
707                                         us_vhost_usage(prgname);
708                                         return -1;
709                                 }
710                         }
711
712                         /* Enable/disable rx/tx zero copy. */
713                         if (!strncmp(long_option[option_index].name,
714                                 "zero-copy", MAX_LONG_OPT_SZ)) {
715                                 ret = parse_num_opt(optarg, 1);
716                                 if (ret == -1) {
717                                         RTE_LOG(INFO, VHOST_CONFIG,
718                                                 "Invalid argument"
719                                                 " for zero-copy [0|1]\n");
720                                         us_vhost_usage(prgname);
721                                         return -1;
722                                 } else
723                                         zero_copy = ret;
724
725                                 if (zero_copy) {
726 #ifdef RTE_MBUF_REFCNT
727                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
728                                         "zero copy vhost APP, please "
729                                         "disable RTE_MBUF_REFCNT\n"
730                                         "in config file and then rebuild DPDK "
731                                         "core lib!\n"
732                                         "Otherwise please disable zero copy "
733                                         "flag in command line!\n");
734                                         return -1;
735 #endif
736                                 }
737                         }
738
739                         /* Specify the descriptor number on RX. */
740                         if (!strncmp(long_option[option_index].name,
741                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
742                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
743                                 if ((ret == -1) || (!POWEROF2(ret))) {
744                                         RTE_LOG(INFO, VHOST_CONFIG,
745                                         "Invalid argument for rx-desc-num [0-N], "
746                                         "power of 2 required.\n");
747                                         us_vhost_usage(prgname);
748                                         return -1;
749                                 } else {
750                                         num_rx_descriptor = ret;
751                                 }
752                         }
753
754                         /* Specify the descriptor number on TX. */
755                         if (!strncmp(long_option[option_index].name,
756                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
757                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
758                                 if ((ret == -1) || (!POWEROF2(ret))) {
759                                         RTE_LOG(INFO, VHOST_CONFIG,
760                                         "Invalid argument for tx-desc-num [0-N], "
761                                         "power of 2 required.\n");
762                                         us_vhost_usage(prgname);
763                                         return -1;
764                                 } else {
765                                         num_tx_descriptor = ret;
766                                 }
767                         }
768
769                         break;
770
771                         /* Invalid option - print options. */
772                 default:
773                         us_vhost_usage(prgname);
774                         return -1;
775                 }
776         }
777
778         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
779                 if (enabled_port_mask & (1 << i))
780                         ports[num_ports++] = (uint8_t)i;
781         }
782
783         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
784                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
785                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
786                 return -1;
787         }
788
789         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
790                 RTE_LOG(INFO, VHOST_PORT,
791                         "Vhost zero copy doesn't support software vm2vm; "
792                         "please specify '--vm2vm 2' to use hardware vm2vm.\n");
793                 return -1;
794         }
795
796         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
797                 RTE_LOG(INFO, VHOST_PORT,
798                         "Vhost zero copy doesn't support jumbo frames; "
799                         "please specify '--mergeable 0' to disable the "
800                         "mergeable feature.\n");
801                 return -1;
802         }
803
804         return 0;
805 }
806
807 /*
808  * Update the global variable num_ports and the ports array according to
809  * the number of ports in the system, and return the number of valid ports.
810  */
811 static unsigned check_ports_num(unsigned nb_ports)
812 {
813         unsigned valid_num_ports = num_ports;
814         unsigned portid;
815
816         if (num_ports > nb_ports) {
817                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
818                         num_ports, nb_ports);
819                 num_ports = nb_ports;
820         }
821
822         for (portid = 0; portid < num_ports; portid ++) {
823                 if (ports[portid] >= nb_ports) {
824                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
825                                 ports[portid], (nb_ports - 1));
826                         ports[portid] = INVALID_PORT_ID;
827                         valid_num_ports--;
828                 }
829         }
830         return valid_num_ports;
831 }
832
833 /*
834  * Macro to print out packet contents. Wrapped in debug define so that the
835  * data path is not affected when debug is disabled.
836  */
837 #ifdef DEBUG
838 #define PRINT_PACKET(device, addr, size, header) do { \
839         char *pkt_addr = (char *)(addr); \
840         unsigned int index; \
841         char packet[MAX_PRINT_BUFF]; \
842 \
843         if ((header)) \
844                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
845         else \
846                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
847         for (index = 0; index < (size); index++) { \
848                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
849                         "%02hhx ", pkt_addr[index]); \
850         } \
851         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
852 \
853         LOG_DEBUG(VHOST_DATA, "%s", packet); \
854 } while (0)
855 #else
856 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
857 #endif
858
859 /*
860  * Function to convert guest physical addresses to vhost physical addresses.
861  * This is used to convert virtio buffer addresses.
862  */
863 static inline uint64_t __attribute__((always_inline))
864 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
865         uint32_t buf_len, hpa_type *addr_type)
866 {
867         struct virtio_memory_regions_hpa *region;
868         uint32_t regionidx;
869         uint64_t vhost_pa = 0;
870
871         *addr_type = PHYS_ADDR_INVALID;
872
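        /*
         * Walk the guest's physical memory regions: a buffer that fits
         * entirely within one region is contiguous in host physical memory;
         * otherwise it crosses a sub-region boundary.
         */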
873         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
874                 region = &vdev->regions_hpa[regionidx];
875                 if ((guest_pa >= region->guest_phys_address) &&
876                         (guest_pa <= region->guest_phys_address_end)) {
877                         vhost_pa = region->host_phys_addr_offset + guest_pa;
878                         if (likely((guest_pa + buf_len - 1)
879                                 <= region->guest_phys_address_end))
880                                 *addr_type = PHYS_ADDR_CONTINUOUS;
881                         else
882                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
883                         break;
884                 }
885         }
886
887         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
888                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
889                 (void *)(uintptr_t)vhost_pa);
890
891         return vhost_pa;
892 }
893
894 /*
895  * Compares a packet destination MAC address to a device MAC address.
896  */
897 static inline int __attribute__((always_inline))
898 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
899 {
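        /*
         * Each 6-byte address is loaded as 8 bytes and the top 16 bits are
         * masked off (little-endian), so only the 48 MAC bits are compared.
         * Note the loads read 2 bytes beyond each ether_addr.
         */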
900         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
901 }
902
903 /*
904  * This function learns the MAC address of the device and registers this along with a
905  * vlan tag to a VMDQ.
906  */
907 static int
908 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
909 {
910         struct ether_hdr *pkt_hdr;
911         struct virtio_net_data_ll *dev_ll;
912         struct virtio_net *dev = vdev->dev;
913         int i, ret;
914
915         /* Learn MAC address of guest device from packet */
916         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
917
918         dev_ll = ll_root_used;
919
920         while (dev_ll != NULL) {
921                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
922                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
923                         return -1;
924                 }
925                 dev_ll = dev_ll->next;
926         }
927
928         for (i = 0; i < ETHER_ADDR_LEN; i++)
929                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
930
931         /* The vlan_tag is currently derived from the device_fh. */
932         vdev->vlan_tag = vlan_tags[dev->device_fh];
933
934         /* Print out VMDQ registration info. */
935         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
936                 dev->device_fh,
937                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
938                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
939                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
940                 vdev->vlan_tag);
941
942         /* Register the MAC address. */
943         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
944                                 (uint32_t)dev->device_fh + vmdq_pool_base);
945         if (ret)
946                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
947                                         dev->device_fh);
948
949         /* Enable stripping of the vlan tag as we handle routing. */
950         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
951
952         /* Set device as ready for RX. */
953         vdev->ready = DEVICE_RX;
954
955         return 0;
956 }
957
958 /*
959  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
960  * queue before disabling RX on the device.
961  */
962 static inline void
963 unlink_vmdq(struct vhost_dev *vdev)
964 {
965         unsigned i = 0;
966         unsigned rx_count;
967         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
968
969         if (vdev->ready == DEVICE_RX) {
970                 /*clear MAC and VLAN settings*/
971                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
972                 for (i = 0; i < 6; i++)
973                         vdev->mac_address.addr_bytes[i] = 0;
974
975                 vdev->vlan_tag = 0;
976
977                 /*Clear out the receive buffers*/
978                 rx_count = rte_eth_rx_burst(ports[0],
979                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
980
981                 while (rx_count) {
982                         for (i = 0; i < rx_count; i++)
983                                 rte_pktmbuf_free(pkts_burst[i]);
984
985                         rx_count = rte_eth_rx_burst(ports[0],
986                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
987                 }
988
989                 vdev->ready = DEVICE_MAC_LEARNING;
990         }
991 }
992
993 /*
994  * Check if the packet destination MAC address is for a local device. If so then put
995  * the packet on that devices RX queue. If not then return.
996  */
997 static inline int __attribute__((always_inline))
998 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
999 {
1000         struct virtio_net_data_ll *dev_ll;
1001         struct ether_hdr *pkt_hdr;
1002         uint64_t ret = 0;
1003         struct virtio_net *dev = vdev->dev;
1004         struct virtio_net *tdev; /* destination virtio device */
1005
1006         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1007
1008         /*get the used devices list*/
1009         dev_ll = ll_root_used;
1010
1011         while (dev_ll != NULL) {
1012                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1013                                           &dev_ll->vdev->mac_address)) {
1014
1015                         /* Drop the packet if the TX packet is destined for the TX device. */
1016                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1017                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1018                                                         dev->device_fh);
1019                                 return 0;
1020                         }
1021                         tdev = dev_ll->vdev->dev;
1022
1023
1024                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1025
1026                         if (unlikely(dev_ll->vdev->remove)) {
1027                                 /*drop the packet if the device is marked for removal*/
1028                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1029                         } else {
1030                                 /*send the packet to the local virtio device*/
1031                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1032                                 if (enable_stats) {
1033                                         rte_atomic64_add(
1034                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1035                                         1);
1036                                         rte_atomic64_add(
1037                                         &dev_statistics[tdev->device_fh].rx_atomic,
1038                                         ret);
1039                                         dev_statistics[tdev->device_fh].tx_total++;
1040                                         dev_statistics[tdev->device_fh].tx += ret;
1041                                 }
1042                         }
1043
1044                         return 0;
1045                 }
1046                 dev_ll = dev_ll->next;
1047         }
1048
1049         return -1;
1050 }
1051
1052 /*
1053  * Check if the destination MAC of a packet is one local VM,
1054  * and get its vlan tag, and offset if it is.
1055  */
1056 static inline int __attribute__((always_inline))
1057 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1058         uint32_t *offset, uint16_t *vlan_tag)
1059 {
1060         struct virtio_net_data_ll *dev_ll = ll_root_used;
1061         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1062
1063         while (dev_ll != NULL) {
1064                 if ((dev_ll->vdev->ready == DEVICE_RX)
1065                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1066                 &dev_ll->vdev->mac_address)) {
1067                         /*
1068                          * Drop the packet if the TX packet is
1069                          * destined for the TX device.
1070                          */
1071                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1072                                 LOG_DEBUG(VHOST_DATA,
1073                                 "(%"PRIu64") TX: Source and destination"
1074                                 " MAC addresses are the same. Dropping "
1075                                 "packet.\n",
1076                                 dev_ll->vdev->dev->device_fh);
1077                                 return -1;
1078                         }
1079
1080                         /*
1081                          * HW VLAN stripping reduces the packet length by
1082                          * the length of the VLAN tag, so the packet length
1083                          * must be restored by adding it back.
1084                          */
1085                         *offset = VLAN_HLEN;
1086                         *vlan_tag =
1087                         (uint16_t)
1088                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1089
1090                         LOG_DEBUG(VHOST_DATA,
1091                         "(%"PRIu64") TX: pkt to local VM device id:"
1092                         "(%"PRIu64") vlan tag: %d.\n",
1093                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1094                         *vlan_tag);
1095
1096                         break;
1097                 }
1098                 dev_ll = dev_ll->next;
1099         }
1100         return 0;
1101 }
1102
1103 /*
1104  * This function routes the TX packet to the correct interface. This may be a local device
1105  * or the physical port.
1106  */
1107 static inline void __attribute__((always_inline))
1108 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1109 {
1110         struct mbuf_table *tx_q;
1111         struct rte_mbuf **m_table;
1112         unsigned len, ret, offset = 0;
1113         const uint16_t lcore_id = rte_lcore_id();
1114         struct virtio_net *dev = vdev->dev;
1115
1116         /*check if destination is local VM*/
1117         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1118                 rte_pktmbuf_free(m);
1119                 return;
1120         }
1121
1122         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1123                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1124                         rte_pktmbuf_free(m);
1125                         return;
1126                 }
1127         }
1128
1129         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1130
1131         /*Add packet to the port tx queue*/
1132         tx_q = &lcore_tx_queue[lcore_id];
1133         len = tx_q->len;
1134
1135         m->ol_flags = PKT_TX_VLAN_PKT;
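        /* Request HW VLAN tag insertion on TX; the tag itself is set in vlan_tci below. */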
1136
1137         /*
1138          * Find the right segment whose data_len to adjust when the offset
1139          * is bigger than the first segment's tail room.
1140          */
1141         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1142                 if (likely(offset <= rte_pktmbuf_tailroom(m)))
1143                         m->data_len += offset;
1144                 else {
1145                         struct rte_mbuf *seg = m;
1146
1147                         while ((seg->next != NULL) &&
1148                                 (offset > rte_pktmbuf_tailroom(seg)))
1149                                 seg = seg->next;
1150
1151                         seg->data_len += offset;
1152                 }
1153                 m->pkt_len += offset;
1154         }
1155
1156         m->vlan_tci = vlan_tag;
1157
1158         tx_q->m_table[len] = m;
1159         len++;
1160         if (enable_stats) {
1161                 dev_statistics[dev->device_fh].tx_total++;
1162                 dev_statistics[dev->device_fh].tx++;
1163         }
1164
1165         if (unlikely(len == MAX_PKT_BURST)) {
1166                 m_table = (struct rte_mbuf **)tx_q->m_table;
1167                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1168                 /* Free any buffers not handled by TX and update the port stats. */
1169                 /* Free any buffers not handled by TX. */
1170                         do {
1171                                 rte_pktmbuf_free(m_table[ret]);
1172                         } while (++ret < len);
1173                 }
1174
1175                 len = 0;
1176         }
1177
1178         tx_q->len = len;
1179         return;
1180 }
1181 /*
1182  * This function is called by each data core. It handles all RX/TX registered with the
1183  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1184  * with all devices in the main linked list.
1185  */
1186 static int
1187 switch_worker(void *arg)
1188 {
1189         struct rte_mempool *mbuf_pool = arg;
1190         struct virtio_net *dev = NULL;
1191         struct vhost_dev *vdev = NULL;
1192         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1193         struct virtio_net_data_ll *dev_ll;
1194         struct mbuf_table *tx_q;
1195         volatile struct lcore_ll_info *lcore_ll;
1196         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
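        /* TSC ticks per microsecond (rounded up) times the drain period. */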
1197         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1198         unsigned ret, i;
1199         const uint16_t lcore_id = rte_lcore_id();
1200         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1201         uint16_t rx_count = 0;
1202         uint16_t tx_count;
1203         uint32_t retry = 0;
1204
1205         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1206         lcore_ll = lcore_info[lcore_id].lcore_ll;
1207         prev_tsc = 0;
1208
1209         tx_q = &lcore_tx_queue[lcore_id];
1210         for (i = 0; i < num_cores; i ++) {
1211                 if (lcore_ids[i] == lcore_id) {
1212                         tx_q->txq_id = i;
1213                         break;
1214                 }
1215         }
1216
1217         while(1) {
1218                 cur_tsc = rte_rdtsc();
1219                 /*
1220                  * TX burst queue drain
1221                  */
1222                 diff_tsc = cur_tsc - prev_tsc;
1223                 if (unlikely(diff_tsc > drain_tsc)) {
1224
1225                         if (tx_q->len) {
1226                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);
1227
1228                                 /*Tx any packets in the queue*/
1229                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1230                                                                            (struct rte_mbuf **)tx_q->m_table,
1231                                                                            (uint16_t)tx_q->len);
1232                                 if (unlikely(ret < tx_q->len)) {
1233                                         do {
1234                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1235                                         } while (++ret < tx_q->len);
1236                                 }
1237
1238                                 tx_q->len = 0;
1239                         }
1240
1241                         prev_tsc = cur_tsc;
1242
1243                 }
1244
1245                 rte_prefetch0(lcore_ll->ll_root_used);
1246                 /*
1247                  * Inform the configuration core that we have exited the
1248                  * linked list and that no devices are in use, if requested.
1249                  */
1250                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1251                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1252
1253                 /*
1254                  * Process devices
1255                  */
1256                 dev_ll = lcore_ll->ll_root_used;
1257
1258                 while (dev_ll != NULL) {
1259                         /*get virtio device ID*/
1260                         vdev = dev_ll->vdev;
1261                         dev = vdev->dev;
1262
1263                         if (unlikely(vdev->remove)) {
1264                                 dev_ll = dev_ll->next;
1265                                 unlink_vmdq(vdev);
1266                                 vdev->ready = DEVICE_SAFE_REMOVE;
1267                                 continue;
1268                         }
1269                         if (likely(vdev->ready == DEVICE_RX)) {
1270                                 /* Handle guest RX */
1271                                 rx_count = rte_eth_rx_burst(ports[0],
1272                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1273
1274                                 if (rx_count) {
1275                                         /*
1276                                         * If retry is enabled and the queue is full, we wait and
1277                                         * retry to avoid packet loss; MAX_PKT_BURST must be less than the virtio queue size.
1278                                         */
1279                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1280                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1281                                                         rte_delay_us(burst_rx_delay_time);
1282                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1283                                                                 break;
1284                                                 }
1285                                         }
1286                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1287                                         if (enable_stats) {
1288                                                 rte_atomic64_add(
1289                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1290                                                 rx_count);
1291                                                 rte_atomic64_add(
1292                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1293                                         }
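                                        /*
                                         * rte_vhost_enqueue_burst() copies the
                                         * packets into guest buffers, so the
                                         * host mbufs can be freed immediately.
                                         */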
1294                                         while (likely(rx_count)) {
1295                                                 rx_count--;
1296                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1297                                         }
1298
1299                                 }
1300                         }
1301
1302                         if (likely(!vdev->remove)) {
1303                                 /* Handle guest TX*/
1304                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1305                                 /* If this is the first received packet we need to learn the MAC and set up VMDQ */
1306                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1307                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1308                                                 while (tx_count--)
1309                                                         rte_pktmbuf_free(pkts_burst[tx_count]);
1310                                         }
1311                                 }
1312                                 while (tx_count)
1313                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1314                         }
1315
1316                         /* Move to the next device in the list */
1317                         dev_ll = dev_ll->next;
1318                 }
1319         }
1320
1321         return 0;
1322 }
1323
1324 /*
1325  * This function gets the number of available ring entries for zero copy RX.
1326  * Only one thread will call this function for a particular virtio device,
1327  * so it is designed as a non-thread-safe function.
1328  */
1329 static inline uint32_t __attribute__((always_inline))
1330 get_available_ring_num_zcp(struct virtio_net *dev)
1331 {
1332         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1333         uint16_t avail_idx;
1334
1335         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1336         return (uint32_t)(uint16_t)(avail_idx - vq->last_used_idx_res);
1337 }
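/*
 * A note on the index arithmetic above (values illustrative): both indexes
 * are free-running 16-bit counters, so the subtraction relies on unsigned
 * wrap-around. E.g. avail_idx = 2 after wrapping and last_used_idx_res =
 * 65534 yields (uint16_t)(2 - 65534) = 4 available entries.
 */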
1338
1339 /*
1340  * This function gets available ring indexes for zero copy RX;
1341  * it will retry 'burst_rx_retry_num' times until it gets enough entries.
1342  * Only one thread will call this function for a particular virtio device,
1343  * so it is designed as a non-thread-safe function.
1344  */
1345 static inline uint32_t __attribute__((always_inline))
1346 get_available_ring_index_zcp(struct virtio_net *dev,
1347         uint16_t *res_base_idx, uint32_t count)
1348 {
1349         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1350         uint16_t avail_idx;
1351         uint32_t retry = 0;
1352         uint16_t free_entries;
1353
1354         *res_base_idx = vq->last_used_idx_res;
1355         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1356         free_entries = (avail_idx - *res_base_idx);
1357
1358         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1359                         "avail idx: %d, "
1360                         "res base idx:%d, free entries:%d\n",
1361                         dev->device_fh, avail_idx, *res_base_idx,
1362                         free_entries);
1363
1364         /*
1365          * If retry is enabled and the queue is full then we wait
1366          * and retry to avoid packet loss.
1367          */
1368         if (enable_retry && unlikely(count > free_entries)) {
1369                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1370                         rte_delay_us(burst_rx_delay_time);
1371                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1372                         free_entries = (avail_idx - *res_base_idx);
1373                         if (count <= free_entries)
1374                                 break;
1375                 }
1376         }
1377
1378         /* Check that we have enough buffers. */
1379         if (unlikely(count > free_entries))
1380                 count = free_entries;
1381
1382         if (unlikely(count == 0)) {
1383                 LOG_DEBUG(VHOST_DATA,
1384                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1385                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1386                         dev->device_fh, avail_idx,
1387                         *res_base_idx, free_entries);
1388                 return 0;
1389         }
1390
1391         vq->last_used_idx_res = *res_base_idx + count;
1392
1393         return count;
1394 }
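/*
 * Reservation semantics of the function above (values illustrative): with
 * last_used_idx_res = 10 and count = 4, the caller is granted avail ring
 * slots 10..13 and last_used_idx_res advances to 14 before returning, so a
 * subsequent call cannot hand out the same slots again.
 */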
1395
1396 /*
1397  * This function puts a descriptor back on the used list.
1398  */
1399 static inline void __attribute__((always_inline))
1400 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1401 {
1402         uint16_t res_cur_idx = vq->last_used_idx;
1403         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1404         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1405         rte_compiler_barrier();
1406         *(volatile uint16_t *)&vq->used->idx += 1;
1407         vq->last_used_idx += 1;
1408
1409         /* Kick the guest if necessary. */
1410         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1411                 eventfd_write((int)vq->kickfd, 1);
1412 }
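/*
 * Ordering note for the publish sequence above: the used ring entry must be
 * globally visible before the guest can observe the incremented used->idx,
 * hence the rte_compiler_barrier() between the two writes. The eventfd kick
 * is skipped when the guest suppresses interrupts via
 * VRING_AVAIL_F_NO_INTERRUPT.
 */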
1413
1414 /*
1415  * This function gets an available descriptor from the virtio vring and an
1416  * un-attached mbuf from vpool->ring, then attaches them together. It needs to
1417  * adjust the offsets for buff_addr and phys_addr according to the PMD
1418  * implementation, otherwise the frame data may be put at the wrong place in the mbuf.
1419  */
1420 static inline void __attribute__((always_inline))
1421 attach_rxmbuf_zcp(struct virtio_net *dev)
1422 {
1423         uint16_t res_base_idx, desc_idx;
1424         uint64_t buff_addr, phys_addr;
1425         struct vhost_virtqueue *vq;
1426         struct vring_desc *desc;
1427         struct rte_mbuf *mbuf = NULL;
1428         struct vpool *vpool;
1429         hpa_type addr_type;
1430         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1431
1432         vpool = &vpool_array[vdev->vmdq_rx_q];
1433         vq = dev->virtqueue[VIRTIO_RXQ];
1434
1435         do {
1436                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1437                                 1) != 1))
1438                         return;
1439                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1440
1441                 desc = &vq->desc[desc_idx];
1442                 if (desc->flags & VRING_DESC_F_NEXT) {
1443                         desc = &vq->desc[desc->next];
1444                         buff_addr = gpa_to_vva(dev, desc->addr);
1445                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1446                                         &addr_type);
1447                 } else {
1448                         buff_addr = gpa_to_vva(dev,
1449                                         desc->addr + vq->vhost_hlen);
1450                         phys_addr = gpa_to_hpa(vdev,
1451                                         desc->addr + vq->vhost_hlen,
1452                                         desc->len, &addr_type);
1453                 }
1454
1455                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1456                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1457                                 " address found when attaching RX frame buffer"
1458                                 " address!\n", dev->device_fh);
1459                         put_desc_to_used_list_zcp(vq, desc_idx);
1460                         continue;
1461                 }
1462
1463                 /*
1464                  * Check if the frame buffer address from guest crosses
1465                  * sub-region or not.
1466                  */
1467                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1468                         RTE_LOG(ERR, VHOST_DATA,
1469                                 "(%"PRIu64") Frame buffer address cross "
1470                                 "sub-region found when attaching RX frame "
1471                                 "buffer address!\n",
1472                                 dev->device_fh);
1473                         put_desc_to_used_list_zcp(vq, desc_idx);
1474                         continue;
1475                 }
1476         } while (unlikely(phys_addr == 0));
1477
1478         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1479         if (unlikely(mbuf == NULL)) {
1480                 LOG_DEBUG(VHOST_DATA,
1481                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1482                         "ring_sc_dequeue fail.\n",
1483                         dev->device_fh);
1484                 put_desc_to_used_list_zcp(vq, desc_idx);
1485                 return;
1486         }
1487
1488         if (unlikely(vpool->buf_size > desc->len)) {
1489                 LOG_DEBUG(VHOST_DATA,
1490                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1491                         "length(%d) of descriptor idx: %d less than room "
1492                         "size required: %d\n",
1493                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1494                 put_desc_to_used_list_zcp(vq, desc_idx);
1495                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1496                 return;
1497         }
1498
1499         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1500         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1501         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1502         mbuf->data_len = desc->len;
1503         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1504
1505         LOG_DEBUG(VHOST_DATA,
1506                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1507                 "descriptor idx:%d\n",
1508                 dev->device_fh, res_base_idx, desc_idx);
1509
1510         __rte_mbuf_raw_free(mbuf);
1511
1512         return;
1513 }
1514
1515 /*
1516  * Detach an attached packet mbuf -
1517  *  - restore original mbuf address and length values.
1518  *  - reset pktmbuf data and data_len to their default values.
1519  *  All other fields of the given packet mbuf will be left intact.
1520  *
1521  * @param m
1522  *   The attached packet mbuf.
1523  */
1524 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1525 {
1526         const struct rte_mempool *mp = m->pool;
1527         void *buf = RTE_MBUF_TO_BADDR(m);
1528         uint32_t buf_ofs;
1529         uint32_t buf_len = mp->elt_size - sizeof(*m);
1530         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1531
1532         m->buf_addr = buf;
1533         m->buf_len = (uint16_t)buf_len;
1534
1535         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1536                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1537         m->data_off = buf_ofs;
1538
1539         m->data_len = 0;
1540 }
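/*
 * Usage sketch (assumes an mbuf 'm' previously attached by
 * attach_rxmbuf_zcp(); 'vpool' stands for the caller's vpool, as in
 * txmbuf_clean_zcp() below):
 *
 *	pktmbuf_detach_zcp(m);                 <- restore m's own buffer
 *	rte_ring_sp_enqueue(vpool->ring, m);   <- recycle the mbuf for reuse
 *
 * After the call, buf_addr and buf_physaddr reference the mbuf's private
 * data area again and data_len is 0.
 */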
1541
1542 /*
1543  * This function is called after packets have been transmitted. It fetches
1544  * mbufs from vpool->pool, detaches them, and puts them back into vpool->ring.
1545  * It also updates the used index and kicks the guest if necessary.
1546  */
1547 static inline uint32_t __attribute__((always_inline))
1548 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1549 {
1550         struct rte_mbuf *mbuf;
1551         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1552         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1553         uint32_t index = 0;
1554         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1555
1556         LOG_DEBUG(VHOST_DATA,
1557                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1558                 "clean is: %d\n",
1559                 dev->device_fh, mbuf_count);
1560         LOG_DEBUG(VHOST_DATA,
1561                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1562                 "clean is: %d\n",
1563                 dev->device_fh, rte_ring_count(vpool->ring));
1564
1565         for (index = 0; index < mbuf_count; index++) {
1566                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1567                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1568                         pktmbuf_detach_zcp(mbuf);
1569                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1570
1571                 /* Update used index buffer information. */
1572                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1573                 vq->used->ring[used_idx].len = 0;
1574
1575                 used_idx = (used_idx + 1) & (vq->size - 1);
1576         }
1577
1578         LOG_DEBUG(VHOST_DATA,
1579                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1580                 "clean is: %d\n",
1581                 dev->device_fh, rte_mempool_count(vpool->pool));
1582         LOG_DEBUG(VHOST_DATA,
1583                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1584                 "clean is: %d\n",
1585                 dev->device_fh, rte_ring_count(vpool->ring));
1586         LOG_DEBUG(VHOST_DATA,
1587                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1588                 "vq->last_used_idx:%d\n",
1589                 dev->device_fh, vq->last_used_idx);
1590
1591         vq->last_used_idx += mbuf_count;
1592
1593         LOG_DEBUG(VHOST_DATA,
1594                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1595                 "vq->last_used_idx:%d\n",
1596                 dev->device_fh, vq->last_used_idx);
1597
1598         rte_compiler_barrier();
1599
1600         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1601
1602         /* Kick guest if required. */
1603         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1604                 eventfd_write((int)vq->kickfd, 1);
1605
1606         return 0;
1607 }
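/*
 * A sketch of the intended reclaim flow (as far as the surrounding code
 * shows): mbufs handed to the NIC return to vpool->pool when the PMD frees
 * them after transmit; the function above then drains the pool, detaches
 * each mbuf from the guest buffer it pointed at, re-queues it on vpool->ring
 * for reuse, and publishes the corresponding used ring entries in one batch.
 */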
1608
1609 /*
1610  * This function is called when a virtio device is destroyed.
1611  * It fetches mbufs from vpool->pool, detaches them, and puts them back into vpool->ring.
1612  */
1613 static void mbuf_destroy_zcp(struct vpool *vpool)
1614 {
1615         struct rte_mbuf *mbuf = NULL;
1616         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1617
1618         LOG_DEBUG(VHOST_CONFIG,
1619                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1620                 "mbuf_destroy_zcp is: %d\n",
1621                 mbuf_count);
1622         LOG_DEBUG(VHOST_CONFIG,
1623                 "in mbuf_destroy_zcp: mbuf count in ring before "
1624                 "mbuf_destroy_zcp is: %d\n",
1625                 rte_ring_count(vpool->ring));
1626
1627         for (index = 0; index < mbuf_count; index++) {
1628                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1629                 if (likely(mbuf != NULL)) {
1630                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1631                                 pktmbuf_detach_zcp(mbuf);
1632                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1633                 }
1634         }
1635
1636         LOG_DEBUG(VHOST_CONFIG,
1637                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1638                 "mbuf_destroy_zcp is: %d\n",
1639                 rte_mempool_count(vpool->pool));
1640         LOG_DEBUG(VHOST_CONFIG,
1641                 "in mbuf_destroy_zcp: mbuf count in ring after "
1642                 "mbuf_destroy_zcp is: %d\n",
1643                 rte_ring_count(vpool->ring));
1644 }
1645
1646 /*
1647  * This function performs guest RX for zero copy: it updates the used ring and writes a virtio header for each pre-attached packet.
1648  */
1649 static inline uint32_t __attribute__((always_inline))
1650 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1651         uint32_t count)
1652 {
1653         struct vhost_virtqueue *vq;
1654         struct vring_desc *desc;
1655         struct rte_mbuf *buff;
1656         /* The virtio_hdr is initialised to 0. */
1657         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1658                 = {{0, 0, 0, 0, 0, 0}, 0};
1659         uint64_t buff_hdr_addr = 0;
1660         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1661         uint32_t head_idx, packet_success = 0;
1662         uint16_t res_cur_idx;
1663
1664         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1665
1666         if (count == 0)
1667                 return 0;
1668
1669         vq = dev->virtqueue[VIRTIO_RXQ];
1670         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1671
1672         res_cur_idx = vq->last_used_idx;
1673         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1674                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1675
1676         /* Retrieve all of the head indexes first to avoid caching issues. */
1677         for (head_idx = 0; head_idx < count; head_idx++)
1678                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1679
1680         /* Prefetch descriptor index. */
1681         rte_prefetch0(&vq->desc[head[packet_success]]);
1682
1683         while (packet_success != count) {
1684                 /* Get descriptor from available ring */
1685                 desc = &vq->desc[head[packet_success]];
1686
1687                 buff = pkts[packet_success];
1688                 LOG_DEBUG(VHOST_DATA,
1689                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1690                         "pkt[%d] descriptor idx: %d\n",
1691                         dev->device_fh, packet_success,
1692                         MBUF_HEADROOM_UINT32(buff));
1693
1694                 PRINT_PACKET(dev,
1695                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1696                         + RTE_PKTMBUF_HEADROOM),
1697                         rte_pktmbuf_data_len(buff), 0);
1698
1699                 /* Buffer address translation for virtio header. */
1700                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1701                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1702
1703                 /*
1704                  * If the descriptors are chained the header and data are
1705                  * placed in separate buffers.
1706                  */
1707                 if (desc->flags & VRING_DESC_F_NEXT) {
1708                         desc->len = vq->vhost_hlen;
1709                         desc = &vq->desc[desc->next];
1710                         desc->len = rte_pktmbuf_data_len(buff);
1711                 } else {
1712                         desc->len = packet_len;
1713                 }
1714
1715                 /* Update used ring with desc information */
1716                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1717                         = head[packet_success];
1718                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1719                         = packet_len;
1720                 res_cur_idx++;
1721                 packet_success++;
1722
1723                 /* A header is required per buffer. */
1724                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1725                         (const void *)&virtio_hdr, vq->vhost_hlen);
1726
1727                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1728
1729                 if (likely(packet_success < count)) {
1730                         /* Prefetch descriptor index. */
1731                         rte_prefetch0(&vq->desc[head[packet_success]]);
1732                 }
1733         }
1734
1735         rte_compiler_barrier();
1736
1737         LOG_DEBUG(VHOST_DATA,
1738                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1739                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1740                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1741
1742         *(volatile uint16_t *)&vq->used->idx += count;
1743         vq->last_used_idx += count;
1744
1745         LOG_DEBUG(VHOST_DATA,
1746                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1747                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1748                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1749
1750         /* Kick the guest if necessary. */
1751         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1752                 eventfd_write((int)vq->kickfd, 1);
1753
1754         return count;
1755 }
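/*
 * Descriptor layouts handled above (illustrative):
 *
 *	chained:  desc[head] (len = vhost_hlen)  -> virtio header
 *	          desc[next] (len = data_len)    -> frame data
 *	single:   desc[head] (len = data_len + vhost_hlen)
 *
 * i.e. a chained descriptor keeps the header and the frame in separate
 * buffers, while a single descriptor carries the header immediately
 * followed by the frame.
 */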
1756
1757 /*
1758  * This function routes the TX packet to the correct interface.
1759  * This may be a local device or the physical port.
1760  */
1761 static inline void __attribute__((always_inline))
1762 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1763         uint32_t desc_idx, uint8_t need_copy)
1764 {
1765         struct mbuf_table *tx_q;
1766         struct rte_mbuf **m_table;
1767         struct rte_mbuf *mbuf = NULL;
1768         unsigned len, ret, offset = 0;
1769         struct vpool *vpool;
1770         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1771         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1772
1773         /* Add packet to the port tx queue */
1774         tx_q = &tx_queue_zcp[vmdq_rx_q];
1775         len = tx_q->len;
1776
1777         /* Allocate an mbuf and populate the structure. */
1778         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1779         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1780         if (unlikely(mbuf == NULL)) {
1781                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1782                 RTE_LOG(ERR, VHOST_DATA,
1783                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1784                         dev->device_fh);
1785                 put_desc_to_used_list_zcp(vq, desc_idx);
1786                 return;
1787         }
1788
1789         if (vm2vm_mode == VM2VM_HARDWARE) {
1790                 /* Avoid using a VLAN tag from any VM for an external pkt, such as
1791                  * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1792                  * selection: the MAC address identifies it as an external pkt
1793                  * which should go to the network, while the VLAN tag identifies it
1794                  * as a vm2vm pkt that should be forwarded to another VM. The
1795                  * hardware cannot resolve such an ambiguous situation, so the pkt
1796                  * would be lost.
1797                  */
1797                 vlan_tag = external_pkt_default_vlan_tag;
1798                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1799                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1800                         __rte_mbuf_raw_free(mbuf);
1801                         return;
1802                 }
1803         }
1804
1805         mbuf->nb_segs = m->nb_segs;
1806         mbuf->next = m->next;
1807         mbuf->data_len = m->data_len + offset;
1808         mbuf->pkt_len = mbuf->data_len;
1809         if (unlikely(need_copy)) {
1810                 /* Copy the packet contents to the mbuf. */
1811                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1812                         rte_pktmbuf_mtod(m, void *),
1813                         m->data_len);
1814         } else {
1815                 mbuf->data_off = m->data_off;
1816                 mbuf->buf_physaddr = m->buf_physaddr;
1817                 mbuf->buf_addr = m->buf_addr;
1818         }
1819         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1820         mbuf->vlan_tci = vlan_tag;
1821         mbuf->l2_len = sizeof(struct ether_hdr);
1822         mbuf->l3_len = sizeof(struct ipv4_hdr);
1823         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1824
1825         tx_q->m_table[len] = mbuf;
1826         len++;
1827
1828         LOG_DEBUG(VHOST_DATA,
1829                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1830                 dev->device_fh,
1831                 mbuf->nb_segs,
1832                 (mbuf->next == NULL) ? "null" : "non-null");
1833
1834         if (enable_stats) {
1835                 dev_statistics[dev->device_fh].tx_total++;
1836                 dev_statistics[dev->device_fh].tx++;
1837         }
1838
1839         if (unlikely(len == MAX_PKT_BURST)) {
1840                 m_table = (struct rte_mbuf **)tx_q->m_table;
1841                 ret = rte_eth_tx_burst(ports[0],
1842                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1843
1844                 /*
1845                  * Free any buffers not handled by TX and update
1846                  * the port stats.
1847                  */
1848                 if (unlikely(ret < len)) {
1849                         do {
1850                                 rte_pktmbuf_free(m_table[ret]);
1851                         } while (++ret < len);
1852                 }
1853
1854                 len = 0;
1855                 txmbuf_clean_zcp(dev, vpool);
1856         }
1857
1858         tx_q->len = len;
1859
1860         return;
1861 }
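/*
 * Batching note: packets accumulate in tx_q->m_table and are only flushed
 * to the NIC once MAX_PKT_BURST entries are queued; any remainder is pushed
 * out by the periodic TX drain in switch_worker_zcp() below.
 */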
1862
1863 /*
1864  * This function transmits all available packets in the virtio TX queue for
1865  * one virtio-net device. If it is the first packet, it learns the MAC
1866  * address and sets up VMDQ.
1867  */
1868 static inline void __attribute__((always_inline))
1869 virtio_dev_tx_zcp(struct virtio_net *dev)
1870 {
1871         struct rte_mbuf m;
1872         struct vhost_virtqueue *vq;
1873         struct vring_desc *desc;
1874         uint64_t buff_addr = 0, phys_addr;
1875         uint32_t head[MAX_PKT_BURST];
1876         uint32_t i;
1877         uint16_t free_entries, packet_success = 0;
1878         uint16_t avail_idx;
1879         uint8_t need_copy = 0;
1880         hpa_type addr_type;
1881         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1882
1883         vq = dev->virtqueue[VIRTIO_TXQ];
1884         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1885
1886         /* If there are no available buffers then return. */
1887         if (vq->last_used_idx_res == avail_idx)
1888                 return;
1889
1890         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1891
1892         /* Prefetch available ring to retrieve head indexes. */
1893         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1894
1895         /* Get the number of free entries in the ring */
1896         free_entries = (avail_idx - vq->last_used_idx_res);
1897
1898         /* Limit to MAX_PKT_BURST. */
1899         free_entries
1900                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1901
1902         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1903                 dev->device_fh, free_entries);
1904
1905         /* Retrieve all of the head indexes first to avoid caching issues. */
1906         for (i = 0; i < free_entries; i++)
1907                 head[i]
1908                         = vq->avail->ring[(vq->last_used_idx_res + i)
1909                         & (vq->size - 1)];
1910
1911         vq->last_used_idx_res += free_entries;
1912
1913         /* Prefetch descriptor index. */
1914         rte_prefetch0(&vq->desc[head[packet_success]]);
1915         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1916
1917         while (packet_success < free_entries) {
1918                 desc = &vq->desc[head[packet_success]];
1919
1920                 /* Discard first buffer as it is the virtio header */
1921                 desc = &vq->desc[desc->next];
1922
1923                 /* Buffer address translation. */
1924                 buff_addr = gpa_to_vva(dev, desc->addr);
1925                 /* Need check extra VLAN_HLEN size for inserting VLAN tag */
1926                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1927                         &addr_type);
1928
1929                 if (likely(packet_success < (free_entries - 1)))
1930                         /* Prefetch descriptor index. */
1931                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1932
1933                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1934                         RTE_LOG(ERR, VHOST_DATA,
1935                                 "(%"PRIu64") Invalid frame buffer address found "
1936                                 "when TX packets!\n",
1937                                 dev->device_fh);
1938                         packet_success++;
1939                         continue;
1940                 }
1941
1942                 /* Prefetch buffer address. */
1943                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1944
1945                 /*
1946                  * Setup dummy mbuf. This is copied to a real mbuf if
1947                  * transmitted out the physical port.
1948                  */
1949                 m.data_len = desc->len;
1950                 m.nb_segs = 1;
1951                 m.next = NULL;
1952                 m.data_off = 0;
1953                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1954                 m.buf_physaddr = phys_addr;
1955
1956                 /*
1957                  * Check if the frame buffer address from guest crosses
1958                  * sub-region or not.
1959                  */
1960                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1961                         RTE_LOG(ERR, VHOST_DATA,
1962                                 "(%"PRIu64") Frame buffer address cross "
1963                                 "sub-region found when attaching TX frame "
1964                                 "buffer address!\n",
1965                                 dev->device_fh);
1966                         need_copy = 1;
1967                 } else
1968                         need_copy = 0;
1969
1970                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1971
1972                 /*
1973                  * If this is the first received packet we need to learn
1974                  * the MAC and setup VMDQ
1975                  */
1976                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1977                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1978                                 /*
1979                                  * Discard frame if device is scheduled for
1980                                  * removal or a duplicate MAC address is found.
1981                                  */
1982                                 packet_success += free_entries;
1983                                 vq->last_used_idx += packet_success;
1984                                 break;
1985                         }
1986                 }
1987
1988                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1989                 packet_success++;
1990         }
1991 }
1992
1993 /*
1994  * This function is called by each data core. It handles all RX/TX registered
1995  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1996  * addresses are compared with all devices in the main linked list.
1997  */
1998 static int
1999 switch_worker_zcp(__attribute__((unused)) void *arg)
2000 {
2001         struct virtio_net *dev = NULL;
2002         struct vhost_dev  *vdev = NULL;
2003         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2004         struct virtio_net_data_ll *dev_ll;
2005         struct mbuf_table *tx_q;
2006         volatile struct lcore_ll_info *lcore_ll;
2007         const uint64_t drain_tsc
2008                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2009                 * BURST_TX_DRAIN_US;
2010         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2011         unsigned ret;
2012         const uint16_t lcore_id = rte_lcore_id();
2013         uint16_t count_in_ring, rx_count = 0;
2014
2015         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2016
2017         lcore_ll = lcore_info[lcore_id].lcore_ll;
2018         prev_tsc = 0;
2019
2020         while (1) {
2021                 cur_tsc = rte_rdtsc();
2022
2023                 /* TX burst queue drain */
2024                 diff_tsc = cur_tsc - prev_tsc;
2025                 if (unlikely(diff_tsc > drain_tsc)) {
2026                         /*
2027                          * Get mbuf from vpool.pool and detach mbuf and
2028                          * put back into vpool.ring.
2029                          */
2030                         dev_ll = lcore_ll->ll_root_used;
2031                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2032                                 /* Get virtio device ID */
2033                                 vdev = dev_ll->vdev;
2034                                 dev = vdev->dev;
2035
2036                                 if (likely(!vdev->remove)) {
2037                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2038                                         if (tx_q->len) {
2039                                                 LOG_DEBUG(VHOST_DATA,
2040                                                 "TX queue drained after timeout"
2041                                                 " with burst size %u\n",
2042                                                 tx_q->len);
2043
2044                                                 /*
2045                                                  * Tx any packets in the queue
2046                                                  */
2047                                                 ret = rte_eth_tx_burst(
2048                                                         ports[0],
2049                                                         (uint16_t)tx_q->txq_id,
2050                                                         (struct rte_mbuf **)
2051                                                         tx_q->m_table,
2052                                                         (uint16_t)tx_q->len);
2053                                                 if (unlikely(ret < tx_q->len)) {
2054                                                         do {
2055                                                                 rte_pktmbuf_free(
2056                                                                         tx_q->m_table[ret]);
2057                                                         } while (++ret < tx_q->len);
2058                                                 }
2059                                                 tx_q->len = 0;
2060
2061                                                 txmbuf_clean_zcp(dev,
2062                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2063                                         }
2064                                 }
2065                                 dev_ll = dev_ll->next;
2066                         }
2067                         prev_tsc = cur_tsc;
2068                 }
2069
2070                 rte_prefetch0(lcore_ll->ll_root_used);
2071
2072                 /*
2073                  * Inform the configuration core that we have exited the linked
2074                  * list and that no devices are in use if requested.
2075                  */
2076                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2077                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2078
2079                 /* Process devices */
2080                 dev_ll = lcore_ll->ll_root_used;
2081
2082                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2083                         vdev = dev_ll->vdev;
2084                         dev  = vdev->dev;
2085                         if (unlikely(vdev->remove)) {
2086                                 dev_ll = dev_ll->next;
2087                                 unlink_vmdq(vdev);
2088                                 vdev->ready = DEVICE_SAFE_REMOVE;
2089                                 continue;
2090                         }
2091
2092                         if (likely(vdev->ready == DEVICE_RX)) {
2093                                 uint32_t index = vdev->vmdq_rx_q;
2094                                 uint16_t i;
2095                                 count_in_ring =
2096                                         rte_ring_count(vpool_array[index].ring);
2097                                 uint16_t free_entries =
2098                                         (uint16_t)get_available_ring_num_zcp(dev);
2099
2100                                 /*
2101                                  * Attach all mbufs in vpool.ring and put back
2102                                  * into vpool.pool.
2103                                  */
2104                                 for (i = 0;
2105                                         i < RTE_MIN(free_entries,
2106                                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2107                                         i++)
2108                                         attach_rxmbuf_zcp(dev);
2109
2110                                 /* Handle guest RX */
2111                                 rx_count = rte_eth_rx_burst(ports[0],
2112                                         vdev->vmdq_rx_q, pkts_burst,
2113                                         MAX_PKT_BURST);
2114
2115                                 if (rx_count) {
2116                                         ret_count = virtio_dev_rx_zcp(dev,
2117                                                         pkts_burst, rx_count);
2118                                         if (enable_stats) {
2119                                                 dev_statistics[dev->device_fh].rx_total
2120                                                         += rx_count;
2121                                                 dev_statistics[dev->device_fh].rx
2122                                                         += ret_count;
2123                                         }
2124                                         while (likely(rx_count)) {
2125                                                 rx_count--;
2126                                                 pktmbuf_detach_zcp(
2127                                                         pkts_burst[rx_count]);
2128                                                 rte_ring_sp_enqueue(
2129                                                         vpool_array[index].ring,
2130                                                         (void *)pkts_burst[rx_count]);
2131                                         }
2132                                 }
2133                         }
2134
2135                         if (likely(!vdev->remove))
2136                                 /* Handle guest TX */
2137                                 virtio_dev_tx_zcp(dev);
2138
2139                         /* Move to the next device in the list */
2140                         dev_ll = dev_ll->next;
2141                 }
2142         }
2143
2144         return 0;
2145 }
2146
2147
2148 /*
2149  * Add an entry to a used linked list. A free entry must first be found
2150  * in the free linked list using get_data_ll_free_entry();
2151  */
2152 static void
2153 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2154         struct virtio_net_data_ll *ll_dev)
2155 {
2156         struct virtio_net_data_ll *ll = *ll_root_addr;
2157
2158         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2159         ll_dev->next = NULL;
2160         rte_compiler_barrier();
2161
2162         /* If ll == NULL then this is the first device. */
2163         if (ll) {
2164                 /* Increment to the tail of the linked list. */
2165                 while (ll->next != NULL)
2166                         ll = ll->next;
2167
2168                 ll->next = ll_dev;
2169         } else {
2170                 *ll_root_addr = ll_dev;
2171         }
2172 }
2173
2174 /*
2175  * Remove an entry from a used linked list. The entry must then be added to
2176  * the free linked list using put_data_ll_free_entry().
2177  */
2178 static void
2179 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2180         struct virtio_net_data_ll *ll_dev,
2181         struct virtio_net_data_ll *ll_dev_last)
2182 {
2183         struct virtio_net_data_ll *ll = *ll_root_addr;
2184
2185         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2186                 return;
2187
2188         if (ll_dev == ll)
2189                 *ll_root_addr = ll_dev->next;
2190         else
2191                 if (likely(ll_dev_last != NULL))
2192                         ll_dev_last->next = ll_dev->next;
2193                 else
2194                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2195 }
2196
2197 /*
2198  * Find and return an entry from the free linked list.
2199  */
2200 static struct virtio_net_data_ll *
2201 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2202 {
2203         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2204         struct virtio_net_data_ll *ll_dev;
2205
2206         if (ll_free == NULL)
2207                 return NULL;
2208
2209         ll_dev = ll_free;
2210         *ll_root_addr = ll_free->next;
2211
2212         return ll_dev;
2213 }
2214
2215 /*
2216  * Place an entry back on to the free linked list.
2217  */
2218 static void
2219 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2220         struct virtio_net_data_ll *ll_dev)
2221 {
2222         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2223
2224         if (ll_dev == NULL)
2225                 return;
2226
2227         ll_dev->next = ll_free;
2228         *ll_root_addr = ll_dev;
2229 }
2230
2231 /*
2232  * Creates a linked list of a given size.
2233  */
2234 static struct virtio_net_data_ll *
2235 alloc_data_ll(uint32_t size)
2236 {
2237         struct virtio_net_data_ll *ll_new;
2238         uint32_t i;
2239
2240         /* Malloc and then chain the linked list. */
2241         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2242         if (ll_new == NULL) {
2243                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2244                 return NULL;
2245         }
2246
2247         for (i = 0; i < size - 1; i++) {
2248                 ll_new[i].vdev = NULL;
2249                 ll_new[i].next = &ll_new[i+1];
2250         }
2251         ll_new[i].next = NULL;
2252
2253         return (ll_new);
2254 }
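/*
 * Typical life cycle of an entry (a sketch; 'lcore_ll', 'vdev' and
 * 'prev_entry' stand in for the caller's context):
 *
 *	struct virtio_net_data_ll *entry;
 *
 *	entry = get_data_ll_free_entry(&lcore_ll->ll_root_free);
 *	if (entry != NULL) {
 *		entry->vdev = vdev;
 *		add_data_ll_entry(&lcore_ll->ll_root_used, entry);
 *	}
 *	...
 *	rm_data_ll_entry(&lcore_ll->ll_root_used, entry, prev_entry);
 *	put_data_ll_free_entry(&lcore_ll->ll_root_free, entry);
 */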
2255
2256 /*
2257  * Create the main linked list along with each individual core's linked list.
2258  * A used and a free list are created to manage entries.
2259  */
2260 static int
2261 init_data_ll (void)
2262 {
2263         int lcore;
2264
2265         RTE_LCORE_FOREACH_SLAVE(lcore) {
2266                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2267                 if (lcore_info[lcore].lcore_ll == NULL) {
2268                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2269                         return -1;
2270                 }
2271
2272                 lcore_info[lcore].lcore_ll->device_num = 0;
2273                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2274                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2275                 if (num_devices % num_switching_cores)
2276                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2277                 else
2278                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2279         }
2280
2281         /* Allocate devices up to a maximum of MAX_DEVICES. */
2282         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2283
2284         return 0;
2285 }
2286
2287 /*
2288  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2289  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2290  * of dev->remove=1, which could otherwise cause an infinite loop in the rte_pause loop.
2291  */
2292 static void
2293 destroy_device (volatile struct virtio_net *dev)
2294 {
2295         struct virtio_net_data_ll *ll_lcore_dev_cur;
2296         struct virtio_net_data_ll *ll_main_dev_cur;
2297         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2298         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2299         struct vhost_dev *vdev;
2300         int lcore;
2301
2302         dev->flags &= ~VIRTIO_DEV_RUNNING;
2303
2304         vdev = (struct vhost_dev *)dev->priv;
2305         /* Set the remove flag. */
2306         vdev->remove = 1;
2307         while (vdev->ready != DEVICE_SAFE_REMOVE) {
2308                 rte_pause();
2309         }
2310
2311         /* Search for entry to be removed from lcore ll */
2312         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2313         while (ll_lcore_dev_cur != NULL) {
2314                 if (ll_lcore_dev_cur->vdev == vdev) {
2315                         break;
2316                 } else {
2317                         ll_lcore_dev_last = ll_lcore_dev_cur;
2318                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2319                 }
2320         }
2321
2322         if (ll_lcore_dev_cur == NULL) {
2323                 RTE_LOG(ERR, VHOST_CONFIG,
2324                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2325                         dev->device_fh);
2326                 return;
2327         }
2328
2329         /* Search for entry to be removed from main ll */
2330         ll_main_dev_cur = ll_root_used;
2331         ll_main_dev_last = NULL;
2332         while (ll_main_dev_cur != NULL) {
2333                 if (ll_main_dev_cur->vdev == vdev) {
2334                         break;
2335                 } else {
2336                         ll_main_dev_last = ll_main_dev_cur;
2337                         ll_main_dev_cur = ll_main_dev_cur->next;
2338                 }
2339         }
2340
2341         /* Remove entries from the lcore and main ll. */
2342         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2343         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2344
2345         /* Set the dev_removal_flag on each lcore. */
2346         RTE_LCORE_FOREACH_SLAVE(lcore) {
2347                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2348         }
2349
2350         /*
2351          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2352          * they can no longer access the device removed from the linked lists and that the devices
2353          * are no longer in use.
2354          */
2355         RTE_LCORE_FOREACH_SLAVE(lcore) {
2356                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2357                         rte_pause();
2358                 }
2359         }
2360
2361         /* Add the entries back to the lcore and main free ll.*/
2362         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2363         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2364
2365         /* Decrement number of device on the lcore. */
2366         lcore_info[vdev->coreid].lcore_ll->device_num--;
2367
2368         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2369
2370         if (zero_copy) {
2371                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2372
2373                 /* Stop the RX queue. */
2374                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2375                         LOG_DEBUG(VHOST_CONFIG,
2376                                 "(%"PRIu64") In destroy_device: Failed to stop "
2377                                 "rx queue:%d\n",
2378                                 dev->device_fh,
2379                                 vdev->vmdq_rx_q);
2380                 }
2381
2382                 LOG_DEBUG(VHOST_CONFIG,
2383                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2384                         "mempool back to ring for RX queue: %d\n",
2385                         dev->device_fh, vdev->vmdq_rx_q);
2386
2387                 mbuf_destroy_zcp(vpool);
2388
2389                 /* Stop the TX queue. */
2390                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2391                         LOG_DEBUG(VHOST_CONFIG,
2392                                 "(%"PRIu64") In destroy_device: Failed to "
2393                                 "stop tx queue:%d\n",
2394                                 dev->device_fh, vdev->vmdq_rx_q);
2395                 }
2396
2397                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2398
2399                 LOG_DEBUG(VHOST_CONFIG,
2400                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2401                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2402                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2403                         dev->device_fh);
2404
2405                 mbuf_destroy_zcp(vpool);
2406                 rte_free(vdev->regions_hpa);
2407         }
2408         rte_free(vdev);
2409
2410 }
2411
2412 /*
2413  * Calculate the number of physically contiguous regions within one particular
2414  * region whose vhost virtual address range is contiguous. The region starts
2415  * at vva_start, with a size of 'size' bytes.
2416  */
2417 static uint32_t
2418 check_hpa_regions(uint64_t vva_start, uint64_t size)
2419 {
2420         uint32_t i, nregions = 0, page_size = getpagesize();
2421         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2422         if (vva_start % page_size) {
2423                 LOG_DEBUG(VHOST_CONFIG,
2424                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2425                         "has remainder\n",
2426                         (void *)(uintptr_t)vva_start, page_size);
2427                 return 0;
2428         }
2429         if (size % page_size) {
2430                 LOG_DEBUG(VHOST_CONFIG,
2431                         "in check_hpa_regions: "
2432                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2433                         size, page_size);
2434                 return 0;
2435         }
2436         for (i = 0; i < size - page_size; i = i + page_size) {
2437                 cur_phys_addr
2438                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2439                 next_phys_addr = rte_mem_virt2phy(
2440                         (void *)(uintptr_t)(vva_start + i + page_size));
2441                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2442                         ++nregions;
2443                         LOG_DEBUG(VHOST_CONFIG,
2444                                 "in check_hpa_regions: hva addr:(%p) is not "
2445                                 "continuous with hva addr:(%p), diff:%d\n",
2446                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2447                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2448                                 + page_size), page_size);
2449                         LOG_DEBUG(VHOST_CONFIG,
2450                                 "in check_hpa_regions: hpa addr:(%p) is not "
2451                                 "continuous with hpa addr:(%p), "
2452                                 "diff:(%"PRIu64")\n",
2453                                 (void *)(uintptr_t)cur_phys_addr,
2454                                 (void *)(uintptr_t)next_phys_addr,
2455                                 (next_phys_addr-cur_phys_addr));
2456                 }
2457         }
2458         return nregions;
2459 }
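/*
 * Worked example (illustrative, 4 KiB pages): a 16 KiB range whose pages
 * map to host physical pages {P, P + 4K, Q, Q + 4K} has one discontinuity
 * (between P + 4K and Q), so the function above returns 1.
 */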
2460
2461 /*
2462  * Divide each region whose vhost virtual address is contiguous into a few
2463  * sub-regions, making sure the physical addresses within each sub-region are
2464  * contiguous, and fill the offset (to GPA) and size of each sub-region into regions_hpa.
2465  * sub-region into regions_hpa.
2466  */
2467 static uint32_t
2468 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2469 {
2470         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2471         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2472
2473         if (mem_region_hpa == NULL)
2474                 return 0;
2475
2476         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2477                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2478                         virtio_memory->regions[regionidx].address_offset;
2479                 mem_region_hpa[regionidx_hpa].guest_phys_address
2480                         = virtio_memory->regions[regionidx].guest_phys_address;
2481                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2482                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2483                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2484                 LOG_DEBUG(VHOST_CONFIG,
2485                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2486                         regionidx_hpa,
2487                         (void *)(uintptr_t)
2488                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2489                 LOG_DEBUG(VHOST_CONFIG,
2490                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2491                         regionidx_hpa,
2492                         (void *)(uintptr_t)
2493                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2494                 for (i = 0, k = 0;
2495                         i < virtio_memory->regions[regionidx].memory_size -
2496                                 page_size;
2497                         i += page_size) {
2498                         cur_phys_addr = rte_mem_virt2phy(
2499                                         (void *)(uintptr_t)(vva_start + i));
2500                         next_phys_addr = rte_mem_virt2phy(
2501                                         (void *)(uintptr_t)(vva_start +
2502                                         i + page_size));
2503                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2504                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2505                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2506                                         k + page_size;
2507                                 mem_region_hpa[regionidx_hpa].memory_size
2508                                         = k + page_size;
2509                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2510                                         "phys addr end  [%d]:(%p)\n",
2511                                         regionidx_hpa,
2512                                         (void *)(uintptr_t)
2513                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2514                                 LOG_DEBUG(VHOST_CONFIG,
2515                                         "in fill_hpa_regions: guest phys addr "
2516                                         "size [%d]:(%p)\n",
2517                                         regionidx_hpa,
2518                                         (void *)(uintptr_t)
2519                                         (mem_region_hpa[regionidx_hpa].memory_size));
2520                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2521                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2522                                 ++regionidx_hpa;
2523                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2524                                         next_phys_addr -
2525                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2526                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2527                                         " phys addr start[%d]:(%p)\n",
2528                                         regionidx_hpa,
2529                                         (void *)(uintptr_t)
2530                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2531                                 LOG_DEBUG(VHOST_CONFIG,
2532                                         "in fill_hpa_regions: host  phys addr "
2533                                         "start[%d]:(%p)\n",
2534                                         regionidx_hpa,
2535                                         (void *)(uintptr_t)
2536                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2537                                 k = 0;
2538                         } else {
2539                                 k += page_size;
2540                         }
2541                 }
2542                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2543                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2544                         + k + page_size;
2545                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2546                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2547                         "[%d]:(%p)\n", regionidx_hpa,
2548                         (void *)(uintptr_t)
2549                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2550                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2551                         "[%d]:(%p)\n", regionidx_hpa,
2552                         (void *)(uintptr_t)
2553                         (mem_region_hpa[regionidx_hpa].memory_size));
2554                 ++regionidx_hpa;
2555         }
2556         return regionidx_hpa;
2557 }
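
/*
 * Illustration only (added sketch, not part of the original example): how the
 * sub-region table built by fill_hpa_memory_regions() above could be consumed
 * to translate a guest physical address into a host physical address. The
 * helper name and the linear scan are assumptions made for clarity; the real
 * datapath code performs the equivalent lookup.
 */
static inline uint64_t
gpa_to_hpa_sketch(struct virtio_memory_regions_hpa *regions_hpa,
        uint32_t nregions, uint64_t guest_pa)
{
        uint32_t i;

        for (i = 0; i < nregions; i++) {
                /* Each sub-region covers [guest_phys_address,
                 * guest_phys_address_end) with one constant offset. */
                if (guest_pa >= regions_hpa[i].guest_phys_address &&
                        guest_pa < regions_hpa[i].guest_phys_address_end)
                        return guest_pa +
                                regions_hpa[i].host_phys_addr_offset;
        }
        return 0; /* no matching sub-region */
}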
2558
2559 /*
2560  * A new device is added to a data core. First the device is added to the main linked list
2561  * and then allocated to a specific data core.
2562  */
2563 static int
2564 new_device(struct virtio_net *dev)
2565 {
2566         struct virtio_net_data_ll *ll_dev;
2567         int lcore, core_add = 0;
2568         uint32_t device_num_min = num_devices;
2569         struct vhost_dev *vdev;
2570         uint32_t regionidx;
2571
2572         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2573         if (vdev == NULL) {
2574                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2575                         dev->device_fh);
2576                 return -1;
2577         }
2578         vdev->dev = dev;
2579         dev->priv = vdev;
2580
2581         if (zero_copy) {
2582                 vdev->nregions_hpa = dev->mem->nregions;
2583                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2584                         vdev->nregions_hpa
2585                                 += check_hpa_regions(
2586                                         dev->mem->regions[regionidx].guest_phys_address
2587                                         + dev->mem->regions[regionidx].address_offset,
2588                                         dev->mem->regions[regionidx].memory_size);
2589
2590                 }
2591
2592                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2593                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2594                         RTE_CACHE_LINE_SIZE);
2595                 if (vdev->regions_hpa == NULL) {
2596                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2597                         rte_free(vdev);
2598                         return -1;
2599                 }
2600
2601
2602                 if (fill_hpa_memory_regions(
2603                         vdev->regions_hpa, dev->mem
2604                         ) != vdev->nregions_hpa) {
2605
2606                         RTE_LOG(ERR, VHOST_CONFIG,
2607                                 "hpa memory regions number mismatch: "
2608                                 "[%d]\n", vdev->nregions_hpa);
2609                         rte_free(vdev->regions_hpa);
2610                         rte_free(vdev);
2611                         return -1;
2612                 }
2613         }
2614
2615
2616         /* Add device to main ll */
2617         ll_dev = get_data_ll_free_entry(&ll_root_free);
2618         if (ll_dev == NULL) {
2619                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2620                         "of %d devices per core has been reached\n",
2621                         dev->device_fh, num_devices);
2622                 if (vdev->regions_hpa)
2623                         rte_free(vdev->regions_hpa);
2624                 rte_free(vdev);
2625                 return -1;
2626         }
2627         ll_dev->vdev = vdev;
2628         add_data_ll_entry(&ll_root_used, ll_dev);
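        /*
         * Note (added comment): each device gets its own VMDq RX queue,
         * assigned in device order starting from the port's first VMDq queue.
         */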
2629         vdev->vmdq_rx_q
2630                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2631
2632         if (zero_copy) {
2633                 uint32_t index = vdev->vmdq_rx_q;
2634                 uint32_t count_in_ring, i;
2635                 struct mbuf_table *tx_q;
2636
2637                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2638
2639                 LOG_DEBUG(VHOST_CONFIG,
2640                         "(%"PRIu64") in new_device: mbuf count in mempool "
2641                         "before attach is: %d\n",
2642                         dev->device_fh,
2643                         rte_mempool_count(vpool_array[index].pool));
2644                 LOG_DEBUG(VHOST_CONFIG,
2645                         "(%"PRIu64") in new_device: mbuf count in ring "
2646                         "before attach is: %d\n",
2647                         dev->device_fh, count_in_ring);
2648
2649                 /*
2650                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2651                  */
2652                 for (i = 0; i < count_in_ring; i++)
2653                         attach_rxmbuf_zcp(dev);
2654
2655                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2656                         "mempool after attach is: %d\n",
2657                         dev->device_fh,
2658                         rte_mempool_count(vpool_array[index].pool));
2659                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2660                         "ring after attach is: %d\n",
2661                         dev->device_fh,
2662                         rte_ring_count(vpool_array[index].ring));
2663
2664                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2665                 tx_q->txq_id = vdev->vmdq_rx_q;
2666
2667                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2668                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2669
2670                         LOG_DEBUG(VHOST_CONFIG,
2671                                 "(%"PRIu64") In new_device: Failed to start "
2672                                 "tx queue:%d\n",
2673                                 dev->device_fh, vdev->vmdq_rx_q);
2674
2675                         mbuf_destroy_zcp(vpool);
2676                         rte_free(vdev->regions_hpa);
2677                         rte_free(vdev);
2678                         return -1;
2679                 }
2680
2681                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2682                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2683
2684                         LOG_DEBUG(VHOST_CONFIG,
2685                                 "(%"PRIu64") In new_device: Failed to start "
2686                                 "rx queue:%d\n",
2687                                 dev->device_fh, vdev->vmdq_rx_q);
2688
2689                         /* Stop the TX queue. */
2690                         if (rte_eth_dev_tx_queue_stop(ports[0],
2691                                 vdev->vmdq_rx_q) != 0) {
2692                                 LOG_DEBUG(VHOST_CONFIG,
2693                                         "(%"PRIu64") In new_device: Failed to "
2694                                         "stop tx queue:%d\n",
2695                                         dev->device_fh, vdev->vmdq_rx_q);
2696                         }
2697
2698                         mbuf_destroy_zcp(vpool);
2699                         rte_free(vdev->regions_hpa);
2700                         rte_free(vdev);
2701                         return -1;
2702                 }
2703
2704         }
2705
2706         /* Reset ready flag. */
2707         vdev->ready = DEVICE_MAC_LEARNING;
2708         vdev->remove = 0;
2709
2710         /* Find the least-busy lcore (fewest devices) to add the device to. */
2711         RTE_LCORE_FOREACH_SLAVE(lcore) {
2712                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2713                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2714                         core_add = lcore;
2715                 }
2716         }
2717         /* Add device to lcore ll */
2718         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2719         if (ll_dev == NULL) {
2720                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2721                 vdev->ready = DEVICE_SAFE_REMOVE;
2722                 destroy_device(dev);
2723                 if (vdev->regions_hpa)
2724                         rte_free(vdev->regions_hpa);
2725                 rte_free(vdev);
2726                 return -1;
2727         }
2728         ll_dev->vdev = vdev;
2729         vdev->coreid = core_add;
2730
2731         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2732
2733         /* Initialize device stats */
2734         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2735
2736         /* Disable notifications. */
2737         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2738         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2739         lcore_info[vdev->coreid].lcore_ll->device_num++;
2740         dev->flags |= VIRTIO_DEV_RUNNING;
2741
2742         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2743
2744         return 0;
2745 }
2746
2747 /*
2748  * These callbacks allow devices to be added to the data core when
2749  * configuration has fully completed.
2750  */
2751 static const struct virtio_net_device_ops virtio_net_device_ops =
2752 {
2753         .new_device =  new_device,
2754         .destroy_device = destroy_device,
2755 };
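/*
 * Note (added comment): this ops table is handed to the vhost library in
 * main() via rte_vhost_driver_callback_register(&virtio_net_device_ops).
 */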
2756
2757 /*
2758  * This thread wakes up after a set period to print statistics if the user
2759  * has enabled them.
2760  */
2761 static void
2762 print_stats(void)
2763 {
2764         struct virtio_net_data_ll *dev_ll;
2765         uint64_t tx_dropped, rx_dropped;
2766         uint64_t tx, tx_total, rx, rx_total;
2767         uint32_t device_fh;
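        /* ANSI escape sequences: ESC[2J clears the screen and ESC[1;1H moves
         * the cursor to the top-left corner; 27 is the ESC character. */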
2768         const char clr[] = { 27, '[', '2', 'J', '\0' };
2769         const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };
2770
2771         while (1) {
2772                 sleep(enable_stats);
2773
2774                 /* Clear screen and move to top left */
2775                 printf("%s%s", clr, top_left);
2776
2777                 printf("\nDevice statistics ====================================");
2778
2779                 dev_ll = ll_root_used;
2780                 while (dev_ll != NULL) {
2781                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2782                         tx_total = dev_statistics[device_fh].tx_total;
2783                         tx = dev_statistics[device_fh].tx;
2784                         tx_dropped = tx_total - tx;
2785                         if (zero_copy == 0) {
2786                                 rx_total = rte_atomic64_read(
2787                                         &dev_statistics[device_fh].rx_total_atomic);
2788                                 rx = rte_atomic64_read(
2789                                         &dev_statistics[device_fh].rx_atomic);
2790                         } else {
2791                                 rx_total = dev_statistics[device_fh].rx_total;
2792                                 rx = dev_statistics[device_fh].rx;
2793                         }
2794                         rx_dropped = rx_total - rx;
2795
2796                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2797                                         "\nTX total:            %"PRIu64""
2798                                         "\nTX dropped:          %"PRIu64""
2799                                         "\nTX successful:               %"PRIu64""
2800                                         "\nRX total:            %"PRIu64""
2801                                         "\nRX dropped:          %"PRIu64""
2802                                         "\nRX successful:               %"PRIu64"",
2803                                         device_fh,
2804                                         tx_total,
2805                                         tx_dropped,
2806                                         tx,
2807                                         rx_total,
2808                                         rx_dropped,
2809                                         rx);
2810
2811                         dev_ll = dev_ll->next;
2812                 }
2813                 printf("\n======================================================\n");
2814         }
2815 }
2816
2817 static void
2818 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2819         char *ring_name, uint32_t nb_mbuf)
2820 {
2821         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2822         vpool_array[index].pool
2823                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2824                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2825                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2826                 rte_pktmbuf_init, NULL, socket, 0);
2827         if (vpool_array[index].pool != NULL) {
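                /*
                 * Note (added comment): rte_ring_create() requires a
                 * power-of-two count, and such a ring holds at most
                 * count - 1 entries, hence rte_align32pow2(nb_mbuf + 1).
                 */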
2828                 vpool_array[index].ring
2829                         = rte_ring_create(ring_name,
2830                                 rte_align32pow2(nb_mbuf + 1),
2831                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2832                 if (likely(vpool_array[index].ring != NULL)) {
2833                         LOG_DEBUG(VHOST_CONFIG,
2834                                 "in setup_mempool_tbl: mbuf count in "
2835                                 "mempool is: %d\n",
2836                                 rte_mempool_count(vpool_array[index].pool));
2837                         LOG_DEBUG(VHOST_CONFIG,
2838                                 "in setup_mempool_tbl: mbuf count in "
2839                                 "ring   is: %d\n",
2840                                 rte_ring_count(vpool_array[index].ring));
2841                 } else {
2842                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2843                                 ring_name);
2844                 }
2845
2846                 /* Need to account for the headroom. */
2847                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2848         } else {
2849                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2850         }
2851 }
2852
2853
2854 /*
2855  * Main function: performs initialisation and calls the per-lcore functions. The CUSE
2856  * device is also registered here to handle the IOCTLs.
2857  */
2858 int
2859 main(int argc, char *argv[])
2860 {
2861         struct rte_mempool *mbuf_pool = NULL;
2862         unsigned lcore_id, core_id = 0;
2863         unsigned nb_ports, valid_num_ports;
2864         int ret;
2865         uint8_t portid;
2866         uint16_t queue_id;
2867         static pthread_t tid;
2868
2869         /* init EAL */
2870         ret = rte_eal_init(argc, argv);
2871         if (ret < 0)
2872                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2873         argc -= ret;
2874         argv += ret;
2875
2876         /* parse app arguments */
2877         ret = us_vhost_parse_args(argc, argv);
2878         if (ret < 0)
2879                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2880
2881         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2882                 if (rte_lcore_is_enabled(lcore_id))
2883                         lcore_ids[core_id++] = lcore_id;
2884
2885         if (rte_lcore_count() > RTE_MAX_LCORE)
2886                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2887
2888         /* Set the number of switching cores available. */
2889         num_switching_cores = rte_lcore_count()-1;
2890
2891         /* Get the number of physical ports. */
2892         nb_ports = rte_eth_dev_count();
2893         if (nb_ports > RTE_MAX_ETHPORTS)
2894                 nb_ports = RTE_MAX_ETHPORTS;
2895
2896         /*
2897          * Update the global variable num_ports and the global array ports[],
2898          * and derive valid_num_ports from the number of ports in the system.
2899          */
2900         valid_num_ports = check_ports_num(nb_ports);
2901
2902         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2903                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2904                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2905                 return -1;
2906         }
2907
2908         if (zero_copy == 0) {
2909                 /* Create the mbuf pool. */
2910                 mbuf_pool = rte_mempool_create(
2911                                 "MBUF_POOL",
2912                                 NUM_MBUFS_PER_PORT
2913                                 * valid_num_ports,
2914                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2915                                 sizeof(struct rte_pktmbuf_pool_private),
2916                                 rte_pktmbuf_pool_init, NULL,
2917                                 rte_pktmbuf_init, NULL,
2918                                 rte_socket_id(), 0);
2919                 if (mbuf_pool == NULL)
2920                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2921
2922                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2923                         vpool_array[queue_id].pool = mbuf_pool;
2924
2925                 if (vm2vm_mode == VM2VM_HARDWARE) {
2926                         /* Enable VT loop back to let L2 switch to do it. */
2927                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2928                         LOG_DEBUG(VHOST_CONFIG,
2929                                 "Enable loop back for L2 switch in vmdq.\n");
2930                 }
2931         } else {
2932                 uint32_t nb_mbuf;
2933                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2934                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2935
2936                 nb_mbuf = num_rx_descriptor
2937                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2938                         + num_switching_cores * MAX_PKT_BURST;
2939
2940                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2941                         snprintf(pool_name, sizeof(pool_name),
2942                                 "rxmbuf_pool_%u", queue_id);
2943                         snprintf(ring_name, sizeof(ring_name),
2944                                 "rxmbuf_ring_%u", queue_id);
2945                         setup_mempool_tbl(rte_socket_id(), queue_id,
2946                                 pool_name, ring_name, nb_mbuf);
2947                 }
2948
2949                 nb_mbuf = num_tx_descriptor
2950                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2951                                 + num_switching_cores * MAX_PKT_BURST;
2952
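                /* The TX pools occupy the upper half of vpool_array, at
                 * index queue_id + MAX_QUEUES (see the setup call below). */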
2953                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2954                         snprintf(pool_name, sizeof(pool_name),
2955                                 "txmbuf_pool_%u", queue_id);
2956                         snprintf(ring_name, sizeof(ring_name),
2957                                 "txmbuf_ring_%u", queue_id);
2958                         setup_mempool_tbl(rte_socket_id(),
2959                                 (queue_id + MAX_QUEUES),
2960                                 pool_name, ring_name, nb_mbuf);
2961                 }
2962
2963                 if (vm2vm_mode == VM2VM_HARDWARE) {
2964                         /* Enable VT loop back to let L2 switch to do it. */
2965                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2966                         LOG_DEBUG(VHOST_CONFIG,
2967                                 "Enable loop back for L2 switch in vmdq.\n");
2968                 }
2969         }
2970         /* Set log level. */
2971         rte_set_log_level(LOG_LEVEL);
2972
2973         /* initialize all ports */
2974         for (portid = 0; portid < nb_ports; portid++) {
2975                 /* skip ports that are not enabled */
2976                 if ((enabled_port_mask & (1 << portid)) == 0) {
2977                         RTE_LOG(INFO, VHOST_PORT,
2978                                 "Skipping disabled port %d\n", portid);
2979                         continue;
2980                 }
2981                 if (port_init(portid) != 0)
2982                         rte_exit(EXIT_FAILURE,
2983                                 "Cannot initialize network ports\n");
2984         }
2985
2986         /* Initialise all linked lists. */
2987         if (init_data_ll() == -1)
2988                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2989
2990         /* Initialize device stats */
2991         memset(&dev_statistics, 0, sizeof(dev_statistics));
2992
2993         /* Enable stats if the user option is set. */
2994         if (enable_stats)
2995                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
2996
2997         /* Launch all data cores. */
2998         if (zero_copy == 0) {
2999                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3000                         rte_eal_remote_launch(switch_worker,
3001                                 mbuf_pool, lcore_id);
3002                 }
3003         } else {
3004                 uint32_t count_in_mempool, index, i;
3005                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3006                         /* For all RX and TX queues. */
3007                         count_in_mempool
3008                                 = rte_mempool_count(vpool_array[index].pool);
3009
3010                         /*
3011                          * Transfer all unattached mbufs from vpool.pool
3012                          * to vpool.ring.
3013                          */
3014                         for (i = 0; i < count_in_mempool; i++) {
3015                                 struct rte_mbuf *mbuf
3016                                         = __rte_mbuf_raw_alloc(
3017                                                 vpool_array[index].pool);
3018                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3019                                                 (void *)mbuf);
3020                         }
3021
3022                         LOG_DEBUG(VHOST_CONFIG,
3023                                 "in main: mbuf count in mempool at initial "
3024                                 "is: %d\n", count_in_mempool);
3025                         LOG_DEBUG(VHOST_CONFIG,
3026                                 "in main: mbuf count in ring at initial is:"
3027                                 " %d\n",
3028                                 rte_ring_count(vpool_array[index].ring));
3029                 }
3030
3031                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3032                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3033                                 lcore_id);
3034         }
3035
3036         if (mergeable == 0)
3037                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3038
3039         /* Register CUSE device to handle IOCTLs. */
3040         ret = rte_vhost_driver_register((char *)&dev_basename);
3041         if (ret != 0)
3042                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3043
3044         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3045
3046         /* Start CUSE session. */
3047         rte_vhost_driver_session_start();
3048         return 0;
3049
3050 }
3051