examples/vhost: rework duplicated code
examples/vhost/main.c (dpdk.git)
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port: enough for the RX
 * descriptors of every queue, plus a burst and a full TX descriptor ring
 * per switching core, plus per-core mempool cache slack.
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +       \
                (num_switching_cores * MAX_PKT_BURST) +                     \
                (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) +          \
                (num_switching_cores * MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation: the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
        + RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32      /* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15   /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4    /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros still need refining for the legacy and DPDK-based front
 * ends: take the maximum number of available vring descriptors/entries from
 * the guest minus MAX_PKT_BURST, then adjust to a power of 2.
 */
/*
 * For the legacy front end there are 128 descriptors:
 * half for the virtio headers, the other half for mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
                + sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
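
/*
 * The low 48 bits of a little-endian 64-bit load cover exactly the six
 * bytes of a MAC address, so ether_addr_cmp() below can compare two
 * addresses with a single XOR and this mask.
 */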

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
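
/*
 * struct vring_desc is 16 bytes, so with a typical 64-byte cache line this
 * evaluates to 4, letting the data path batch descriptors that share a line.
 */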

/* Mask of enabled ports. */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled. */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support. */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers then DMA directly to/from the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Number of descriptors to apply. */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
        struct rte_mempool *pool;
        struct rte_ring *ring;
        uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
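
/*
 * Two banks of MAX_QUEUES entries: port_init() below takes its RX mempools
 * from the first bank; the second bank appears to be reserved for the
 * zero-copy TX setup later in this file (outside this excerpt).
 */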

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
        PHYS_ADDR_CONTINUOUS = 0,
        PHYS_ADDR_CROSS_SUBREG = 1,
        PHYS_ADDR_INVALID = 2,
        PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
        .rx_thresh = {
                .pthresh = RX_PTHRESH,
                .hthresh = RX_HTHRESH,
                .wthresh = RX_WTHRESH,
        },
        .rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
        .tx_thresh = {
                .pthresh = TX_PTHRESH,
                .hthresh = TX_HTHRESH,
                .wthresh = TX_WTHRESH,
        },
        .tx_free_thresh = 0, /* Use PMD default values */
        .tx_rs_thresh = 0, /* Use PMD default values */
};

/* Empty vmdq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * Required for 1G NICs such as the I350: without it, IPv4
                 * forwarding in the guest cannot forward packets from one
                 * virtio device to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * Should be overridden separately in code with
                 * appropriate values.
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified on the command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
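
/*
 * Pool i of the VMDQ configuration is mapped to vlan_tags[i] (see
 * get_eth_conf()), and link_vmdq() assigns vlan_tags[device_fh] to each
 * guest device, so device 0 receives traffic tagged 1000, device 1
 * traffic tagged 1001, and so on.
 */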

/* Ethernet addresses of ports. */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
        unsigned char   h_dest[ETH_ALEN];
        unsigned char   h_source[ETH_ALEN];
        __be16          h_vlan_proto;
        __be16          h_vlan_TCI;
        __be16          h_vlan_encapsulated_proto;
};
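
/*
 * This mirrors the layout of an 802.1Q tagged Ethernet header:
 * h_vlan_proto holds the 0x8100 ethertype and h_vlan_TCI carries the
 * tag control information for the tag inserted on TX.
 */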

/* IPv4 Header */
struct ipv4_hdr {
        uint8_t  version_ihl;           /**< version and header length */
        uint8_t  type_of_service;       /**< type of service */
        uint16_t total_length;          /**< length of packet */
        uint16_t packet_id;             /**< packet ID */
        uint16_t fragment_offset;       /**< fragmentation offset */
        uint8_t  time_to_live;          /**< time to live */
        uint8_t  next_proto_id;         /**< protocol ID */
        uint16_t hdr_checksum;          /**< header checksum */
        uint32_t src_addr;              /**< source address */
        uint32_t dst_addr;              /**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
        uint64_t tx_total;
        rte_atomic64_t rx_total_atomic;
        uint64_t rx_total;
        uint64_t tx;
        rte_atomic64_t rx_atomic;
        uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back =
                vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}
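
/*
 * Example: with num_devices == 8 the loop in get_eth_conf() produces
 *   pool_map[0] = { .vlan_id = 1000, .pools = 0x01 }
 *   ...
 *   pool_map[7] = { .vlan_id = 1007, .pools = 0x80 }
 * i.e. each VLAN ID is steered to exactly one VMDQ pool.
 */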

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter.
 */
static inline int
port_init(uint8_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        uint16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /*
         * The max pool number from dev_info is used to validate the pool
         * number specified on the command line.
         */
        rte_eth_dev_info_get(port, &dev_info);

        /* Configure the number of supported virtio devices based on VMDQ limits. */
        num_devices = dev_info.max_vmdq_pools;
        num_queues = dev_info.max_rx_queues;

        if (zero_copy) {
                rx_ring_size = num_rx_descriptor;
                tx_ring_size = num_tx_descriptor;
                tx_rings = dev_info.max_tx_queues;
        } else {
                rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
                tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
                tx_rings = (uint16_t)rte_lcore_count();
        }

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;

        if (port >= rte_eth_dev_count())
                return -1;

        rx_rings = (uint16_t)num_queues;
        /* Configure ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0)
                return retval;

        /* Setup the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                rte_eth_dev_socket_id(port), &rx_conf_default,
                                vpool_array[q].pool);
                if (retval < 0)
                        return retval;
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                rte_eth_dev_socket_id(port), &tx_conf_default);
                if (retval < 0)
                        return retval;
        }

        /* Start the device. */
        retval = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
                return retval;
        }

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        (unsigned)port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}
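
/*
 * Note that port_init() sets up one RX queue per VMDQ queue reported by the
 * device and, in the non-zero-copy case, one TX queue per lcore, so each
 * switching core owns its own TX ring (see the tx_q->txq_id assignment in
 * switch_worker()).
 */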

/*
 * Set the character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
        /* Reject names that do not fit, including the terminating NUL. */
        if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
                return -1;

        snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

        return 0;
}

/*
 * Parse the portmask provided at run time.
 * Returns the mask, or 0 if the argument was invalid (0 is never a valid
 * mask, so the caller can simply test for it).
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return 0;

        return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage.
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --dev-basename <name>\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on rx are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on rx. Takes effect only if retries on rx are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --dev-basename: The basename to be used for the character device.\n"
        "               --zero-copy [0|1]: disable(default)/enable rx/tx "
                        "zero copy\n"
        "               --rx-desc-num [0-N]: the number of descriptors on rx, "
                        "used only when zero copy is enabled.\n"
        "               --tx-desc-num [0-N]: the number of descriptors on tx, "
                        "used only when zero copy is enabled.\n",
               prgname);
}
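
/*
 * Illustrative invocation (the EAL core mask/channel count and the option
 * values here are examples only):
 *   ./build/vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --stats 2
 */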

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"dev-basename", required_argument, NULL, 0},
                {"zero-copy", required_argument, NULL, 0},
                {"rx-desc-num", required_argument, NULL, 0},
                {"tx-desc-num", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        enabled_port_mask = parse_portmask(optarg);
                        if (enabled_port_mask == 0) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Specify the retry delay time (in microseconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        mergeable = !!ret;
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                        }
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set character device basename. */
                        if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_basename(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (max %d characters)\n", MAX_BASENAME_SZ - 1);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        /* Enable/disable rx/tx zero copy. */
                        if (!strncmp(long_option[option_index].name,
                                "zero-copy", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument"
                                                " for zero-copy [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        zero_copy = ret;

                                if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
                                        RTE_LOG(ERR, VHOST_CONFIG, "Before running "
                                        "zero copy vhost APP, please "
                                        "disable RTE_MBUF_REFCNT\n"
                                        "in config file and then rebuild DPDK "
                                        "core lib!\n"
                                        "Otherwise please disable zero copy "
                                        "flag in command line!\n");
                                        return -1;
#endif
                                }
                        }

                        /* Specify the descriptor number on RX. */
                        if (!strncmp(long_option[option_index].name,
                                "rx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for rx-desc-num [0-N], "
                                        "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_rx_descriptor = ret;
                                }
                        }

                        /* Specify the descriptor number on TX. */
                        if (!strncmp(long_option[option_index].name,
                                "tx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for tx-desc-num [0-N], "
                                        "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_tx_descriptor = ret;
                                }
                        }

                        break;

                /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = (uint8_t)i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support software vm2vm, "
                        "please specify 'vm2vm 2' to use hardware vm2vm.\n");
                return -1;
        }

        if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support jumbo frame, "
                        "please specify '--mergeable 0' to disable the "
                        "mergeable feature.\n");
                return -1;
        }

        return 0;
}

/*
 * Update the global var num_ports and array ports according to the number
 * of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
        char *pkt_addr = (char *)(addr); \
        unsigned int index; \
        char packet[MAX_PRINT_BUFF]; \
        \
        if ((header)) \
                snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
        else \
                snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
        for (index = 0; index < (size); index++) { \
                snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
                        "%02hhx ", pkt_addr[index]); \
        } \
        snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
        \
        LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
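
/*
 * The dump appends each byte as two hex digits; MAX_PRINT_BUFF bounds the
 * assembled line, so very large buffers are truncated rather than overrun.
 */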

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses. *addr_type reports
 * whether the buf_len-byte buffer is physically continuous within the
 * matched region (PHYS_ADDR_CONTINUOUS), crosses the region end
 * (PHYS_ADDR_CROSS_SUBREG), or was not found (PHYS_ADDR_INVALID).
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
        uint32_t buf_len, hpa_type *addr_type)
{
        struct virtio_memory_regions_hpa *region;
        uint32_t regionidx;
        uint64_t vhost_pa = 0;

        *addr_type = PHYS_ADDR_INVALID;

        for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
                region = &vdev->regions_hpa[regionidx];
                if ((guest_pa >= region->guest_phys_address) &&
                        (guest_pa <= region->guest_phys_address_end)) {
                        vhost_pa = region->host_phys_addr_offset + guest_pa;
                        if (likely((guest_pa + buf_len - 1)
                                <= region->guest_phys_address_end))
                                *addr_type = PHYS_ADDR_CONTINUOUS;
                        else
                                *addr_type = PHYS_ADDR_CROSS_SUBREG;
                        break;
                }
        }

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
                vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
                (void *)(uintptr_t)vhost_pa);

        return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
        return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct virtio_net_data_ll *dev_ll;
        struct virtio_net *dev = vdev->dev;
        int i, ret;

        /* Learn MAC address of guest device from packet */
        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
                        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
                        return -1;
                }
                dev_ll = dev_ll->next;
        }

        for (i = 0; i < ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* vlan_tag currently uses the device_id. */
        vdev->vlan_tag = vlan_tags[dev->device_fh];

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
                dev->device_fh,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
                                        dev->device_fh);

        /* Enable stripping of the vlan tag as we handle routing. */
        rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}
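
/*
 * link_vmdq() is triggered from switch_worker(): the first packet a guest
 * transmits while its device is still in DEVICE_MAC_LEARNING state supplies
 * the source MAC address that is registered here.
 */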

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* Clear MAC and VLAN settings. */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < 6; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers. */
                rx_count = rte_eth_rx_burst(ports[0],
                                (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue and return 0 (the caller
 * frees the mbuf); return -1 if the destination is not local.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct virtio_net_data_ll *dev_ll;
        struct ether_hdr *pkt_hdr;
        uint64_t ret = 0;
        struct virtio_net *dev = vdev->dev;
        struct virtio_net *tdev; /* destination virtio device */

        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        /* Get the used devices list. */
        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
                                          &dev_ll->vdev->mac_address)) {

                        /* Drop the packet if the TX packet is destined for the TX device. */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
                                                        dev->device_fh);
                                return 0;
                        }
                        tdev = dev_ll->vdev->dev;

                        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

                        if (unlikely(dev_ll->vdev->remove)) {
                                /* Drop the packet if the device is marked for removal. */
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
                        } else {
                                /* Send the packet to the local virtio device. */
                                ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
                                if (enable_stats) {
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_total_atomic,
                                        1);
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_atomic,
                                        ret);
                                        dev_statistics[tdev->device_fh].tx_total++;
                                        dev_statistics[tdev->device_fh].tx += ret;
                                }
                        }

                        return 0;
                }
                dev_ll = dev_ll->next;
        }

        return -1;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
        uint32_t *offset, uint16_t *vlan_tag)
{
        struct virtio_net_data_ll *dev_ll = ll_root_used;
        struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX)
                        && ether_addr_cmp(&(pkt_hdr->d_addr),
                                &dev_ll->vdev->mac_address)) {
                        /*
                         * Drop the packet if the TX packet is
                         * destined for the TX device.
                         */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                LOG_DEBUG(VHOST_DATA,
                                "(%"PRIu64") TX: Source and destination"
                                " MAC addresses are the same. Dropping "
                                "packet.\n",
                                dev_ll->vdev->dev->device_fh);
                                return -1;
                        }

                        /*
                         * HW vlan strip will reduce the packet length
                         * by the length of the vlan tag, so we need to
                         * restore the packet length by adding it back.
                         */
                        *offset = VLAN_HLEN;
                        *vlan_tag =
                        (uint16_t)
                        vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

                        LOG_DEBUG(VHOST_DATA,
                        "(%"PRIu64") TX: pkt to local VM device id:"
                        "(%"PRIu64") vlan tag: %u.\n",
                        dev->device_fh, dev_ll->vdev->dev->device_fh,
                        (unsigned int)*vlan_tag);

                        break;
                }
                dev_ll = dev_ll->next;
        }
        return 0;
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
        struct mbuf_table *tx_q;
        struct rte_mbuf **m_table;
        unsigned len, ret, offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct virtio_net *dev = vdev->dev;

        /* Check if the destination is a local VM. */
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
                rte_pktmbuf_free(m);
                return;
        }

        if (vm2vm_mode == VM2VM_HARDWARE) {
                if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
                        rte_pktmbuf_free(m);
                        return;
                }
        }

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

        /* Add packet to the port tx queue. */
        tx_q = &lcore_tx_queue[lcore_id];
        len = tx_q->len;

        m->ol_flags = PKT_TX_VLAN_PKT;

        m->data_len += offset;
        m->pkt_len += offset;

        m->vlan_tci = vlan_tag;

        tx_q->m_table[len] = m;
        len++;
        if (enable_stats) {
                dev_statistics[dev->device_fh].tx_total++;
                dev_statistics[dev->device_fh].tx++;
        }

        if (unlikely(len == MAX_PKT_BURST)) {
                m_table = (struct rte_mbuf **)tx_q->m_table;
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
                /* Free any buffers not handled by TX and update the port stats. */
                if (unlikely(ret < len)) {
                        do {
                                rte_pktmbuf_free(m_table[ret]);
                        } while (++ret < len);
                }

                len = 0;
        }

        tx_q->len = len;
        return;
}
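
/*
 * Packets queued by virtio_tx_route() are flushed in two ways: immediately
 * once MAX_PKT_BURST of them have accumulated, and periodically by the
 * drain timer at the top of switch_worker() so short bursts are not
 * stranded in the queue.
 */
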
1155 /*
1156  * This function is called by each data core. It handles all RX/TX registered with the
1157  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1158  * with all devices in the main linked list.
1159  */
1160 static int
1161 switch_worker(__attribute__((unused)) void *arg)
1162 {
1163         struct rte_mempool *mbuf_pool = arg;
1164         struct virtio_net *dev = NULL;
1165         struct vhost_dev *vdev = NULL;
1166         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1167         struct virtio_net_data_ll *dev_ll;
1168         struct mbuf_table *tx_q;
1169         volatile struct lcore_ll_info *lcore_ll;
1170         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1171         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1172         unsigned ret, i;
1173         const uint16_t lcore_id = rte_lcore_id();
1174         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1175         uint16_t rx_count = 0;
1176         uint16_t tx_count;
1177         uint32_t retry = 0;
1178
1179         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1180         lcore_ll = lcore_info[lcore_id].lcore_ll;
1181         prev_tsc = 0;
1182
1183         tx_q = &lcore_tx_queue[lcore_id];
1184         for (i = 0; i < num_cores; i ++) {
1185                 if (lcore_ids[i] == lcore_id) {
1186                         tx_q->txq_id = i;
1187                         break;
1188                 }
1189         }
1190
1191         while(1) {
1192                 cur_tsc = rte_rdtsc();
1193                 /*
1194                  * TX burst queue drain
1195                  */
1196                 diff_tsc = cur_tsc - prev_tsc;
1197                 if (unlikely(diff_tsc > drain_tsc)) {
1198
1199                         if (tx_q->len) {
1200                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1201
1202                                 /*Tx any packets in the queue*/
1203                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1204                                                                            (struct rte_mbuf **)tx_q->m_table,
1205                                                                            (uint16_t)tx_q->len);
1206                                 if (unlikely(ret < tx_q->len)) {
1207                                         do {
1208                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1209                                         } while (++ret < tx_q->len);
1210                                 }
1211
1212                                 tx_q->len = 0;
1213                         }
1214
1215                         prev_tsc = cur_tsc;
1216
1217                 }
1218
1219                 rte_prefetch0(lcore_ll->ll_root_used);
1220                 /*
1221                  * Inform the configuration core that we have exited the linked list and that no devices are
1222                  * in use if requested.
1223                  */
1224                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1225                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1226
1227                 /*
1228                  * Process devices
1229                  */
1230                 dev_ll = lcore_ll->ll_root_used;
1231
1232                 while (dev_ll != NULL) {
1233                         /*get virtio device ID*/
1234                         vdev = dev_ll->vdev;
1235                         dev = vdev->dev;
1236
1237                         if (unlikely(vdev->remove)) {
1238                                 dev_ll = dev_ll->next;
1239                                 unlink_vmdq(vdev);
1240                                 vdev->ready = DEVICE_SAFE_REMOVE;
1241                                 continue;
1242                         }
1243                         if (likely(vdev->ready == DEVICE_RX)) {
1244                                 /*Handle guest RX*/
1245                                 rx_count = rte_eth_rx_burst(ports[0],
1246                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1247
1248                                 if (rx_count) {
1249                                         /*
1250                                         * Retry is enabled and the queue is full then we wait and retry to avoid packet loss
1251                                         * Here MAX_PKT_BURST must be less than virtio queue size
1252                                         */
1253                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1254                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1255                                                         rte_delay_us(burst_rx_delay_time);
1256                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1257                                                                 break;
1258                                                 }
1259                                         }
1260                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1261                                         if (enable_stats) {
1262                                                 rte_atomic64_add(
1263                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1264                                                 rx_count);
1265                                                 rte_atomic64_add(
1266                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1267                                         }
1268                                         while (likely(rx_count)) {
1269                                                 rx_count--;
1270                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1271                                         }
1272
1273                                 }
1274                         }
1275
1276                         if (likely(!vdev->remove)) {
1277                                 /* Handle guest TX*/
1278                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1279                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1280                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1281                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1282                                                 while (tx_count--)
1283                                                         rte_pktmbuf_free(pkts_burst[tx_count]);
1284                                         }
1285                                 }
1286                                 while (tx_count)
1287                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1288                         }
1289
1290                         /* Move to the next device in the list */
1291                         dev_ll = dev_ll->next;
1292                 }
1293         }
1294
1295         return 0;
1296 }
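/*
 * The drain-and-free sequence above is duplicated in switch_worker_zcp()
 * and virtio_tx_route_zcp() below. A minimal sketch of a shared helper
 * the duplicated blocks could call (hypothetical, kept out of the build):
 */
#if 0
static inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	/* Hand the burst to the PMD; it may accept fewer than tx_q->len. */
	count = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
			(struct rte_mbuf **)tx_q->m_table, (uint16_t)tx_q->len);

	/* Free whatever the PMD did not accept, otherwise the mbufs leak. */
	while (count < tx_q->len)
		rte_pktmbuf_free(tx_q->m_table[count++]);

	tx_q->len = 0;
}
#endif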
1297
1298 /*
1299  * This function gets the number of available ring entries for zero copy RX.
1300  * Only one thread will call this function for a particular virtio device,
1301  * so it is designed as a non-thread-safe function.
1302  */
1303 static inline uint32_t __attribute__((always_inline))
1304 get_available_ring_num_zcp(struct virtio_net *dev)
1305 {
1306         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1307         uint16_t avail_idx;
1308
1309         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1310         return (uint32_t)(uint16_t)(avail_idx - vq->last_used_idx_res);
1311 }
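/*
 * The index arithmetic must wrap at 16 bits, mirroring the guest's
 * avail->idx counter. An illustrative standalone check (hypothetical
 * names, kept out of the build):
 */
#if 0
static inline uint32_t
ring_entries_wraparound_example(uint16_t avail_idx, uint16_t last_res_idx)
{
	/*
	 * E.g. avail_idx = 2 after wrapping and last_res_idx = 65534:
	 * (uint16_t)(2 - 65534) == 4, the true number of new entries.
	 * Without the uint16_t truncation, the promoted int result would
	 * be negative and the uint32_t cast would yield garbage.
	 */
	return (uint32_t)(uint16_t)(avail_idx - last_res_idx);
}
#endif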
1312
1313 /*
1314  * This function gets an available ring index for zero copy RX,
1315  * retrying up to 'burst_rx_retry_num' times until it gets enough indexes.
1316  * Only one thread will call this function for a particular virtio device,
1317  * so it is designed as a non-thread-safe function.
1318  */
1319 static inline uint32_t __attribute__((always_inline))
1320 get_available_ring_index_zcp(struct virtio_net *dev,
1321         uint16_t *res_base_idx, uint32_t count)
1322 {
1323         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1324         uint16_t avail_idx;
1325         uint32_t retry = 0;
1326         uint16_t free_entries;
1327
1328         *res_base_idx = vq->last_used_idx_res;
1329         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1330         free_entries = (avail_idx - *res_base_idx);
1331
1332         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1333                         "avail idx: %d, "
1334                         "res base idx:%d, free entries:%d\n",
1335                         dev->device_fh, avail_idx, *res_base_idx,
1336                         free_entries);
1337
1338         /*
1339          * If retry is enabled and the queue is full then we wait
1340          * and retry to avoid packet loss.
1341          */
1342         if (enable_retry && unlikely(count > free_entries)) {
1343                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1344                         rte_delay_us(burst_rx_delay_time);
1345                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1346                         free_entries = (avail_idx - *res_base_idx);
1347                         if (count <= free_entries)
1348                                 break;
1349                 }
1350         }
1351
1352         /* Check that we have enough buffers */
1353         if (unlikely(count > free_entries))
1354                 count = free_entries;
1355
1356         if (unlikely(count == 0)) {
1357                 LOG_DEBUG(VHOST_DATA,
1358                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1359                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1360                         dev->device_fh, avail_idx,
1361                         *res_base_idx, free_entries);
1362                 return 0;
1363         }
1364
1365         vq->last_used_idx_res = *res_base_idx + count;
1366
1367         return count;
1368 }
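/*
 * An illustrative caller of the reservation above (hypothetical, kept
 * out of the build); attach_rxmbuf_zcp() below follows the same pattern
 * for a single descriptor:
 */
#if 0
static int
reserve_one_rx_desc_example(struct virtio_net *dev, uint16_t *desc_idx)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t res_base_idx;

	if (get_available_ring_index_zcp(dev, &res_base_idx, 1) != 1)
		return -1;	/* nothing available, even after retries */

	/* Translate the reserved slot into a descriptor head index. */
	*desc_idx = vq->avail->ring[res_base_idx & (vq->size - 1)];
	return 0;
}
#endif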
1369
1370 /*
1371  * This function puts a descriptor back onto the used list.
1372  */
1373 static inline void __attribute__((always_inline))
1374 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1375 {
1376         uint16_t res_cur_idx = vq->last_used_idx;
1377         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1378         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1379         rte_compiler_barrier();
1380         *(volatile uint16_t *)&vq->used->idx += 1;
1381         vq->last_used_idx += 1;
1382
1383         /* Kick the guest if necessary. */
1384         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1385                 eventfd_write((int)vq->kickfd, 1);
1386 }
1387
1388 /*
1389  * This function gets an available descriptor from the virtio vring and an
1390  * unattached mbuf from vpool->ring, then attaches them together. It must
1391  * adjust the offsets for buff_addr and phys_addr according to the PMD
1392  * implementation, otherwise the frame data may land at the wrong location in the mbuf.
1393  */
1394 static inline void __attribute__((always_inline))
1395 attach_rxmbuf_zcp(struct virtio_net *dev)
1396 {
1397         uint16_t res_base_idx, desc_idx;
1398         uint64_t buff_addr, phys_addr;
1399         struct vhost_virtqueue *vq;
1400         struct vring_desc *desc;
1401         struct rte_mbuf *mbuf = NULL;
1402         struct vpool *vpool;
1403         hpa_type addr_type;
1404         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1405
1406         vpool = &vpool_array[vdev->vmdq_rx_q];
1407         vq = dev->virtqueue[VIRTIO_RXQ];
1408
1409         do {
1410                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1411                                 1) != 1))
1412                         return;
1413                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1414
1415                 desc = &vq->desc[desc_idx];
1416                 if (desc->flags & VRING_DESC_F_NEXT) {
1417                         desc = &vq->desc[desc->next];
1418                         buff_addr = gpa_to_vva(dev, desc->addr);
1419                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1420                                         &addr_type);
1421                 } else {
1422                         buff_addr = gpa_to_vva(dev,
1423                                         desc->addr + vq->vhost_hlen);
1424                         phys_addr = gpa_to_hpa(vdev,
1425                                         desc->addr + vq->vhost_hlen,
1426                                         desc->len, &addr_type);
1427                 }
1428
1429                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1430                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1431                                 " address found when attaching RX frame buffer"
1432                                 " address!\n", dev->device_fh);
1433                         put_desc_to_used_list_zcp(vq, desc_idx);
1434                         continue;
1435                 }
1436
1437                 /*
1438                  * Check if the frame buffer address from guest crosses
1439                  * sub-region or not.
1440                  */
1441                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1442                         RTE_LOG(ERR, VHOST_DATA,
1443                                 "(%"PRIu64") Frame buffer address crosses a "
1444                                 "sub-region boundary when attaching the RX "
1445                                 "frame buffer address!\n",
1446                                 dev->device_fh);
1447                         put_desc_to_used_list_zcp(vq, desc_idx);
1448                         continue;
1449                 }
1450         } while (unlikely(phys_addr == 0));
1451
1452         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1453         if (unlikely(mbuf == NULL)) {
1454                 LOG_DEBUG(VHOST_DATA,
1455                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1456                         "ring_sc_dequeue fail.\n",
1457                         dev->device_fh);
1458                 put_desc_to_used_list_zcp(vq, desc_idx);
1459                 return;
1460         }
1461
1462         if (unlikely(vpool->buf_size > desc->len)) {
1463                 LOG_DEBUG(VHOST_DATA,
1464                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1465                         "length (%d) of descriptor idx %d is less than the "
1466                         "required room size %d\n",
1467                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1468                 put_desc_to_used_list_zcp(vq, desc_idx);
1469                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1470                 return;
1471         }
1472
1473         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1474         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1475         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1476         mbuf->data_len = desc->len;
1477         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1478
1479         LOG_DEBUG(VHOST_DATA,
1480                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1481                 "descriptor idx:%d\n",
1482                 dev->device_fh, res_base_idx, desc_idx);
1483
1484         __rte_mbuf_raw_free(mbuf);
1485
1486         return;
1487 }
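/*
 * A note on the pointer math above: the PMD writes a received frame at
 * buf_addr + data_off, so with
 *
 *   buf_addr = guest_buffer - RTE_PKTMBUF_HEADROOM
 *   data_off = RTE_PKTMBUF_HEADROOM
 *
 * the frame lands exactly at the guest buffer (and likewise for the
 * physical address), which is what makes the zero copy work. The final
 * __rte_mbuf_raw_free() is not a leak: it returns the attached mbuf to
 * vpool->pool, from which the PMD RX path will allocate it.
 */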
1488
1489 /*
1490  * Detach an attached packet mbuf:
1491  *  - restore original mbuf address and length values.
1492  *  - reset pktmbuf data and data_len to their default values.
1493  *  All other fields of the given packet mbuf will be left intact.
1494  *
1495  * @param m
1496  *   The attached packet mbuf.
1497  */
1498 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1499 {
1500         const struct rte_mempool *mp = m->pool;
1501         void *buf = RTE_MBUF_TO_BADDR(m);
1502         uint32_t buf_ofs;
1503         uint32_t buf_len = mp->elt_size - sizeof(*m);
1504         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1505
1506         m->buf_addr = buf;
1507         m->buf_len = (uint16_t)buf_len;
1508
1509         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1510                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1511         m->data_off = buf_ofs;
1512
1513         m->data_len = 0;
1514 }
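/*
 * Detach is the inverse of attach_rxmbuf_zcp(). An illustrative recycle
 * step (hypothetical helper, kept out of the build) matching what
 * switch_worker_zcp() does after guest RX completes:
 */
#if 0
static inline void
recycle_rx_mbuf_example(struct vpool *vp, struct rte_mbuf *m)
{
	/* Unhook the guest buffer and restore the mbuf's own buffer... */
	pktmbuf_detach_zcp(m);
	/* ...then return the mbuf to the free ring for the next attach. */
	rte_ring_sp_enqueue(vp->ring, (void *)m);
}
#endif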
1515
1516 /*
1517  * This function is called after packets have been transmitted. It fetches
1518  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1519  * also updates the used index and kicks the guest if necessary.
1520  */
1521 static inline uint32_t __attribute__((always_inline))
1522 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1523 {
1524         struct rte_mbuf *mbuf;
1525         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1526         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1527         uint32_t index = 0;
1528         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1529
1530         LOG_DEBUG(VHOST_DATA,
1531                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1532                 "clean is: %d\n",
1533                 dev->device_fh, mbuf_count);
1534         LOG_DEBUG(VHOST_DATA,
1535                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1536                 "clean is: %d\n",
1537                 dev->device_fh, rte_ring_count(vpool->ring));
1538
1539         for (index = 0; index < mbuf_count; index++) {
1540                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1541                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1542                         pktmbuf_detach_zcp(mbuf);
1543                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1544
1545                 /* Update used index buffer information. */
1546                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1547                 vq->used->ring[used_idx].len = 0;
1548
1549                 used_idx = (used_idx + 1) & (vq->size - 1);
1550         }
1551
1552         LOG_DEBUG(VHOST_DATA,
1553                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1554                 "clean is: %d\n",
1555                 dev->device_fh, rte_mempool_count(vpool->pool));
1556         LOG_DEBUG(VHOST_DATA,
1557                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1558                 "clean is: %d\n",
1559                 dev->device_fh, rte_ring_count(vpool->ring));
1560         LOG_DEBUG(VHOST_DATA,
1561                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1562                 "vq->last_used_idx:%d\n",
1563                 dev->device_fh, vq->last_used_idx);
1564
1565         vq->last_used_idx += mbuf_count;
1566
1567         LOG_DEBUG(VHOST_DATA,
1568                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1569                 "vq->last_used_idx:%d\n",
1570                 dev->device_fh, vq->last_used_idx);
1571
1572         rte_compiler_barrier();
1573
1574         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1575
1576         /* Kick guest if required. */
1577         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1578                 eventfd_write((int)vq->kickfd, 1);
1579
1580         return 0;
1581 }
1582
1583 /*
1584  * This function is called when a virtio device is destroyed.
1585  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1586  */
1587 static void mbuf_destroy_zcp(struct vpool *vpool)
1588 {
1589         struct rte_mbuf *mbuf = NULL;
1590         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1591
1592         LOG_DEBUG(VHOST_CONFIG,
1593                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1594                 "mbuf_destroy_zcp is: %d\n",
1595                 mbuf_count);
1596         LOG_DEBUG(VHOST_CONFIG,
1597                 "in mbuf_destroy_zcp: mbuf count in ring before "
1598                 "mbuf_destroy_zcp is: %d\n",
1599                 rte_ring_count(vpool->ring));
1600
1601         for (index = 0; index < mbuf_count; index++) {
1602                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1603                 if (likely(mbuf != NULL)) {
1604                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1605                                 pktmbuf_detach_zcp(mbuf);
1606                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1607                 }
1608         }
1609
1610         LOG_DEBUG(VHOST_CONFIG,
1611                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1612                 "mbuf_destroy_zcp is: %d\n",
1613                 rte_mempool_count(vpool->pool));
1614         LOG_DEBUG(VHOST_CONFIG,
1615                 "in mbuf_destroy_zcp: mbuf count in ring after "
1616                 "mbuf_destroy_zcp is: %d\n",
1617                 rte_ring_count(vpool->ring));
1618 }
1619
1620 /*
1621  * This function updates the used ring and its index counter for zero copy RX.
1622  */
1623 static inline uint32_t __attribute__((always_inline))
1624 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1625         uint32_t count)
1626 {
1627         struct vhost_virtqueue *vq;
1628         struct vring_desc *desc;
1629         struct rte_mbuf *buff;
1630         /* The virtio_hdr is initialised to 0. */
1631         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1632                 = {{0, 0, 0, 0, 0, 0}, 0};
1633         uint64_t buff_hdr_addr = 0;
1634         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1635         uint32_t head_idx, packet_success = 0;
1636         uint16_t res_cur_idx;
1637
1638         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1639
1640         if (count == 0)
1641                 return 0;
1642
1643         vq = dev->virtqueue[VIRTIO_RXQ];
1644         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1645
1646         res_cur_idx = vq->last_used_idx;
1647         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1648                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1649
1650         /* Retrieve all of the head indexes first to avoid caching issues. */
1651         for (head_idx = 0; head_idx < count; head_idx++)
1652                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1653
1654         /* Prefetch descriptor index. */
1655         rte_prefetch0(&vq->desc[head[packet_success]]);
1656
1657         while (packet_success != count) {
1658                 /* Get descriptor from available ring */
1659                 desc = &vq->desc[head[packet_success]];
1660
1661                 buff = pkts[packet_success];
1662                 LOG_DEBUG(VHOST_DATA,
1663                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1664                         "pkt[%d] descriptor idx: %d\n",
1665                         dev->device_fh, packet_success,
1666                         MBUF_HEADROOM_UINT32(buff));
1667
1668                 PRINT_PACKET(dev,
1669                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1670                         + RTE_PKTMBUF_HEADROOM),
1671                         rte_pktmbuf_data_len(buff), 0);
1672
1673                 /* Buffer address translation for virtio header. */
1674                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1675                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1676
1677                 /*
1678                  * If the descriptors are chained the header and data are
1679                  * placed in separate buffers.
1680                  */
1681                 if (desc->flags & VRING_DESC_F_NEXT) {
1682                         desc->len = vq->vhost_hlen;
1683                         desc = &vq->desc[desc->next];
1684                         desc->len = rte_pktmbuf_data_len(buff);
1685                 } else {
1686                         desc->len = packet_len;
1687                 }
1688
1689                 /* Update used ring with desc information */
1690                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1691                         = head[packet_success];
1692                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1693                         = packet_len;
1694                 res_cur_idx++;
1695                 packet_success++;
1696
1697                 /* A header is required per buffer. */
1698                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1699                         (const void *)&virtio_hdr, vq->vhost_hlen);
1700
1701                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1702
1703                 if (likely(packet_success < count)) {
1704                         /* Prefetch descriptor index. */
1705                         rte_prefetch0(&vq->desc[head[packet_success]]);
1706                 }
1707         }
1708
1709         rte_compiler_barrier();
1710
1711         LOG_DEBUG(VHOST_DATA,
1712                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1713                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1714                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1715
1716         *(volatile uint16_t *)&vq->used->idx += count;
1717         vq->last_used_idx += count;
1718
1719         LOG_DEBUG(VHOST_DATA,
1720                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1721                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1722                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1723
1724         /* Kick the guest if necessary. */
1725         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1726                 eventfd_write((int)vq->kickfd, 1);
1727
1728         return count;
1729 }
1730
1731 /*
1732  * This function routes the TX packet to the correct interface.
1733  * This may be a local device or the physical port.
1734  */
1735 static inline void __attribute__((always_inline))
1736 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1737         uint32_t desc_idx, uint8_t need_copy)
1738 {
1739         struct mbuf_table *tx_q;
1740         struct rte_mbuf **m_table;
1741         struct rte_mbuf *mbuf = NULL;
1742         unsigned len, ret, offset = 0;
1743         struct vpool *vpool;
1744         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1745         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1746
1747         /* Add packet to the port TX queue */
1748         tx_q = &tx_queue_zcp[vmdq_rx_q];
1749         len = tx_q->len;
1750
1751         /* Allocate an mbuf and populate the structure. */
1752         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1753         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1754         if (unlikely(mbuf == NULL)) {
1755                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1756                 RTE_LOG(ERR, VHOST_DATA,
1757                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1758                         dev->device_fh);
1759                 put_desc_to_used_list_zcp(vq, desc_idx);
1760                 return;
1761         }
1762
1763         if (vm2vm_mode == VM2VM_HARDWARE) {
1764                 /* Avoid using a VLAN tag from any VM for an external packet,
1765                  * such as vlan_tags[dev->device_fh]; otherwise it conflicts during
1766                  * pool selection: the MAC address marks the packet as external and
1767                  * bound for the network, while the VLAN tag marks it as a VM-to-VM
1768                  * packet to be forwarded to another VM. The hardware cannot resolve
1769                  * this ambiguity, so the packet would be lost.
1770                  */
1771                 vlan_tag = external_pkt_default_vlan_tag;
1772                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1773                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1774                         __rte_mbuf_raw_free(mbuf);
1775                         return;
1776                 }
1777         }
1778
1779         mbuf->nb_segs = m->nb_segs;
1780         mbuf->next = m->next;
1781         mbuf->data_len = m->data_len + offset;
1782         mbuf->pkt_len = mbuf->data_len;
1783         if (unlikely(need_copy)) {
1784                 /* Copy the packet contents to the mbuf. */
1785                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1786                         rte_pktmbuf_mtod(m, void *),
1787                         m->data_len);
1788         } else {
1789                 mbuf->data_off = m->data_off;
1790                 mbuf->buf_physaddr = m->buf_physaddr;
1791                 mbuf->buf_addr = m->buf_addr;
1792         }
1793         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1794         mbuf->vlan_tci = vlan_tag;
1795         mbuf->l2_len = sizeof(struct ether_hdr);
1796         mbuf->l3_len = sizeof(struct ipv4_hdr);
1797         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1798
1799         tx_q->m_table[len] = mbuf;
1800         len++;
1801
1802         LOG_DEBUG(VHOST_DATA,
1803                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1804                 dev->device_fh,
1805                 mbuf->nb_segs,
1806                 (mbuf->next == NULL) ? "null" : "non-null");
1807
1808         if (enable_stats) {
1809                 dev_statistics[dev->device_fh].tx_total++;
1810                 dev_statistics[dev->device_fh].tx++;
1811         }
1812
1813         if (unlikely(len == MAX_PKT_BURST)) {
1814                 m_table = (struct rte_mbuf **)tx_q->m_table;
1815                 ret = rte_eth_tx_burst(ports[0],
1816                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1817
1818                 /*
1819                  * Free any buffers not handled by TX and update
1820                  * the port stats.
1821                  */
1822                 if (unlikely(ret < len)) {
1823                         do {
1824                                 rte_pktmbuf_free(m_table[ret]);
1825                         } while (++ret < len);
1826                 }
1827
1828                 len = 0;
1829                 txmbuf_clean_zcp(dev, vpool);
1830         }
1831
1832         tx_q->len = len;
1833
1834         return;
1835 }
1836
1837 /*
1838  * This function transmits all available packets in the virtio TX queue for
1839  * one virtio-net device. On the first packet, it learns the MAC address
1840  * and sets up VMDQ.
1841  */
1842 static inline void __attribute__((always_inline))
1843 virtio_dev_tx_zcp(struct virtio_net *dev)
1844 {
1845         struct rte_mbuf m;
1846         struct vhost_virtqueue *vq;
1847         struct vring_desc *desc;
1848         uint64_t buff_addr = 0, phys_addr;
1849         uint32_t head[MAX_PKT_BURST];
1850         uint32_t i;
1851         uint16_t free_entries, packet_success = 0;
1852         uint16_t avail_idx;
1853         uint8_t need_copy = 0;
1854         hpa_type addr_type;
1855         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1856
1857         vq = dev->virtqueue[VIRTIO_TXQ];
1858         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1859
1860         /* If there are no available buffers then return. */
1861         if (vq->last_used_idx_res == avail_idx)
1862                 return;
1863
1864         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1865
1866         /* Prefetch available ring to retrieve head indexes. */
1867         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1868
1869         /* Get the number of free entries in the ring */
1870         free_entries = (avail_idx - vq->last_used_idx_res);
1871
1872         /* Limit to MAX_PKT_BURST. */
1873         free_entries
1874                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1875
1876         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1877                 dev->device_fh, free_entries);
1878
1879         /* Retrieve all of the head indexes first to avoid caching issues. */
1880         for (i = 0; i < free_entries; i++)
1881                 head[i]
1882                         = vq->avail->ring[(vq->last_used_idx_res + i)
1883                         & (vq->size - 1)];
1884
1885         vq->last_used_idx_res += free_entries;
1886
1887         /* Prefetch descriptor index. */
1888         rte_prefetch0(&vq->desc[head[packet_success]]);
1889         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1890
1891         while (packet_success < free_entries) {
1892                 desc = &vq->desc[head[packet_success]];
1893
1894                 /* Discard first buffer as it is the virtio header */
1895                 desc = &vq->desc[desc->next];
1896
1897                 /* Buffer address translation. */
1898                 buff_addr = gpa_to_vva(dev, desc->addr);
1899                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1900
1901                 if (likely(packet_success < (free_entries - 1)))
1902                         /* Prefetch descriptor index. */
1903                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1904
1905                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1906                         RTE_LOG(ERR, VHOST_DATA,
1907                                 "(%"PRIu64") Invalid frame buffer address found "
1908                                 "when transmitting packets!\n",
1909                                 dev->device_fh);
1910                         packet_success++;
1911                         continue;
1912                 }
1913
1914                 /* Prefetch buffer address. */
1915                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1916
1917                 /*
1918                  * Setup dummy mbuf. This is copied to a real mbuf if
1919                  * transmitted out the physical port.
1920                  */
1921                 m.data_len = desc->len;
1922                 m.nb_segs = 1;
1923                 m.next = NULL;
1924                 m.data_off = 0;
1925                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1926                 m.buf_physaddr = phys_addr;
1927
1928                 /*
1929                  * Check if the frame buffer address from guest crosses
1930                  * sub-region or not.
1931                  */
1932                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1933                         RTE_LOG(ERR, VHOST_DATA,
1934                                 "(%"PRIu64") Frame buffer address crosses a "
1935                                 "sub-region boundary when attaching the TX "
1936                                 "frame buffer address!\n",
1937                                 dev->device_fh);
1938                         need_copy = 1;
1939                 } else
1940                         need_copy = 0;
1941
1942                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1943
1944                 /*
1945                  * If this is the first received packet we need to learn
1946                  * the MAC and setup VMDQ
1947                  */
1948                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1949                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1950                                 /*
1951                                  * Discard frame if device is scheduled for
1952                                  * removal or a duplicate MAC address is found.
1953                                  */
1954                                 packet_success += free_entries;
1955                                 vq->last_used_idx += packet_success;
1956                                 break;
1957                         }
1958                 }
1959
1960                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1961                 packet_success++;
1962         }
1963 }
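/*
 * The "dummy mbuf" technique above, reduced to its essence (an
 * illustrative sketch with hypothetical names, kept out of the build):
 * a stack-allocated mbuf is pointed at the guest buffer so the routing
 * code can treat it like any other packet; a pool mbuf is only involved
 * once virtio_tx_route_zcp() builds one from the TX vpool.
 */
#if 0
static void
wrap_guest_buffer_example(uint64_t vva, uint64_t hpa, uint16_t len)
{
	struct rte_mbuf m;

	m.data_len = len;			/* frame length from the desc */
	m.nb_segs = 1;
	m.next = NULL;
	m.data_off = 0;				/* data starts at buf_addr */
	m.buf_addr = (void *)(uintptr_t)vva;	/* guest buffer, host VA */
	m.buf_physaddr = hpa;			/* guest buffer, host PA */

	/* &m can now be handed to the routing code. */
}
#endif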
1964
1965 /*
1966  * This function is called by each data core. It handles all RX/TX registered
1967  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1968  * addresses are compared with all devices in the main linked list.
1969  */
1970 static int
1971 switch_worker_zcp(__attribute__((unused)) void *arg)
1972 {
1973         struct virtio_net *dev = NULL;
1974         struct vhost_dev  *vdev = NULL;
1975         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1976         struct virtio_net_data_ll *dev_ll;
1977         struct mbuf_table *tx_q;
1978         volatile struct lcore_ll_info *lcore_ll;
1979         const uint64_t drain_tsc
1980                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1981                 * BURST_TX_DRAIN_US;
1982         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1983         unsigned ret;
1984         const uint16_t lcore_id = rte_lcore_id();
1985         uint16_t count_in_ring, rx_count = 0;
1986
1987         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1988
1989         lcore_ll = lcore_info[lcore_id].lcore_ll;
1990         prev_tsc = 0;
1991
1992         while (1) {
1993                 cur_tsc = rte_rdtsc();
1994
1995                 /* TX burst queue drain */
1996                 diff_tsc = cur_tsc - prev_tsc;
1997                 if (unlikely(diff_tsc > drain_tsc)) {
1998                         /*
1999                          * Get mbuf from vpool.pool and detach mbuf and
2000                          * put back into vpool.ring.
2001                          */
2002                         dev_ll = lcore_ll->ll_root_used;
2003                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2004                                 /* Get the virtio device */
2005                                 vdev = dev_ll->vdev;
2006                                 dev = vdev->dev;
2007
2008                                 if (likely(!vdev->remove)) {
2009                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2010                                         if (tx_q->len) {
2011                                                 LOG_DEBUG(VHOST_DATA,
2012                                                 "TX queue drained after timeout"
2013                                                 " with burst size %u\n",
2014                                                 tx_q->len);
2015
2016                                                 /*
2017                                                  * Tx any packets in the queue
2018                                                  */
2019                                                 ret = rte_eth_tx_burst(
2020                                                         ports[0],
2021                                                         (uint16_t)tx_q->txq_id,
2022                                                         (struct rte_mbuf **)
2023                                                         tx_q->m_table,
2024                                                         (uint16_t)tx_q->len);
2025                                                 if (unlikely(ret < tx_q->len)) {
2026                                                         do {
2027                                                                 rte_pktmbuf_free(
2028                                                                         tx_q->m_table[ret]);
2029                                                         } while (++ret < tx_q->len);
2030                                                 }
2031                                                 tx_q->len = 0;
2032
2033                                                 txmbuf_clean_zcp(dev,
2034                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2035                                         }
2036                                 }
2037                                 dev_ll = dev_ll->next;
2038                         }
2039                         prev_tsc = cur_tsc;
2040                 }
2041
2042                 rte_prefetch0(lcore_ll->ll_root_used);
2043
2044                 /*
2045                  * Inform the configuration core that we are done with the linked
2046                  * list and that no devices are in use, if requested.
2047                  */
2048                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2049                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2050
2051                 /* Process devices */
2052                 dev_ll = lcore_ll->ll_root_used;
2053
2054                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2055                         vdev = dev_ll->vdev;
2056                         dev  = vdev->dev;
2057                         if (unlikely(vdev->remove)) {
2058                                 dev_ll = dev_ll->next;
2059                                 unlink_vmdq(vdev);
2060                                 vdev->ready = DEVICE_SAFE_REMOVE;
2061                                 continue;
2062                         }
2063
2064                         if (likely(vdev->ready == DEVICE_RX)) {
2065                                 uint32_t index = vdev->vmdq_rx_q;
2066                                 uint16_t i;
2067                                 count_in_ring
2068                                 = rte_ring_count(vpool_array[index].ring);
2069                                 uint16_t free_entries
2070                                 = (uint16_t)get_available_ring_num_zcp(dev);
2071
2072                                 /*
2073                                  * Attach all mbufs in vpool.ring and put back
2074                                  * into vpool.pool.
2075                                  */
2076                                 for (i = 0;
2077                                 i < RTE_MIN(free_entries,
2078                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2079                                 i++)
2080                                         attach_rxmbuf_zcp(dev);
2081
2082                                 /* Handle guest RX */
2083                                 rx_count = rte_eth_rx_burst(ports[0],
2084                                         vdev->vmdq_rx_q, pkts_burst,
2085                                         MAX_PKT_BURST);
2086
2087                                 if (rx_count) {
2088                                         ret_count = virtio_dev_rx_zcp(dev,
2089                                                         pkts_burst, rx_count);
2090                                         if (enable_stats) {
2091                                                 dev_statistics[dev->device_fh].rx_total
2092                                                         += rx_count;
2093                                                 dev_statistics[dev->device_fh].rx
2094                                                         += ret_count;
2095                                         }
2096                                         while (likely(rx_count)) {
2097                                                 rx_count--;
2098                                                 pktmbuf_detach_zcp(
2099                                                         pkts_burst[rx_count]);
2100                                                 rte_ring_sp_enqueue(
2101                                                         vpool_array[index].ring,
2102                                                         (void *)pkts_burst[rx_count]);
2103                                         }
2104                                 }
2105                         }
2106
2107                         if (likely(!vdev->remove))
2108                                 /* Handle guest TX */
2109                                 virtio_dev_tx_zcp(dev);
2110
2111                         /* Move to the next device in the list */
2112                         dev_ll = dev_ll->next;
2113                 }
2114         }
2115
2116         return 0;
2117 }
2118
2119
2120 /*
2121  * Add an entry to a used linked list. A free entry must first be found
2122  * in the free linked list using get_data_ll_free_entry();
2123  */
2124 static void
2125 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2126         struct virtio_net_data_ll *ll_dev)
2127 {
2128         struct virtio_net_data_ll *ll = *ll_root_addr;
2129
2130         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2131         ll_dev->next = NULL;
2132         rte_compiler_barrier();
2133
2134         /* If ll == NULL then this is the first device. */
2135         if (ll) {
2136                 /* Increment to the tail of the linked list. */
2137                 while (ll->next != NULL)
2138                         ll = ll->next;
2139
2140                 ll->next = ll_dev;
2141         } else {
2142                 *ll_root_addr = ll_dev;
2143         }
2144 }
2145
2146 /*
2147  * Remove an entry from a used linked list. The entry must then be added to
2148  * the free linked list using put_data_ll_free_entry().
2149  */
2150 static void
2151 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2152         struct virtio_net_data_ll *ll_dev,
2153         struct virtio_net_data_ll *ll_dev_last)
2154 {
2155         struct virtio_net_data_ll *ll = *ll_root_addr;
2156
2157         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2158                 return;
2159
2160         if (ll_dev == ll)
2161                 *ll_root_addr = ll_dev->next;
2162         else
2163                 if (likely(ll_dev_last != NULL))
2164                         ll_dev_last->next = ll_dev->next;
2165                 else
2166                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2167 }
2168
2169 /*
2170  * Find and return an entry from the free linked list.
2171  */
2172 static struct virtio_net_data_ll *
2173 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2174 {
2175         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2176         struct virtio_net_data_ll *ll_dev;
2177
2178         if (ll_free == NULL)
2179                 return NULL;
2180
2181         ll_dev = ll_free;
2182         *ll_root_addr = ll_free->next;
2183
2184         return ll_dev;
2185 }
2186
2187 /*
2188  * Place an entry back on to the free linked list.
2189  */
2190 static void
2191 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2192         struct virtio_net_data_ll *ll_dev)
2193 {
2194         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2195
2196         if (ll_dev == NULL)
2197                 return;
2198
2199         ll_dev->next = ll_free;
2200         *ll_root_addr = ll_dev;
2201 }
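/*
 * An illustrative walk-through of the free/used list protocol
 * implemented by the helpers above, as new_device() uses it
 * (hypothetical function, kept out of the build): take an entry from
 * the free list, fill it in, then publish it on the used list.
 */
#if 0
static int
ll_protocol_example(struct vhost_dev *vdev)
{
	struct virtio_net_data_ll *ll_dev;

	ll_dev = get_data_ll_free_entry(&ll_root_free);
	if (ll_dev == NULL)
		return -1;	/* no free entries: device limit reached */

	ll_dev->vdev = vdev;
	/*
	 * add_data_ll_entry() issues a compiler barrier so a traversing
	 * data core never sees the entry with a stale next pointer.
	 */
	add_data_ll_entry(&ll_root_used, ll_dev);
	return 0;
}
#endif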
2202
2203 /*
2204  * Creates a linked list of a given size.
2205  */
2206 static struct virtio_net_data_ll *
2207 alloc_data_ll(uint32_t size)
2208 {
2209         struct virtio_net_data_ll *ll_new;
2210         uint32_t i;
2211
2212         /* Malloc and then chain the linked list. */
2213         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2214         if (ll_new == NULL) {
2215                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2216                 return NULL;
2217         }
2218
2219         for (i = 0; i < size - 1; i++) {
2220                 ll_new[i].vdev = NULL;
2221                 ll_new[i].next = &ll_new[i+1];
2222         }
2223         ll_new[i].next = NULL;
2224
2225         return (ll_new);
2226 }
2227
2228 /*
2229  * Create the main linked list along with each individual core's linked list. A used and a free list
2230  * are created to manage entries.
2231  */
2232 static int
2233 init_data_ll (void)
2234 {
2235         int lcore;
2236
2237         RTE_LCORE_FOREACH_SLAVE(lcore) {
2238                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2239                 if (lcore_info[lcore].lcore_ll == NULL) {
2240                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2241                         return -1;
2242                 }
2243
2244                 lcore_info[lcore].lcore_ll->device_num = 0;
2245                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2246                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2247                 if (num_devices % num_switching_cores)
2248                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2249                 else
2250                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2251         }
2252
2253         /* Allocate devices up to a maximum of MAX_DEVICES. */
2254         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2255
2256         return 0;
2257 }
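/*
 * The modulo branch above is just a ceiling division; an equivalent
 * form (illustrative, kept out of the build):
 */
#if 0
static uint32_t
per_core_ll_size_example(uint32_t ndev, uint32_t ncores)
{
	return (ndev + ncores - 1) / ncores;	/* ceil(ndev / ncores) */
}
#endif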
2258
2259 /*
2260  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2261  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2262  * of dev->remove=1, which could cause an infinite loop in the rte_pause loop.
2263  */
2264 static void
2265 destroy_device (volatile struct virtio_net *dev)
2266 {
2267         struct virtio_net_data_ll *ll_lcore_dev_cur;
2268         struct virtio_net_data_ll *ll_main_dev_cur;
2269         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2270         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2271         struct vhost_dev *vdev;
2272         int lcore;
2273
2274         dev->flags &= ~VIRTIO_DEV_RUNNING;
2275
2276         vdev = (struct vhost_dev *)dev->priv;
2277         /* Set the remove flag. */
2278         vdev->remove = 1;
2279         while (vdev->ready != DEVICE_SAFE_REMOVE) {
2280                 rte_pause();
2281         }
2282
2283         /* Search for entry to be removed from lcore ll */
2284         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2285         while (ll_lcore_dev_cur != NULL) {
2286                 if (ll_lcore_dev_cur->vdev == vdev) {
2287                         break;
2288                 } else {
2289                         ll_lcore_dev_last = ll_lcore_dev_cur;
2290                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2291                 }
2292         }
2293
2294         if (ll_lcore_dev_cur == NULL) {
2295                 RTE_LOG(ERR, VHOST_CONFIG,
2296                         "(%"PRIu64") Failed to find the device to be destroyed.\n",
2297                         dev->device_fh);
2298                 return;
2299         }
2300
2301         /* Search for entry to be removed from main ll */
2302         ll_main_dev_cur = ll_root_used;
2303         ll_main_dev_last = NULL;
2304         while (ll_main_dev_cur != NULL) {
2305                 if (ll_main_dev_cur->vdev == vdev) {
2306                         break;
2307                 } else {
2308                         ll_main_dev_last = ll_main_dev_cur;
2309                         ll_main_dev_cur = ll_main_dev_cur->next;
2310                 }
2311         }
2312
2313         /* Remove entries from the lcore and main ll. */
2314         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2315         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2316
2317         /* Set the dev_removal_flag on each lcore. */
2318         RTE_LCORE_FOREACH_SLAVE(lcore) {
2319                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2320         }
2321
2322         /*
2323          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2324          * they can no longer access the device removed from the linked lists and that the devices
2325          * are no longer in use.
2326          */
2327         RTE_LCORE_FOREACH_SLAVE(lcore) {
2328                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2329                         rte_pause();
2330                 }
2331         }
2332
2333         /* Add the entries back to the lcore and main free ll.*/
2334         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2335         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2336
2337         /* Decrement number of device on the lcore. */
2338         lcore_info[vdev->coreid].lcore_ll->device_num--;
2339
2340         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2341
2342         if (zero_copy) {
2343                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2344
2345                 /* Stop the RX queue. */
2346                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2347                         LOG_DEBUG(VHOST_CONFIG,
2348                                 "(%"PRIu64") In destroy_device: Failed to stop "
2349                                 "rx queue:%d\n",
2350                                 dev->device_fh,
2351                                 vdev->vmdq_rx_q);
2352                 }
2353
2354                 LOG_DEBUG(VHOST_CONFIG,
2355                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2356                         "mempool back to ring for RX queue: %d\n",
2357                         dev->device_fh, vdev->vmdq_rx_q);
2358
2359                 mbuf_destroy_zcp(vpool);
2360
2361                 /* Stop the TX queue. */
2362                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2363                         LOG_DEBUG(VHOST_CONFIG,
2364                                 "(%"PRIu64") In destroy_device: Failed to "
2365                                 "stop tx queue:%d\n",
2366                                 dev->device_fh, vdev->vmdq_rx_q);
2367                 }
2368
2369                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2370
2371                 LOG_DEBUG(VHOST_CONFIG,
2372                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2373                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2374                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2375                         dev->device_fh);
2376
2377                 mbuf_destroy_zcp(vpool);
2378                 rte_free(vdev->regions_hpa);
2379         }
2380         rte_free(vdev);
2381
2382 }
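/*
 * A condensed, illustrative view of the two-phase removal handshake
 * performed above between the config core and one data core
 * (hypothetical helper, kept out of the build; the real code loops
 * over all slave lcores):
 */
#if 0
static void
removal_handshake_example(struct vhost_dev *vdev, int lcore)
{
	/*
	 * Phase 1: flag the device and wait for the data core to unlink
	 * it from VMDQ and mark it safe to remove.
	 */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE)
		rte_pause();

	/*
	 * Phase 2: wait until the data core has completed one full list
	 * traversal since the entry was removed, so it can no longer
	 * hold a reference to the entry.
	 */
	lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
	while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL)
		rte_pause();
}
#endif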
2383
2384 /*
2385  * Calculate the number of physically contiguous sub-regions within one
2386  * particular region whose vhost virtual address range is contiguous. The
2387  * region starts at vva_start, with a size of 'size'.
2388  */
2389 static uint32_t
2390 check_hpa_regions(uint64_t vva_start, uint64_t size)
2391 {
2392         uint32_t i, nregions = 0, page_size = getpagesize();
2393         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2394         if (vva_start % page_size) {
2395                 LOG_DEBUG(VHOST_CONFIG,
2396                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2397                         "has remainder\n",
2398                         (void *)(uintptr_t)vva_start, page_size);
2399                 return 0;
2400         }
2401         if (size % page_size) {
2402                 LOG_DEBUG(VHOST_CONFIG,
2403                         "in check_hpa_regions: "
2404                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2405                         size, page_size);
2406                 return 0;
2407         }
2408         for (i = 0; i < size - page_size; i = i + page_size) {
2409                 cur_phys_addr
2410                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2411                 next_phys_addr = rte_mem_virt2phy(
2412                         (void *)(uintptr_t)(vva_start + i + page_size));
2413                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2414                         ++nregions;
2415                         LOG_DEBUG(VHOST_CONFIG,
2416                                 "in check_hpa_regions: hva addr:(%p) is not "
2417                                 "continuous with hva addr:(%p), diff:%d\n",
2418                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2419                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2420                                 + page_size), page_size);
2421                         LOG_DEBUG(VHOST_CONFIG,
2422                                 "in check_hpa_regions: hpa addr:(%p) is not "
2423                                 "continuous with hpa addr:(%p), "
2424                                 "diff:(%"PRIu64")\n",
2425                                 (void *)(uintptr_t)cur_phys_addr,
2426                                 (void *)(uintptr_t)next_phys_addr,
2427                                 (next_phys_addr-cur_phys_addr));
2428                 }
2429         }
2430         return nregions;
2431 }
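/*
 * The per-iteration test above, isolated (illustrative, kept out of
 * the build): two virtually adjacent pages belong to the same
 * sub-region only if they are also physically adjacent.
 */
#if 0
static int
pages_phys_contiguous_example(void *va, uint32_t page_size)
{
	phys_addr_t cur = rte_mem_virt2phy(va);
	phys_addr_t next = rte_mem_virt2phy(RTE_PTR_ADD(va, page_size));

	return (cur + page_size) == next;
}
#endif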
2432
2433 /*
2434  * Divide each region whose vhost virtual address range is contiguous into
2435  * sub-regions such that the physical addresses within each sub-region are
2436  * contiguous, and fill the offset (to the GPA), size and other information
2437  * of each sub-region into regions_hpa.
2438  */
2439 static uint32_t
2440 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2441 {
2442         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2443         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2444
2445         if (mem_region_hpa == NULL)
2446                 return 0;
2447
2448         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2449                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2450                         virtio_memory->regions[regionidx].address_offset;
2451                 mem_region_hpa[regionidx_hpa].guest_phys_address
2452                         = virtio_memory->regions[regionidx].guest_phys_address;
2453                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2454                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2455                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2456                 LOG_DEBUG(VHOST_CONFIG,
2457                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2458                         regionidx_hpa,
2459                         (void *)(uintptr_t)
2460                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2461                 LOG_DEBUG(VHOST_CONFIG,
2462                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2463                         regionidx_hpa,
2464                         (void *)(uintptr_t)
2465                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2466                 for (i = 0, k = 0;
2467                         i < virtio_memory->regions[regionidx].memory_size -
2468                                 page_size;
2469                         i += page_size) {
2470                         cur_phys_addr = rte_mem_virt2phy(
2471                                         (void *)(uintptr_t)(vva_start + i));
2472                         next_phys_addr = rte_mem_virt2phy(
2473                                         (void *)(uintptr_t)(vva_start +
2474                                         i + page_size));
2475                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2476                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2477                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2478                                         k + page_size;
2479                                 mem_region_hpa[regionidx_hpa].memory_size
2480                                         = k + page_size;
2481                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2482                                         "phys addr end  [%d]:(%p)\n",
2483                                         regionidx_hpa,
2484                                         (void *)(uintptr_t)
2485                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2486                                 LOG_DEBUG(VHOST_CONFIG,
2487                                         "in fill_hpa_regions: guest phys addr "
2488                                         "size [%d]:(%p)\n",
2489                                         regionidx_hpa,
2490                                         (void *)(uintptr_t)
2491                                         (mem_region_hpa[regionidx_hpa].memory_size));
2492                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2493                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2494                                 ++regionidx_hpa;
2495                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2496                                         next_phys_addr -
2497                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2498                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2499                                         " phys addr start[%d]:(%p)\n",
2500                                         regionidx_hpa,
2501                                         (void *)(uintptr_t)
2502                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2503                                 LOG_DEBUG(VHOST_CONFIG,
2504                                         "in fill_hpa_regions: host  phys addr "
2505                                         "start[%d]:(%p)\n",
2506                                         regionidx_hpa,
2507                                         (void *)(uintptr_t)
2508                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2509                                 k = 0;
2510                         } else {
2511                                 k += page_size;
2512                         }
2513                 }
2514                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2515                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2516                         + k + page_size;
2517                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2518                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2519                         "[%d]:(%p)\n", regionidx_hpa,
2520                         (void *)(uintptr_t)
2521                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2522                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2523                         "[%d]:(%p)\n", regionidx_hpa,
2524                         (void *)(uintptr_t)
2525                         (mem_region_hpa[regionidx_hpa].memory_size));
2526                 ++regionidx_hpa;
2527         }
2528         return regionidx_hpa;
2529 }
2530
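/*
 * For illustration only, not part of this file: a minimal sketch of how the
 * region table filled in above can be used to translate a guest physical
 * address into a host physical address. The helper name gpa_to_hpa_example
 * and its zero-on-miss convention are assumptions.
 */
static inline uint64_t
gpa_to_hpa_example(struct virtio_memory_regions_hpa *regions,
	uint32_t nregions, uint64_t guest_pa)
{
	uint32_t i;

	for (i = 0; i < nregions; i++) {
		/* Each entry covers [guest_phys_address, guest_phys_address_end). */
		if (guest_pa >= regions[i].guest_phys_address &&
			guest_pa < regions[i].guest_phys_address_end)
			/* host_phys_addr_offset = host start - guest start. */
			return guest_pa + regions[i].host_phys_addr_offset;
	}
	return 0; /* No contiguous host-physical region covers this address. */
}
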
2531 /*
2532  * A new device is added to a data core. First the device is added to the main linked list
2533  * and then allocated to a specific data core.
2534  */
2535 static int
2536 new_device (struct virtio_net *dev)
2537 {
2538         struct virtio_net_data_ll *ll_dev;
2539         int lcore, core_add = 0;
2540         uint32_t device_num_min = num_devices;
2541         struct vhost_dev *vdev;
2542         uint32_t regionidx;
2543
2544         vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2545         if (vdev == NULL) {
2546                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2547                         dev->device_fh);
2548                 return -1;
2549         }
2550         vdev->dev = dev;
2551         dev->priv = vdev;
2552
2553         if (zero_copy) {
2554                 vdev->nregions_hpa = dev->mem->nregions;
2555                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2556                         vdev->nregions_hpa
2557                                 += check_hpa_regions(
2558                                         dev->mem->regions[regionidx].guest_phys_address
2559                                         + dev->mem->regions[regionidx].address_offset,
2560                                         dev->mem->regions[regionidx].memory_size);
2561
2562                 }
2563
2564                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2565                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2566                         CACHE_LINE_SIZE);
2567                 if (vdev->regions_hpa == NULL) {
2568                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2569                         rte_free(vdev);
2570                         return -1;
2571                 }
2572
2573
2574                 if (fill_hpa_memory_regions(
2575                         vdev->regions_hpa, dev->mem
2576                         ) != vdev->nregions_hpa) {
2577
2578                         RTE_LOG(ERR, VHOST_CONFIG,
2579                                 "hpa memory regions number mismatch: "
2580                                 "[%d]\n", vdev->nregions_hpa);
2581                         rte_free(vdev->regions_hpa);
2582                         rte_free(vdev);
2583                         return -1;
2584                 }
2585         }
2586
2587
2588         /* Add device to main ll */
2589         ll_dev = get_data_ll_free_entry(&ll_root_free);
2590         if (ll_dev == NULL) {
2591                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2592                         "of %d devices per core has been reached\n",
2593                         dev->device_fh, num_devices);
2594                 if (vdev->regions_hpa)
2595                         rte_free(vdev->regions_hpa);
2596                 rte_free(vdev);
2597                 return -1;
2598         }
2599         ll_dev->vdev = vdev;
2600         add_data_ll_entry(&ll_root_used, ll_dev);
2601         vdev->vmdq_rx_q
2602                 = dev->device_fh * (num_queues / num_devices);
2603
2604         if (zero_copy) {
2605                 uint32_t index = vdev->vmdq_rx_q;
2606                 uint32_t count_in_ring, i;
2607                 struct mbuf_table *tx_q;
2608
2609                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2610
2611                 LOG_DEBUG(VHOST_CONFIG,
2612                         "(%"PRIu64") in new_device: mbuf count in mempool "
2613                         "before attach is: %d\n",
2614                         dev->device_fh,
2615                         rte_mempool_count(vpool_array[index].pool));
2616                 LOG_DEBUG(VHOST_CONFIG,
2617                         "(%"PRIu64") in new_device: mbuf count in ring "
2618                         "before attach is: %d\n",
2619                         dev->device_fh, count_in_ring);
2620
2621                 /*
2622                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2623                  */
2624                 for (i = 0; i < count_in_ring; i++)
2625                         attach_rxmbuf_zcp(dev);
2626
2627                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2628                         "mempool after attach is: %d\n",
2629                         dev->device_fh,
2630                         rte_mempool_count(vpool_array[index].pool));
2631                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2632                         "ring after attach is: %d\n",
2633                         dev->device_fh,
2634                         rte_ring_count(vpool_array[index].ring));
2635
2636                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2637                 tx_q->txq_id = vdev->vmdq_rx_q;
2638
2639                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2640                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2641
2642                         LOG_DEBUG(VHOST_CONFIG,
2643                                 "(%"PRIu64") In new_device: Failed to start "
2644                                 "tx queue:%d\n",
2645                                 dev->device_fh, vdev->vmdq_rx_q);
2646
2647                         mbuf_destroy_zcp(vpool);
2648                         rte_free(vdev->regions_hpa);
2649                         rte_free(vdev);
2650                         return -1;
2651                 }
2652
2653                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2654                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2655
2656                         LOG_DEBUG(VHOST_CONFIG,
2657                                 "(%"PRIu64") In new_device: Failed to start "
2658                                 "rx queue:%d\n",
2659                                 dev->device_fh, vdev->vmdq_rx_q);
2660
2661                         /* Stop the TX queue. */
2662                         if (rte_eth_dev_tx_queue_stop(ports[0],
2663                                 vdev->vmdq_rx_q) != 0) {
2664                                 LOG_DEBUG(VHOST_CONFIG,
2665                                         "(%"PRIu64") In new_device: Failed to "
2666                                         "stop tx queue:%d\n",
2667                                         dev->device_fh, vdev->vmdq_rx_q);
2668                         }
2669
2670                         mbuf_destroy_zcp(vpool);
2671                         rte_free(vdev->regions_hpa);
2672                         rte_free(vdev);
2673                         return -1;
2674                 }
2675
2676         }
2677
2678         /* Reset the ready flag. */
2679         vdev->ready = DEVICE_MAC_LEARNING;
2680         vdev->remove = 0;
2681
2682         /* Find a suitable lcore to add the device. */
2683         RTE_LCORE_FOREACH_SLAVE(lcore) {
2684                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2685                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2686                         core_add = lcore;
2687                 }
2688         }
2689         /* Add device to lcore ll */
2690         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2691         if (ll_dev == NULL) {
2692                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2693                 vdev->ready = DEVICE_SAFE_REMOVE;
2694                 destroy_device(dev);
2695                 if (vdev->regions_hpa)
2696                         rte_free(vdev->regions_hpa);
2697                 rte_free(vdev);
2698                 return -1;
2699         }
2700         ll_dev->vdev = vdev;
2701         vdev->coreid = core_add;
2702
2703         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2704
2705         /* Initialize device stats */
2706         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2707
2708         /* Disable notifications. */
2709         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2710         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2711         lcore_info[vdev->coreid].lcore_ll->device_num++;
2712         dev->flags |= VIRTIO_DEV_RUNNING;
2713
2714         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2715
2716         return 0;
2717 }
2718
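/*
 * For illustration only, not part of this file: the core-selection loop from
 * new_device() above, factored into a standalone helper. It scans the slave
 * lcores and returns the one currently serving the fewest devices, so new
 * guests are spread across the data cores.
 */
static unsigned
least_loaded_lcore_example(void)
{
	unsigned lcore, core_add = 0;
	uint32_t device_num_min = num_devices;

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
			device_num_min = lcore_info[lcore].lcore_ll->device_num;
			core_add = lcore;
		}
	}
	return core_add;
}
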
2719 /*
2720  * These callbacks allow devices to be added to the data core when
2721  * configuration has been fully completed.
2722  */
2723 static const struct virtio_net_device_ops virtio_net_device_ops =
2724 {
2725         .new_device =  new_device,
2726         .destroy_device = destroy_device,
2727 };
2728
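/*
 * These ops are wired up in MAIN below via
 * rte_vhost_driver_callback_register(&virtio_net_device_ops); the vhost
 * library then invokes new_device()/destroy_device() as guests appear and
 * disappear.
 */
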
2729 /*
2730  * This thread wakes up periodically to print statistics if the user has
2731  * enabled them.
2732  */
2733 static void
2734 print_stats(void)
2735 {
2736         struct virtio_net_data_ll *dev_ll;
2737         uint64_t tx_dropped, rx_dropped;
2738         uint64_t tx, tx_total, rx, rx_total;
2739         uint32_t device_fh;
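        /* ANSI escape sequences: ESC[2J clears the screen, ESC[1;1H homes the cursor. */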
2740         const char clr[] = { 27, '[', '2', 'J', '\0' };
2741         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2742
2743         while(1) {
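                /* enable_stats is both the on/off switch and the refresh period in seconds. */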
2744                 sleep(enable_stats);
2745
2746                 /* Clear screen and move to top left */
2747                 printf("%s%s", clr, top_left);
2748
2749                 printf("\nDevice statistics ====================================");
2750
2751                 dev_ll = ll_root_used;
2752                 while (dev_ll != NULL) {
2753                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2754                         tx_total = dev_statistics[device_fh].tx_total;
2755                         tx = dev_statistics[device_fh].tx;
2756                         tx_dropped = tx_total - tx;
2757                         if (zero_copy == 0) {
2758                                 rx_total = rte_atomic64_read(
2759                                         &dev_statistics[device_fh].rx_total_atomic);
2760                                 rx = rte_atomic64_read(
2761                                         &dev_statistics[device_fh].rx_atomic);
2762                         } else {
2763                                 rx_total = dev_statistics[device_fh].rx_total;
2764                                 rx = dev_statistics[device_fh].rx;
2765                         }
2766                         rx_dropped = rx_total - rx;
2767
2768                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2769                                         "\nTX total:            %"PRIu64""
2770                                         "\nTX dropped:          %"PRIu64""
2771                                         "\nTX successful:               %"PRIu64""
2772                                         "\nRX total:            %"PRIu64""
2773                                         "\nRX dropped:          %"PRIu64""
2774                                         "\nRX successful:               %"PRIu64"",
2775                                         device_fh,
2776                                         tx_total,
2777                                         tx_dropped,
2778                                         tx,
2779                                         rx_total,
2780                                         rx_dropped,
2781                                         rx);
2782
2783                         dev_ll = dev_ll->next;
2784                 }
2785                 printf("\n======================================================\n");
2786         }
2787 }
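
/*
 * For illustration only, not part of this file: pthread_create() expects a
 * start routine of type void *(*)(void *); MAIN below papers over the
 * mismatch with a cast. A conforming wrapper would look like this (the name
 * print_stats_thread is an assumption):
 */
static void *
print_stats_thread(__attribute__((unused)) void *arg)
{
	print_stats(); /* Never returns; loops printing stats forever. */
	return NULL;
}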
2788
2789 static void
2790 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2791         char *ring_name, uint32_t nb_mbuf)
2792 {
2793         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2794         vpool_array[index].pool
2795                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2796                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2797                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2798                 rte_pktmbuf_init, NULL, socket, 0);
2799         if (vpool_array[index].pool != NULL) {
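                /*
                 * rte_ring sizes must be a power of two, and one slot is
                 * always kept empty, hence rte_align32pow2(nb_mbuf + 1)
                 * below to guarantee room for nb_mbuf entries.
                 */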
2800                 vpool_array[index].ring
2801                         = rte_ring_create(ring_name,
2802                                 rte_align32pow2(nb_mbuf + 1),
2803                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2804                 if (likely(vpool_array[index].ring != NULL)) {
2805                         LOG_DEBUG(VHOST_CONFIG,
2806                                 "in setup_mempool_tbl: mbuf count in "
2807                                 "mempool is: %d\n",
2808                                 rte_mempool_count(vpool_array[index].pool));
2809                         LOG_DEBUG(VHOST_CONFIG,
2810                                 "in setup_mempool_tbl: mbuf count in "
2811                                 "ring   is: %d\n",
2812                                 rte_ring_count(vpool_array[index].ring));
2813                 } else {
2814                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2815                                 ring_name);
2816                 }
2817
2818                 /* Need to consider headroom. */
2819                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2820         } else {
2821                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2822         }
2823 }
2824
2825
2826 /*
2827  * Main function: performs initialisation and launches the per-lcore functions. The CUSE
2828  * device is also registered here to handle the IOCTLs.
2829  */
2830 int
2831 MAIN(int argc, char *argv[])
2832 {
2833         struct rte_mempool *mbuf_pool = NULL;
2834         unsigned lcore_id, core_id = 0;
2835         unsigned nb_ports, valid_num_ports;
2836         int ret;
2837         uint8_t portid, queue_id = 0;
2838         static pthread_t tid;
2839
2840         /* init EAL */
2841         ret = rte_eal_init(argc, argv);
2842         if (ret < 0)
2843                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2844         argc -= ret;
2845         argv += ret;
2846
2847         /* parse app arguments */
2848         ret = us_vhost_parse_args(argc, argv);
2849         if (ret < 0)
2850                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2851
2852         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2853                 if (rte_lcore_is_enabled(lcore_id))
2854                         lcore_ids[core_id++] = lcore_id;
2855
2856         if (rte_lcore_count() > RTE_MAX_LCORE)
2857                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2858
2859         /* Set the number of switching cores available. */
2860         num_switching_cores = rte_lcore_count() - 1;
2861
2862         /* Get the number of physical ports. */
2863         nb_ports = rte_eth_dev_count();
2864         if (nb_ports > RTE_MAX_ETHPORTS)
2865                 nb_ports = RTE_MAX_ETHPORTS;
2866
2867         /*
2868          * Update the global variable num_ports and the global array ports[],
2869          * and derive valid_num_ports from the number of ports on the system.
2870          */
2871         valid_num_ports = check_ports_num(nb_ports);
2872
2873         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2874                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2875                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2876                 return -1;
2877         }
2878
2879         if (zero_copy == 0) {
2880                 /* Create the mbuf pool. */
2881                 mbuf_pool = rte_mempool_create(
2882                                 "MBUF_POOL",
2883                                 NUM_MBUFS_PER_PORT
2884                                 * valid_num_ports,
2885                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2886                                 sizeof(struct rte_pktmbuf_pool_private),
2887                                 rte_pktmbuf_pool_init, NULL,
2888                                 rte_pktmbuf_init, NULL,
2889                                 rte_socket_id(), 0);
2890                 if (mbuf_pool == NULL)
2891                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2892
2893                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2894                         vpool_array[queue_id].pool = mbuf_pool;
2895
2896                 if (vm2vm_mode == VM2VM_HARDWARE) {
2897                         /* Enable VT loop back to let L2 switch to do it. */
2898                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2899                         LOG_DEBUG(VHOST_CONFIG,
2900                                 "Enable loop back for L2 switch in vmdq.\n");
2901                 }
2902         } else {
2903                 uint32_t nb_mbuf;
2904                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2905                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2906
2907                 /*
2908                  * Zero copy defers queue RX/TX start to the time when guest
2909                  * finishes its startup and packet buffers from that guest are
2910                  * available.
2911                  */
2912                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2913                 rx_conf_default.rx_drop_en = 0;
2914                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2915                 nb_mbuf = num_rx_descriptor
2916                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2917                         + num_switching_cores * MAX_PKT_BURST;
2918
2919                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2920                         snprintf(pool_name, sizeof(pool_name),
2921                                 "rxmbuf_pool_%u", queue_id);
2922                         snprintf(ring_name, sizeof(ring_name),
2923                                 "rxmbuf_ring_%u", queue_id);
2924                         setup_mempool_tbl(rte_socket_id(), queue_id,
2925                                 pool_name, ring_name, nb_mbuf);
2926                 }
2927
2928                 nb_mbuf = num_tx_descriptor
2929                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2930                                 + num_switching_cores * MAX_PKT_BURST;
2931
2932                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2933                         snprintf(pool_name, sizeof(pool_name),
2934                                 "txmbuf_pool_%u", queue_id);
2935                         snprintf(ring_name, sizeof(ring_name),
2936                                 "txmbuf_ring_%u", queue_id);
2937                         setup_mempool_tbl(rte_socket_id(),
2938                                 (queue_id + MAX_QUEUES),
2939                                 pool_name, ring_name, nb_mbuf);
2940                 }
2941
2942                 if (vm2vm_mode == VM2VM_HARDWARE) {
2943                         /* Enable VT loop back to let L2 switch to do it. */
2944                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2945                         LOG_DEBUG(VHOST_CONFIG,
2946                                 "Enable loop back for L2 switch in vmdq.\n");
2947                 }
2948         }
2949         /* Set log level. */
2950         rte_set_log_level(LOG_LEVEL);
2951
2952         /* initialize all ports */
2953         for (portid = 0; portid < nb_ports; portid++) {
2954                 /* skip ports that are not enabled */
2955                 if ((enabled_port_mask & (1 << portid)) == 0) {
2956                         RTE_LOG(INFO, VHOST_PORT,
2957                                 "Skipping disabled port %d\n", portid);
2958                         continue;
2959                 }
2960                 if (port_init(portid) != 0)
2961                         rte_exit(EXIT_FAILURE,
2962                                 "Cannot initialize network ports\n");
2963         }
2964
2965         /* Initialise all linked lists. */
2966         if (init_data_ll() == -1)
2967                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2968
2969         /* Initialize device stats */
2970         memset(&dev_statistics, 0, sizeof(dev_statistics));
2971
2972         /* Enable stats if the user option is set. */
2973         if (enable_stats)
2974                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
2975
2976         /* Launch all data cores. */
2977         if (zero_copy == 0) {
2978                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2979                         rte_eal_remote_launch(switch_worker,
2980                                 mbuf_pool, lcore_id);
2981                 }
2982         } else {
2983                 uint32_t count_in_mempool, index, i;
2984                 for (index = 0; index < 2*MAX_QUEUES; index++) {
2985                         /* For all RX and TX queues. */
2986                         count_in_mempool
2987                                 = rte_mempool_count(vpool_array[index].pool);
2988
2989                         /*
2990                          * Transfer all unattached mbufs from vpool.pool
2991                          * to vpool.ring.
2992                          */
2993                         for (i = 0; i < count_in_mempool; i++) {
2994                                 struct rte_mbuf *mbuf
2995                                         = __rte_mbuf_raw_alloc(
2996                                                 vpool_array[index].pool);
2997                                 rte_ring_sp_enqueue(vpool_array[index].ring,
2998                                                 (void *)mbuf);
2999                         }
3000
3001                         LOG_DEBUG(VHOST_CONFIG,
3002                                 "in MAIN: initial mbuf count in mempool "
3003                                 "is: %d\n", count_in_mempool);
3004                         LOG_DEBUG(VHOST_CONFIG,
3005                                 "in MAIN: initial mbuf count in ring is:"
3006                                 " %d\n",
3007                                 rte_ring_count(vpool_array[index].ring));
3008                 }
3009
3010                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3011                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3012                                 lcore_id);
3013         }
3014
3015         if (mergeable == 0)
3016                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3017
3018         /* Register CUSE device to handle IOCTLs. */
3019         ret = rte_vhost_driver_register((char *)&dev_basename);
3020         if (ret != 0)
3021                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3022
3023         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3024
3025         /* Start CUSE session. */
3026         rte_vhost_driver_session_start();
3027         return 0;
3028
3029 }
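
/*
 * Illustrative invocation (a sketch; the authoritative option list lives in
 * us_vhost_parse_args(), which is outside this excerpt, so the application
 * option names below are assumptions):
 *
 *   ./build/vhost-switch -c f -n 4 -- -p 0x1 --stats 2 --zero-copy 0
 *
 * EAL arguments precede "--" and are consumed by rte_eal_init(); the
 * remainder is handed to us_vhost_parse_args().
 */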
3030