examples/vhost: adapt Tx routing to lib
dpdk.git: examples/vhost/main.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <errno.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) + \
                        (num_switching_cores * MAX_PKT_BURST) + \
                        (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
                        (num_switching_cores * MBUF_CACHE_SIZE))
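/*
 * Worked example (illustrative), with the defaults below and two switching
 * cores: 128 * 1024 RX descriptors + 2 * 32 burst mbufs + 2 * 512 TX
 * descriptors + 2 * 128 cache entries = 132,416 mbufs; the RX descriptor
 * rings dominate the budget.
 */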

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers and
 * vhost uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
        + RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32        /* Max burst size for RX/TX */
#define MAX_MRG_PKT_BURST 16    /* Max burst for merge buffers. Set to 1 due to performance issue. */
#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING     0
#define DEVICE_RX               1
#define DEVICE_SAFE_REMOVE      2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK-based front ends:
 * take the max vring avail descriptors/entries from the guest, subtract
 * MAX_PKT_BURST, then adjust to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
                + sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)
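/* e.g. POWEROF2(64) is true; POWEROF2(48) is false since (48-1) & 48 == 32. */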

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
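/*
 * Illustrative: with a 64-byte cache line and a 16-byte struct vring_desc,
 * DESC_PER_CACHELINE works out to 4.
 */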

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
uint32_t num_devices = 0;

/*
 * Enable zero copy: guest-allocated buffers are attached directly to the
 * HW descriptors so the NIC DMAs packets straight to/from guest memory.
 * Disabled by default.
 */
static uint32_t zero_copy;

/* number of RX/TX descriptors to use */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
        struct rte_mempool *pool;
        struct rte_ring *ring;
        uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
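/*
 * VM2VM_SOFTWARE switches guest-to-guest traffic locally in software (see
 * virtio_tx_local()); VM2VM_HARDWARE instead tags the packet with the
 * destination VM's VLAN and sends it out the port, relying on the NIC's
 * VMDQ loopback (the enable_loop_back setting propagated by get_eth_conf())
 * to deliver it back to the peer's pool.
 */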
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
        PHYS_ADDR_CONTINUOUS = 0,
        PHYS_ADDR_CROSS_SUBREG = 1,
        PHYS_ADDR_INVALID = 2,
        PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* This can be set by the user so it is made available here. */
extern uint64_t VHOST_FEATURES;

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
        .rx_thresh = {
                .pthresh = RX_PTHRESH,
                .hthresh = RX_HTHRESH,
                .wthresh = RX_WTHRESH,
        },
        .rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
        .tx_thresh = {
                .pthresh = TX_PTHRESH,
                .hthresh = TX_HTHRESH,
                .wthresh = TX_WTHRESH,
        },
        .tx_free_thresh = 0, /* Use PMD default values */
        .tx_rs_thresh = 0, /* Use PMD default values */
};

/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * VLAN strip is necessary for 1G NICs such as the I350;
                 * it fixes a bug where IPv4 forwarding in the guest could
                 * not forward packets from one virtio dev to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * should be overridden separately in code with
                 * appropriate values
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
        unsigned char   h_dest[ETH_ALEN];
        unsigned char   h_source[ETH_ALEN];
        __be16          h_vlan_proto;
        __be16          h_vlan_TCI;
        __be16          h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
        uint8_t  version_ihl;           /**< version and header length */
        uint8_t  type_of_service;       /**< type of service */
        uint16_t total_length;          /**< length of packet */
        uint16_t packet_id;             /**< packet ID */
        uint16_t fragment_offset;       /**< fragmentation offset */
        uint8_t  time_to_live;          /**< time to live */
        uint8_t  next_proto_id;         /**< protocol ID */
        uint16_t hdr_checksum;          /**< header checksum */
        uint32_t src_addr;              /**< source address */
        uint32_t dst_addr;              /**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
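/*
 * The RX counters are atomic because both the core draining the physical
 * port and a peer core delivering VM-to-VM traffic (virtio_tx_local) may
 * update a device's RX statistics concurrently.
 */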
struct device_statistics {
        uint64_t tx_total;
        rte_atomic64_t rx_total_atomic;
        uint64_t rx_total;
        uint64_t tx;
        rte_atomic64_t rx_atomic;
        uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back =
                vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, log an error and return -1.
 * Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        uint16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /* The max pool number from dev_info is used to validate the pool number specified on the command line */
        rte_eth_dev_info_get(port, &dev_info);

        /* Configure the number of supported virtio devices based on VMDQ limits */
        num_devices = dev_info.max_vmdq_pools;
        num_queues = dev_info.max_rx_queues;

        if (zero_copy) {
                rx_ring_size = num_rx_descriptor;
                tx_ring_size = num_tx_descriptor;
                tx_rings = dev_info.max_tx_queues;
        } else {
                rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
                tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
                tx_rings = (uint16_t)rte_lcore_count();
        }

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;

        if (port >= rte_eth_dev_count())
                return -1;

        rx_rings = (uint16_t)num_queues;
        /* Configure ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0)
                return retval;

        /* Setup the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                                rte_eth_dev_socket_id(port), &rx_conf_default,
                                                vpool_array[q].pool);
                if (retval < 0)
                        return retval;
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                                rte_eth_dev_socket_id(port), &tx_conf_default);
                if (retval < 0)
                        return retval;
        }

        /* Start the device. */
        retval = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
                return retval;
        }

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        (unsigned)port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}
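
/*
 * Layout note: each virtio device owns one VMDQ pool, selected by the VLAN
 * tag registered for it in link_vmdq(); the device's packets are then
 * received via its vmdq_rx_q receive queue in switch_worker().
 */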

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
        /* The basename must fit in the buffer, including the NUL terminator. */
        if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
                return -1;
        else
                snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

        return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (pm == 0)
                return -1;

        return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --dev-basename <name>\n"
        "               --nb-devices ND\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective if RX retries are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on RX. Only effective if RX retries are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --dev-basename: The basename to be used for the character device.\n"
        "               --zero-copy [0|1]: disable(default)/enable RX/TX "
                        "zero copy\n"
        "               --rx-desc-num [0-N]: the number of descriptors on RX, "
                        "used only when zero copy is enabled.\n"
        "               --tx-desc-num [0-N]: the number of descriptors on TX, "
                        "used only when zero copy is enabled.\n",
               prgname);
}
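
/*
 * Example invocation (illustrative; EAL options and the binary name depend
 * on the build and target machine):
 *
 *   ./build/vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 \
 *       --rx-retry 1 --mergeable 0 --stats 2 --dev-basename vhost-net
 */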

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"dev-basename", required_argument, NULL, 0},
                {"zero-copy", required_argument, NULL, 0},
                {"rx-desc-num", required_argument, NULL, 0},
                {"tx-desc-num", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        ret = parse_portmask(optarg);
                        if (ret == -1) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        enabled_port_mask = (uint32_t)ret;
                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Specify the retry delay time (in microseconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                                VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
                                        }
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set character device basename. */
                        if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_basename(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        /* Enable/disable RX/TX zero copy. */
                        if (!strncmp(long_option[option_index].name,
                                "zero-copy", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument"
                                                " for zero-copy [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        zero_copy = ret;

                                if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
                                        RTE_LOG(ERR, VHOST_CONFIG, "Before running "
                                        "zero copy vhost APP, please "
                                        "disable RTE_MBUF_REFCNT\n"
                                        "in config file and then rebuild DPDK "
                                        "core lib!\n"
                                        "Otherwise please disable zero copy "
                                        "flag in command line!\n");
                                        return -1;
#endif
                                }
                        }

                        /* Specify the descriptor number on RX. */
                        if (!strncmp(long_option[option_index].name,
                                "rx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for rx-desc-num [0-N], "
                                        "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_rx_descriptor = ret;
                                }
                        }

                        /* Specify the descriptor number on TX. */
                        if (!strncmp(long_option[option_index].name,
                                "tx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for tx-desc-num [0-N], "
                                        "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_tx_descriptor = ret;
                                }
                        }

                        break;

                /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = (uint8_t)i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support software vm2vm; "
                        "please specify 'vm2vm 2' to use hardware vm2vm.\n");
                return -1;
        }

        if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support jumbo frames; "
                        "please specify '--mergeable 0' to disable the "
                        "mergeable feature.\n");
                return -1;
        }

        return 0;
}

/*
 * Update the global var num_ports and array ports according to the number
 * of system ports and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
        char *pkt_addr = (char *)(addr); \
        unsigned int index; \
        char packet[MAX_PRINT_BUFF]; \
        \
        if ((header)) \
                snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
        else \
                snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
        for (index = 0; index < (size); index++) { \
                snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
                        "%02hhx ", pkt_addr[index]); \
        } \
        snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
        \
        LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
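
/*
 * Illustrative output with DEBUG enabled (values made up):
 *   (1) Packet size 6: 00 1b 21 3a 4d 5e
 */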

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
        uint32_t buf_len, hpa_type *addr_type)
{
        struct virtio_memory_regions_hpa *region;
        uint32_t regionidx;
        uint64_t vhost_pa = 0;

        *addr_type = PHYS_ADDR_INVALID;

        for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
                region = &vdev->regions_hpa[regionidx];
                if ((guest_pa >= region->guest_phys_address) &&
                        (guest_pa <= region->guest_phys_address_end)) {
                        vhost_pa = region->host_phys_addr_offset + guest_pa;
                        if (likely((guest_pa + buf_len - 1)
                                <= region->guest_phys_address_end))
                                *addr_type = PHYS_ADDR_CONTINUOUS;
                        else
                                *addr_type = PHYS_ADDR_CROSS_SUBREG;
                        break;
                }
        }

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p | HPA %p\n",
                vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
                (void *)(uintptr_t)vhost_pa);

        return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
        return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}
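
/*
 * Note: this reads each 6-byte address as a 64-bit word and XORs them;
 * MAC_ADDR_CMP masks the result down to the low 48 bits so the two bytes
 * that follow the address in memory are ignored. This holds on
 * little-endian targets, where the low 48 bits are the 6 address bytes.
 */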

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct virtio_net_data_ll *dev_ll;
        struct virtio_net *dev = vdev->dev;
        int i, ret;

        /* Learn MAC address of guest device from packet */
        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
                        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
                        return -1;
                }
                dev_ll = dev_ll->next;
        }

        for (i = 0; i < ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* vlan_tag currently uses the device_id. */
        vdev->vlan_tag = vlan_tags[dev->device_fh];

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
                dev->device_fh,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
                                        dev->device_fh);

        /* Enable stripping of the vlan tag as we handle routing. */
        rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* Clear MAC and VLAN settings */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < 6; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers */
                rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct virtio_net_data_ll *dev_ll;
        struct ether_hdr *pkt_hdr;
        uint64_t ret = 0;
        struct virtio_net *dev = vdev->dev;
        struct virtio_net *tdev; /* destination virtio device */

        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        /* Get the used devices list */
        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
                                &dev_ll->vdev->mac_address)) {

                        /* Drop the packet if the TX packet is destined for the TX device. */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
                                                        dev->device_fh);
                                return 0;
                        }
                        tdev = dev_ll->vdev->dev;

                        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

                        if (dev_ll->vdev->remove) {
                                /* Drop the packet if the device is marked for removal */
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
                        } else {
                                /* Send the packet to the local virtio device */
                                ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
                                if (enable_stats) {
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_total_atomic,
                                        1);
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_atomic,
                                        ret);
                                        dev_statistics[tdev->device_fh].tx_total++;
                                        dev_statistics[tdev->device_fh].tx += ret;
                                }
                        }

                        return 0;
                }
                dev_ll = dev_ll->next;
        }

        return -1;
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
        struct mbuf_table *tx_q;
        struct rte_mbuf **m_table;
        unsigned len, ret, offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct virtio_net_data_ll *dev_ll = ll_root_used;
        struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
        struct virtio_net *dev = vdev->dev;

        /* Check if destination is a local VM */
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
                rte_pktmbuf_free(m);
                return;
        }

        if (vm2vm_mode == VM2VM_HARDWARE) {
                while (dev_ll != NULL) {
                        if ((dev_ll->vdev->ready == DEVICE_RX)
                                && ether_addr_cmp(&(pkt_hdr->d_addr),
                                &dev_ll->vdev->mac_address)) {
                                /*
                                 * Drop the packet if the TX packet is
                                 * destined for the TX device.
                                 */
                                if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                        LOG_DEBUG(VHOST_DATA,
                                        "(%"PRIu64") TX: Source and destination"
                                        " MAC addresses are the same. Dropping "
                                        "packet.\n",
                                        dev_ll->vdev->dev->device_fh);
                                        rte_pktmbuf_free(m);
                                        return;
                                }
                                offset = 4;
                                vlan_tag =
                                (uint16_t)
                                vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

                                LOG_DEBUG(VHOST_DATA,
                                "(%"PRIu64") TX: pkt to local VM device id:"
                                "(%"PRIu64") vlan tag: %d.\n",
                                dev->device_fh, dev_ll->vdev->dev->device_fh,
                                vlan_tag);

                                break;
                        }
                        dev_ll = dev_ll->next;
                }
        }

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

        /* Add packet to the port tx queue */
        tx_q = &lcore_tx_queue[lcore_id];
        len = tx_q->len;

        m->ol_flags = PKT_TX_VLAN_PKT;
        /* FIXME: offset */
        m->data_len += offset;
        m->vlan_tci = vlan_tag;

        tx_q->m_table[len] = m;
        len++;
        if (enable_stats) {
                dev_statistics[dev->device_fh].tx_total++;
                dev_statistics[dev->device_fh].tx++;
        }

        if (unlikely(len == MAX_PKT_BURST)) {
                m_table = (struct rte_mbuf **)tx_q->m_table;
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
                /* Free any buffers not handled by TX and update the port stats. */
                if (unlikely(ret < len)) {
                        do {
                                rte_pktmbuf_free(m_table[ret]);
                        } while (++ret < len);
                }

                len = 0;
        }

        tx_q->len = len;
        return;
}
1135 /*
1136  * This function is called by each data core. It handles all RX/TX registered with the
1137  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1138  * with all devices in the main linked list.
1139  */
1140 static int
1141 switch_worker(__attribute__((unused)) void *arg)
1142 {
1143         struct rte_mempool *mbuf_pool = arg;
1144         struct virtio_net *dev = NULL;
1145         struct vhost_dev *vdev = NULL;
1146         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1147         struct virtio_net_data_ll *dev_ll;
1148         struct mbuf_table *tx_q;
1149         volatile struct lcore_ll_info *lcore_ll;
1150         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1151         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1152         unsigned ret, i;
1153         const uint16_t lcore_id = rte_lcore_id();
1154         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1155         uint16_t rx_count = 0;
1156         uint16_t tx_count;
1157         uint32_t retry = 0;
1158
1159         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1160         lcore_ll = lcore_info[lcore_id].lcore_ll;
1161         prev_tsc = 0;
1162
1163         tx_q = &lcore_tx_queue[lcore_id];
1164         for (i = 0; i < num_cores; i ++) {
1165                 if (lcore_ids[i] == lcore_id) {
1166                         tx_q->txq_id = i;
1167                         break;
1168                 }
1169         }
1170
1171         while(1) {
1172                 cur_tsc = rte_rdtsc();
1173                 /*
1174                  * TX burst queue drain
1175                  */
1176                 diff_tsc = cur_tsc - prev_tsc;
1177                 if (unlikely(diff_tsc > drain_tsc)) {
1178
1179                         if (tx_q->len) {
1180                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1181
1182                                 /*Tx any packets in the queue*/
1183                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1184                                                                            (struct rte_mbuf **)tx_q->m_table,
1185                                                                            (uint16_t)tx_q->len);
1186                                 if (unlikely(ret < tx_q->len)) {
1187                                         do {
1188                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1189                                         } while (++ret < tx_q->len);
1190                                 }
1191
1192                                 tx_q->len = 0;
1193                         }
1194
1195                         prev_tsc = cur_tsc;
1196
1197                 }
1198
1199                 rte_prefetch0(lcore_ll->ll_root_used);
1200                 /*
1201                  * Inform the configuration core that we have exited the linked list and that no devices are
1202                  * in use if requested.
1203                  */
1204                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1205                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1206
1207                 /*
1208                  * Process devices
1209                  */
1210                 dev_ll = lcore_ll->ll_root_used;
1211
1212                 while (dev_ll != NULL) {
1213                         /*get virtio device ID*/
1214                         vdev = dev_ll->vdev;
1215                         dev = vdev->dev;
1216
1217                         if (vdev->remove) {
1218                                 dev_ll = dev_ll->next;
1219                                 unlink_vmdq(vdev);
1220                                 vdev->ready = DEVICE_SAFE_REMOVE;
1221                                 continue;
1222                         }
1223                         if (likely(vdev->ready == DEVICE_RX)) {
1224                                 /*Handle guest RX*/
1225                                 rx_count = rte_eth_rx_burst(ports[0],
1226                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1227
1228                                 if (rx_count) {
1229                                         /*
1230                                         * Retry is enabled and the queue is full then we wait and retry to avoid packet loss
1231                                         * Here MAX_PKT_BURST must be less than virtio queue size
1232                                         */
1233                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1234                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1235                                                         rte_delay_us(burst_rx_delay_time);
1236                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1237                                                                 break;
1238                                                 }
1239                                         }
1240                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1241                                         if (enable_stats) {
1242                                                 rte_atomic64_add(
1243                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1244                                                 rx_count);
1245                                                 rte_atomic64_add(
1246                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1247                                         }
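                                        /*
                                         * rte_vhost_enqueue_burst() copies the packet data into the
                                         * guest's own buffers, so the host mbufs are no longer needed
                                         * once it returns and can be freed immediately.
                                         */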
1248                                         while (likely(rx_count)) {
1249                                                 rx_count--;
1250                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1251                                         }
1252
1253                                 }
1254                         }
1255
1256                         if (!vdev->remove) {
1257                                 /* Handle guest TX. */
1258                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1259                                 /* If this is the first received packet we need to learn the MAC and set up VMDQ. */
1260                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1261                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1262                                                 while (tx_count--)
1263                                                         rte_pktmbuf_free(pkts_burst[tx_count]);
1264                                         }
1265                                 }
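                                /*
                                 * Route each dequeued packet; virtio_tx_route() delivers it
                                 * either to a local virtio device or out the physical port.
                                 */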
1266                                 while (tx_count)
1267                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1268                         }
1269
1270                         /* Move to the next device in the list. */
1271                         dev_ll = dev_ll->next;
1272                 }
1273         }
1274
1275         return 0;
1276 }
1277
1278 /*
1279  * This function gets available ring number for zero copy rx.
1280  * Only one thread will call this function for a particular virtio device,
1281  * so it is designed as a non-thread-safe function.
1282  */
1283 static inline uint32_t __attribute__((always_inline))
1284 get_available_ring_num_zcp(struct virtio_net *dev)
1285 {
1286         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1287         uint16_t avail_idx;
1288
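        /*
         * Read avail->idx through a volatile pointer to force a fresh load of
         * the index the guest publishes. The uint16_t subtraction below relies
         * on unsigned wrap-around, so it stays correct when the indices wrap.
         */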
1289         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1290         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1291 }
1292
1293 /*
1294  * This function gets available ring index for zero copy rx,
1295  * it will retry up to 'burst_rx_retry_num' times until it gets enough ring entries.
1296  * Only one thread will call this function for a particular virtio device,
1297  * so it is designed as a non-thread-safe function.
1298  */
1299 static inline uint32_t __attribute__((always_inline))
1300 get_available_ring_index_zcp(struct virtio_net *dev,
1301         uint16_t *res_base_idx, uint32_t count)
1302 {
1303         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1304         uint16_t avail_idx;
1305         uint32_t retry = 0;
1306         uint16_t free_entries;
1307
1308         *res_base_idx = vq->last_used_idx_res;
1309         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1310         free_entries = (avail_idx - *res_base_idx);
1311
1312         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1313                         "avail idx: %d, "
1314                         "res base idx:%d, free entries:%d\n",
1315                         dev->device_fh, avail_idx, *res_base_idx,
1316                         free_entries);
1317
1318         /*
1319          * If retry is enabled and the queue is full then we wait
1320          * and retry to avoid packet loss.
1321          */
1322         if (enable_retry && unlikely(count > free_entries)) {
1323                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1324                         rte_delay_us(burst_rx_delay_time);
1325                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1326                         free_entries = (avail_idx - *res_base_idx);
1327                         if (count <= free_entries)
1328                                 break;
1329                 }
1330         }
1331
1332         /* Check that we have enough buffers. */
1333         if (unlikely(count > free_entries))
1334                 count = free_entries;
1335
1336         if (unlikely(count == 0)) {
1337                 LOG_DEBUG(VHOST_DATA,
1338                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1339                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1340                         dev->device_fh, avail_idx,
1341                         *res_base_idx, free_entries);
1342                 return 0;
1343         }
1344
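        /*
         * Reserve the entries by advancing last_used_idx_res; since only one
         * thread services this device, no atomic operation is needed.
         */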
1345         vq->last_used_idx_res = *res_base_idx + count;
1346
1347         return count;
1348 }
1349
1350 /*
1351  * This function puts a descriptor back on the used list.
1352  */
1353 static inline void __attribute__((always_inline))
1354 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1355 {
1356         uint16_t res_cur_idx = vq->last_used_idx;
1357         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1358         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
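        /*
         * The barrier ensures the used-ring entry above is written before the
         * used index is published, so the guest never reads a stale entry.
         */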
1359         rte_compiler_barrier();
1360         *(volatile uint16_t *)&vq->used->idx += 1;
1361         vq->last_used_idx += 1;
1362
1363         /* Kick the guest if necessary. */
1364         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1365                 eventfd_write((int)vq->kickfd, 1);
1366 }
1367
1368 /*
1369  * This function gets an available descriptor from the virtio vring and an
1370  * unattached mbuf from vpool->ring, then attaches them together. It must
1371  * adjust the offsets of buff_addr and phys_addr according to the PMD
1372  * implementation, otherwise the frame data may land at the wrong mbuf location.
1373  */
1374 static inline void __attribute__((always_inline))
1375 attach_rxmbuf_zcp(struct virtio_net *dev)
1376 {
1377         uint16_t res_base_idx, desc_idx;
1378         uint64_t buff_addr, phys_addr;
1379         struct vhost_virtqueue *vq;
1380         struct vring_desc *desc;
1381         struct rte_mbuf *mbuf = NULL;
1382         struct vpool *vpool;
1383         hpa_type addr_type;
1384         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1385
1386         vpool = &vpool_array[vdev->vmdq_rx_q];
1387         vq = dev->virtqueue[VIRTIO_RXQ];
1388
1389         do {
1390                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1391                                 1) != 1))
1392                         return;
1393                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1394
1395                 desc = &vq->desc[desc_idx];
1396                 if (desc->flags & VRING_DESC_F_NEXT) {
1397                         desc = &vq->desc[desc->next];
1398                         buff_addr = gpa_to_vva(dev, desc->addr);
1399                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1400                                         &addr_type);
1401                 } else {
1402                         buff_addr = gpa_to_vva(dev,
1403                                         desc->addr + vq->vhost_hlen);
1404                         phys_addr = gpa_to_hpa(vdev,
1405                                         desc->addr + vq->vhost_hlen,
1406                                         desc->len, &addr_type);
1407                 }
1408
1409                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1410                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1411                                 " address found when attaching RX frame buffer"
1412                                 " address!\n", dev->device_fh);
1413                         put_desc_to_used_list_zcp(vq, desc_idx);
1414                         continue;
1415                 }
1416
1417                 /*
1418                  * Check if the frame buffer address from guest crosses
1419                  * sub-region or not.
1420                  */
1421                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1422                         RTE_LOG(ERR, VHOST_DATA,
1423                                 "(%"PRIu64") Frame buffer address crosses "
1424                                 "a sub-region boundary when attaching RX "
1425                                 "frame buffer address!\n",
1426                                 dev->device_fh);
1427                         put_desc_to_used_list_zcp(vq, desc_idx);
1428                         continue;
1429                 }
1430         } while (unlikely(phys_addr == 0));
1431
1432         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1433         if (unlikely(mbuf == NULL)) {
1434                 LOG_DEBUG(VHOST_DATA,
1435                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1436                         "ring_sc_dequeue fail.\n",
1437                         dev->device_fh);
1438                 put_desc_to_used_list_zcp(vq, desc_idx);
1439                 return;
1440         }
1441
1442         if (unlikely(vpool->buf_size > desc->len)) {
1443                 LOG_DEBUG(VHOST_DATA,
1444                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1445                         "length (%d) of descriptor idx: %d is less than the "
1446                         "room size required: %d\n",
1447                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1448                 put_desc_to_used_list_zcp(vq, desc_idx);
1449                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1450                 return;
1451         }
1452
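        /*
         * Rewind buf_addr and buf_physaddr by RTE_PKTMBUF_HEADROOM: the PMD
         * starts writing frame data at data_off bytes past buf_addr, so with
         * data_off set to the headroom the received frame lands exactly at
         * the guest descriptor's buffer address.
         */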
1453         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1454         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1455         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1456         mbuf->data_len = desc->len;
1457         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1458
1459         LOG_DEBUG(VHOST_DATA,
1460                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1461                 "descriptor idx:%d\n",
1462                 dev->device_fh, res_base_idx, desc_idx);
1463
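        /*
         * "Freeing" the attached mbuf returns it to vpool->pool, the mempool
         * backing this RX queue, so the PMD picks it up when replenishing RX
         * descriptors and the NIC DMAs directly into the guest buffer.
         */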
1464         __rte_mbuf_raw_free(mbuf);
1465
1466         return;
1467 }
1468
1469 /*
1470  * Detach an attached packet mbuf -
1471  *  - restore original mbuf address and length values.
1472  *  - reset pktmbuf data and data_len to their default values.
1473  *  All other fields of the given packet mbuf will be left intact.
1474  *
1475  * @param m
1476  *   The attached packet mbuf.
1477  */
1478 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1479 {
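        /*
         * Restore the mbuf's own data buffer, which sits directly after the
         * mbuf header within the mempool element, undoing the guest-buffer
         * attachment made in attach_rxmbuf_zcp().
         */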
1480         const struct rte_mempool *mp = m->pool;
1481         void *buf = RTE_MBUF_TO_BADDR(m);
1482         uint32_t buf_ofs;
1483         uint32_t buf_len = mp->elt_size - sizeof(*m);
1484         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1485
1486         m->buf_addr = buf;
1487         m->buf_len = (uint16_t)buf_len;
1488
1489         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1490                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1491         m->data_off = buf_ofs;
1492
1493         m->data_len = 0;
1494 }
1495
1496 /*
1497  * This function is called after packets have been transmitted. It fetches mbufs
1498  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1499  * the used index and kicks the guest if necessary.
1500  */
1501 static inline uint32_t __attribute__((always_inline))
1502 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1503 {
1504         struct rte_mbuf *mbuf;
1505         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1506         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1507         uint32_t index = 0;
1508         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1509
1510         LOG_DEBUG(VHOST_DATA,
1511                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1512                 "clean is: %d\n",
1513                 dev->device_fh, mbuf_count);
1514         LOG_DEBUG(VHOST_DATA,
1515                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1516                 "clean is: %d\n",
1517                 dev->device_fh, rte_ring_count(vpool->ring));
1518
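        /*
         * Every mbuf sitting in the mempool at this point has been released
         * by the PMD after transmission, so its guest descriptor (stashed in
         * the headroom by virtio_tx_route_zcp()) can be recycled to the
         * guest's used ring and the mbuf returned to vpool->ring.
         */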
1519         for (index = 0; index < mbuf_count; index++) {
1520                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1521                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1522                         pktmbuf_detach_zcp(mbuf);
1523                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1524
1525                 /* Update used index buffer information. */
1526                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1527                 vq->used->ring[used_idx].len = 0;
1528
1529                 used_idx = (used_idx + 1) & (vq->size - 1);
1530         }
1531
1532         LOG_DEBUG(VHOST_DATA,
1533                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1534                 "clean is: %d\n",
1535                 dev->device_fh, rte_mempool_count(vpool->pool));
1536         LOG_DEBUG(VHOST_DATA,
1537                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1538                 "clean is: %d\n",
1539                 dev->device_fh, rte_ring_count(vpool->ring));
1540         LOG_DEBUG(VHOST_DATA,
1541                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1542                 "vq->last_used_idx:%d\n",
1543                 dev->device_fh, vq->last_used_idx);
1544
1545         vq->last_used_idx += mbuf_count;
1546
1547         LOG_DEBUG(VHOST_DATA,
1548                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1549                 "vq->last_used_idx:%d\n",
1550                 dev->device_fh, vq->last_used_idx);
1551
1552         rte_compiler_barrier();
1553
1554         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1555
1556         /* Kick guest if required. */
1557         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1558                 eventfd_write((int)vq->kickfd, 1);
1559
1560         return 0;
1561 }
1562
1563 /*
1564  * This function is called when a virtio device is destroyed.
1565  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1566  */
1567 static void mbuf_destroy_zcp(struct vpool *vpool)
1568 {
1569         struct rte_mbuf *mbuf = NULL;
1570         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1571
1572         LOG_DEBUG(VHOST_CONFIG,
1573                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1574                 "mbuf_destroy_zcp is: %d\n",
1575                 mbuf_count);
1576         LOG_DEBUG(VHOST_CONFIG,
1577                 "in mbuf_destroy_zcp: mbuf count in ring before "
1578                 "mbuf_destroy_zcp is: %d\n",
1579                 rte_ring_count(vpool->ring));
1580
1581         for (index = 0; index < mbuf_count; index++) {
1582                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1583                 if (likely(mbuf != NULL)) {
1584                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1585                                 pktmbuf_detach_zcp(mbuf);
1586                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1587                 }
1588         }
1589
1590         LOG_DEBUG(VHOST_CONFIG,
1591                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1592                 "mbuf_destroy_zcp is: %d\n",
1593                 rte_mempool_count(vpool->pool));
1594         LOG_DEBUG(VHOST_CONFIG,
1595                 "in mbuf_destroy_zcp: mbuf count in ring after "
1596                 "mbuf_destroy_zcp is : %d\n",
1597                 rte_ring_count(vpool->ring));
1598 }
1599
1600 /*
1601  * This function updates the used ring and index for the zero copy RX path.
1602  */
1603 static inline uint32_t __attribute__((always_inline))
1604 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1605         uint32_t count)
1606 {
1607         struct vhost_virtqueue *vq;
1608         struct vring_desc *desc;
1609         struct rte_mbuf *buff;
1610         /* The virtio_hdr is initialised to 0. */
1611         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1612                 = {{0, 0, 0, 0, 0, 0}, 0};
1613         uint64_t buff_hdr_addr = 0;
1614         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1615         uint32_t head_idx, packet_success = 0;
1616         uint16_t res_cur_idx;
1617
1618         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1619
1620         if (count == 0)
1621                 return 0;
1622
1623         vq = dev->virtqueue[VIRTIO_RXQ];
1624         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1625
1626         res_cur_idx = vq->last_used_idx;
1627         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1628                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1629
1630         /* Retrieve all of the head indexes first to avoid caching issues. */
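        /*
         * Each head index was stashed in the mbuf headroom by
         * attach_rxmbuf_zcp() when the guest buffer was attached.
         */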
1631         for (head_idx = 0; head_idx < count; head_idx++)
1632                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1633
1634         /* Prefetch descriptor index. */
1635         rte_prefetch0(&vq->desc[head[packet_success]]);
1636
1637         while (packet_success != count) {
1638                 /* Get descriptor from available ring */
1639                 desc = &vq->desc[head[packet_success]];
1640
1641                 buff = pkts[packet_success];
1642                 LOG_DEBUG(VHOST_DATA,
1643                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1644                         "pkt[%d] descriptor idx: %d\n",
1645                         dev->device_fh, packet_success,
1646                         MBUF_HEADROOM_UINT32(buff));
1647
1648                 PRINT_PACKET(dev,
1649                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1650                         + RTE_PKTMBUF_HEADROOM),
1651                         rte_pktmbuf_data_len(buff), 0);
1652
1653                 /* Buffer address translation for virtio header. */
1654                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1655                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1656
1657                 /*
1658                  * If the descriptors are chained the header and data are
1659                  * placed in separate buffers.
1660                  */
1661                 if (desc->flags & VRING_DESC_F_NEXT) {
1662                         desc->len = vq->vhost_hlen;
1663                         desc = &vq->desc[desc->next];
1664                         desc->len = rte_pktmbuf_data_len(buff);
1665                 } else {
1666                         desc->len = packet_len;
1667                 }
1668
1669                 /* Update used ring with desc information */
1670                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1671                         = head[packet_success];
1672                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1673                         = packet_len;
1674                 res_cur_idx++;
1675                 packet_success++;
1676
1677                 /* A header is required per buffer. */
1678                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1679                         (const void *)&virtio_hdr, vq->vhost_hlen);
1680
1681                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1682
1683                 if (likely(packet_success < count)) {
1684                         /* Prefetch descriptor index. */
1685                         rte_prefetch0(&vq->desc[head[packet_success]]);
1686                 }
1687         }
1688
1689         rte_compiler_barrier();
1690
1691         LOG_DEBUG(VHOST_DATA,
1692                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1693                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1694                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1695
1696         *(volatile uint16_t *)&vq->used->idx += count;
1697         vq->last_used_idx += count;
1698
1699         LOG_DEBUG(VHOST_DATA,
1700                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1701                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1702                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1703
1704         /* Kick the guest if necessary. */
1705         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1706                 eventfd_write((int)vq->kickfd, 1);
1707
1708         return count;
1709 }
1710
1711 /*
1712  * This function routes the TX packet to the correct interface.
1713  * This may be a local device or the physical port.
1714  */
1715 static inline void __attribute__((always_inline))
1716 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1717         uint32_t desc_idx, uint8_t need_copy)
1718 {
1719         struct mbuf_table *tx_q;
1720         struct rte_mbuf **m_table;
1721         struct rte_mbuf *mbuf = NULL;
1722         unsigned len, ret, offset = 0;
1723         struct vpool *vpool;
1724         struct virtio_net_data_ll *dev_ll = ll_root_used;
1725         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1726         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1727         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1728
1729         /* Add packet to the port TX queue. */
1730         tx_q = &tx_queue_zcp[vmdq_rx_q];
1731         len = tx_q->len;
1732
1733         /* Allocate an mbuf and populate the structure. */
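        /*
         * vpool_array holds the RX vpools in slots 0..MAX_QUEUES-1 and the
         * corresponding TX vpools in slots MAX_QUEUES..2*MAX_QUEUES-1.
         */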
1734         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1735         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1736         if (unlikely(mbuf == NULL)) {
1737                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1738                 RTE_LOG(ERR, VHOST_DATA,
1739                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1740                         dev->device_fh);
1741                 put_desc_to_used_list_zcp(vq, desc_idx);
1742                 return;
1743         }
1744
1745         if (vm2vm_mode == VM2VM_HARDWARE) {
1746                 /* Avoid using a VLAN tag from any VM for an external packet, such
1747                  * as vlan_tags[dev->device_fh]; otherwise it conflicts with pool
1748                  * selection: the MAC address marks it as an external packet that
1749                  * should go to the network, while the VLAN tag marks it as a
1750                  * VM-to-VM packet to forward to another VM. The hardware cannot
1751                  * resolve such an ambiguous situation, so the packet would be lost.
1752                  */
1753                 vlan_tag = external_pkt_default_vlan_tag;
1754                 while (dev_ll != NULL) {
1755                         if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1756                                 ether_addr_cmp(&(pkt_hdr->d_addr),
1757                                 &dev_ll->vdev->mac_address)) {
1758
1759                                 /*
1760                                  * Drop the packet if the TX packet is destined
1761                                  * for the TX device.
1762                                  */
1763                                 if (unlikely(dev_ll->vdev->dev->device_fh
1764                                         == dev->device_fh)) {
1765                                         LOG_DEBUG(VHOST_DATA,
1766                                         "(%"PRIu64") TX: Source and destination "
1767                                         "MAC addresses are the same. Dropping "
1768                                         "packet.\n",
1769                                         dev_ll->vdev->dev->device_fh);
1770                                         MBUF_HEADROOM_UINT32(mbuf)
1771                                                 = (uint32_t)desc_idx;
1772                                         __rte_mbuf_raw_free(mbuf);
1773                                         return;
1774                                 }
1775
1776                                 /*
1777                                  * Offset the packet length by 4 bytes to account
1778                                  * for HW VLAN strip when the packet is L2-switched back.
1779                                  */
1780                                 offset = 4;
1781                                 vlan_tag =
1782                                 (uint16_t)
1783                                 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1784
1785                                 LOG_DEBUG(VHOST_DATA,
1786                                 "(%"PRIu64") TX: pkt to local VM device id:"
1787                                 "(%"PRIu64") vlan tag: %d.\n",
1788                                 dev->device_fh, dev_ll->vdev->dev->device_fh,
1789                                 vlan_tag);
1790
1791                                 break;
1792                         }
1793                         dev_ll = dev_ll->next;
1794                 }
1795         }
1796
1797         mbuf->nb_segs = m->nb_segs;
1798         mbuf->next = m->next;
1799         mbuf->data_len = m->data_len + offset;
1800         mbuf->pkt_len = mbuf->data_len;
1801         if (unlikely(need_copy)) {
1802                 /* Copy the packet contents to the mbuf. */
1803                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1804                         rte_pktmbuf_mtod(m, void *),
1805                         m->data_len);
1806         } else {
1807                 mbuf->data_off = m->data_off;
1808                 mbuf->buf_physaddr = m->buf_physaddr;
1809                 mbuf->buf_addr = m->buf_addr;
1810         }
1811         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1812         mbuf->vlan_tci = vlan_tag;
1813         mbuf->l2_len = sizeof(struct ether_hdr);
1814         mbuf->l3_len = sizeof(struct ipv4_hdr);
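        /*
         * Stash the descriptor index in the headroom so txmbuf_clean_zcp()
         * can return this descriptor to the guest's used ring once the NIC
         * has finished with the buffer.
         */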
1815         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1816
1817         tx_q->m_table[len] = mbuf;
1818         len++;
1819
1820         LOG_DEBUG(VHOST_DATA,
1821                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1822                 dev->device_fh,
1823                 mbuf->nb_segs,
1824                 (mbuf->next == NULL) ? "null" : "non-null");
1825
1826         if (enable_stats) {
1827                 dev_statistics[dev->device_fh].tx_total++;
1828                 dev_statistics[dev->device_fh].tx++;
1829         }
1830
1831         if (unlikely(len == MAX_PKT_BURST)) {
1832                 m_table = (struct rte_mbuf **)tx_q->m_table;
1833                 ret = rte_eth_tx_burst(ports[0],
1834                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1835
1836                 /*
1837                  * Free any buffers not handled by TX and update
1838                  * the port stats.
1839                  */
1840                 if (unlikely(ret < len)) {
1841                         do {
1842                                 rte_pktmbuf_free(m_table[ret]);
1843                         } while (++ret < len);
1844                 }
1845
1846                 len = 0;
1847                 txmbuf_clean_zcp(dev, vpool);
1848         }
1849
1850         tx_q->len = len;
1851
1852         return;
1853 }
1854
1855 /*
1856  * This function transmits all available packets in the virtio TX queue for one
1857  * virtio-net device. On the first packet, it learns the MAC address and
1858  * sets up VMDQ.
1859  */
1860 static inline void __attribute__((always_inline))
1861 virtio_dev_tx_zcp(struct virtio_net *dev)
1862 {
1863         struct rte_mbuf m;
1864         struct vhost_virtqueue *vq;
1865         struct vring_desc *desc;
1866         uint64_t buff_addr = 0, phys_addr;
1867         uint32_t head[MAX_PKT_BURST];
1868         uint32_t i;
1869         uint16_t free_entries, packet_success = 0;
1870         uint16_t avail_idx;
1871         uint8_t need_copy = 0;
1872         hpa_type addr_type;
1873         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1874
1875         vq = dev->virtqueue[VIRTIO_TXQ];
1876         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1877
1878         /* If there are no available buffers then return. */
1879         if (vq->last_used_idx_res == avail_idx)
1880                 return;
1881
1882         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1883
1884         /* Prefetch available ring to retrieve head indexes. */
1885         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1886
1887         /* Get the number of free entries in the ring */
1888         free_entries = (avail_idx - vq->last_used_idx_res);
1889
1890         /* Limit to MAX_PKT_BURST. */
1891         free_entries
1892                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1893
1894         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1895                 dev->device_fh, free_entries);
1896
1897         /* Retrieve all of the head indexes first to avoid caching issues. */
1898         for (i = 0; i < free_entries; i++)
1899                 head[i]
1900                         = vq->avail->ring[(vq->last_used_idx_res + i)
1901                         & (vq->size - 1)];
1902
1903         vq->last_used_idx_res += free_entries;
1904
1905         /* Prefetch descriptor index. */
1906         rte_prefetch0(&vq->desc[head[packet_success]]);
1907         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1908
1909         while (packet_success < free_entries) {
1910                 desc = &vq->desc[head[packet_success]];
1911
1912                 /* Discard first buffer as it is the virtio header */
1913                 desc = &vq->desc[desc->next];
1914
1915                 /* Buffer address translation. */
1916                 buff_addr = gpa_to_vva(dev, desc->addr);
1917                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1918
1919                 if (likely(packet_success < (free_entries - 1)))
1920                         /* Prefetch descriptor index. */
1921                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1922
1923                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1924                         RTE_LOG(ERR, VHOST_DATA,
1925                                 "(%"PRIu64") Invalid frame buffer address found "
1926                                 "when transmitting packets!\n",
1927                                 dev->device_fh);
1928                         packet_success++;
1929                         continue;
1930                 }
1931
1932                 /* Prefetch buffer address. */
1933                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1934
1935                 /*
1936                  * Setup dummy mbuf. This is copied to a real mbuf if
1937                  * transmitted out the physical port.
1938                  */
1939                 m.data_len = desc->len;
1940                 m.nb_segs = 1;
1941                 m.next = NULL;
1942                 m.data_off = 0;
1943                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1944                 m.buf_physaddr = phys_addr;
1945
1946                 /*
1947                  * Check if the frame buffer address from guest crosses
1948                  * sub-region or not.
1949                  */
1950                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1951                                 "(%"PRIu64") Frame buffer address crosses "
1952                                 "a sub-region boundary when attaching TX "
1953                                 "frame buffer address!\n",
1954                                 "buffer address!\n",
1955                                 dev->device_fh);
1956                         need_copy = 1;
1957                 } else
1958                         need_copy = 0;
1959
1960                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1961
1962                 /*
1963                  * If this is the first received packet we need to learn
1964                  * the MAC and set up VMDQ.
1965                  */
1966                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1967                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1968                                 /*
1969                                  * Discard frame if device is scheduled for
1970                                  * removal or a duplicate MAC address is found.
1971                                  */
1972                                 packet_success += free_entries;
1973                                 vq->last_used_idx += packet_success;
1974                                 break;
1975                         }
1976                 }
1977
1978                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1979                 packet_success++;
1980         }
1981 }
1982
1983 /*
1984  * This function is called by each data core. It handles all RX/TX registered
1985  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1986  * addresses are compared with all devices in the main linked list.
1987  */
1988 static int
1989 switch_worker_zcp(__attribute__((unused)) void *arg)
1990 {
1991         struct virtio_net *dev = NULL;
1992         struct vhost_dev  *vdev = NULL;
1993         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1994         struct virtio_net_data_ll *dev_ll;
1995         struct mbuf_table *tx_q;
1996         volatile struct lcore_ll_info *lcore_ll;
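        /*
         * drain_tsc converts BURST_TX_DRAIN_US microseconds into TSC cycles,
         * rounding the cycles-per-microsecond figure up.
         */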
1997         const uint64_t drain_tsc
1998                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1999                 * BURST_TX_DRAIN_US;
2000         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2001         unsigned ret;
2002         const uint16_t lcore_id = rte_lcore_id();
2003         uint16_t count_in_ring, rx_count = 0;
2004
2005         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2006
2007         lcore_ll = lcore_info[lcore_id].lcore_ll;
2008         prev_tsc = 0;
2009
2010         while (1) {
2011                 cur_tsc = rte_rdtsc();
2012
2013                 /* TX burst queue drain */
2014                 diff_tsc = cur_tsc - prev_tsc;
2015                 if (unlikely(diff_tsc > drain_tsc)) {
2016                         /*
2017                          * Get mbufs from vpool.pool, detach them and
2018                          * put them back into vpool.ring.
2019                          */
2020                         dev_ll = lcore_ll->ll_root_used;
2021                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2022                                 /* Get virtio device ID */
2023                                 vdev = dev_ll->vdev;
2024                                 dev = vdev->dev;
2025
2026                                 if (likely(!vdev->remove)) {
2027                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2028                                         if (tx_q->len) {
2029                                                 LOG_DEBUG(VHOST_DATA,
2030                                                 "TX queue drained after timeout"
2031                                                 " with burst size %u\n",
2032                                                 tx_q->len);
2033
2034                                                 /*
2035                                                  * Tx any packets in the queue
2036                                                  */
2037                                                 ret = rte_eth_tx_burst(
2038                                                         ports[0],
2039                                                         (uint16_t)tx_q->txq_id,
2040                                                         (struct rte_mbuf **)
2041                                                         tx_q->m_table,
2042                                                         (uint16_t)tx_q->len);
2043                                                 if (unlikely(ret < tx_q->len)) {
2044                                                         do {
2045                                                                 rte_pktmbuf_free(
2046                                                                         tx_q->m_table[ret]);
2047                                                         } while (++ret < tx_q->len);
2048                                                 }
2049                                                 tx_q->len = 0;
2050
2051                                                 txmbuf_clean_zcp(dev,
2052                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2053                                         }
2054                                 }
2055                                 dev_ll = dev_ll->next;
2056                         }
2057                         prev_tsc = cur_tsc;
2058                 }
2059
2060                 rte_prefetch0(lcore_ll->ll_root_used);
2061
2062                 /*
2063                  * Inform the configuration core that we have exited the linked
2064                  * list and that no devices are in use if requested.
2065                  */
2066                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2067                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2068
2069                 /* Process devices */
2070                 dev_ll = lcore_ll->ll_root_used;
2071
2072                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2073                         vdev = dev_ll->vdev;
2074                         dev  = vdev->dev;
2075                         if (unlikely(vdev->remove)) {
2076                                 dev_ll = dev_ll->next;
2077                                 unlink_vmdq(vdev);
2078                                 vdev->ready = DEVICE_SAFE_REMOVE;
2079                                 continue;
2080                         }
2081
2082                         if (likely(vdev->ready == DEVICE_RX)) {
2083                                 uint32_t index = vdev->vmdq_rx_q;
2084                                 uint16_t i;
2085                                 count_in_ring
2086                                 = rte_ring_count(vpool_array[index].ring);
2087                                 uint16_t free_entries
2088                                 = (uint16_t)get_available_ring_num_zcp(dev);
2089
2090                                 /*
2091                                  * Attach all mbufs in vpool.ring and put back
2092                                  * into vpool.pool.
2093                                  */
2094                                 for (i = 0;
2095                                 i < RTE_MIN(free_entries,
2096                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2097                                 i++)
2098                                         attach_rxmbuf_zcp(dev);
2099
2100                                 /* Handle guest RX */
2101                                 rx_count = rte_eth_rx_burst(ports[0],
2102                                         vdev->vmdq_rx_q, pkts_burst,
2103                                         MAX_PKT_BURST);
2104
2105                                 if (rx_count) {
2106                                         ret_count = virtio_dev_rx_zcp(dev,
2107                                                         pkts_burst, rx_count);
2108                                         if (enable_stats) {
2109                                                 dev_statistics[dev->device_fh].rx_total
2110                                                         += rx_count;
2111                                                 dev_statistics[dev->device_fh].rx
2112                                                         += ret_count;
2113                                         }
2114                                         while (likely(rx_count)) {
2115                                                 rx_count--;
2116                                                 pktmbuf_detach_zcp(
2117                                                         pkts_burst[rx_count]);
2118                                                 rte_ring_sp_enqueue(
2119                                                         vpool_array[index].ring,
2120                                                         (void *)pkts_burst[rx_count]);
2121                                         }
2122                                 }
2123                         }
2124
2125                         if (likely(!vdev->remove))
2126                                 /* Handle guest TX */
2127                                 virtio_dev_tx_zcp(dev);
2128
2129                         /* Move to the next device in the list */
2130                         dev_ll = dev_ll->next;
2131                 }
2132         }
2133
2134         return 0;
2135 }
2136
2137
2138 /*
2139  * Add an entry to a used linked list. A free entry must first be found
2140  * in the free linked list using get_data_ll_free_entry();
2141  */
2142 static void
2143 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2144         struct virtio_net_data_ll *ll_dev)
2145 {
2146         struct virtio_net_data_ll *ll = *ll_root_addr;
2147
2148         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2149         ll_dev->next = NULL;
2150         rte_compiler_barrier();
2151
2152         /* If ll == NULL then this is the first device. */
2153         if (ll) {
2154                 /* Increment to the tail of the linked list. */
2155                 while (ll->next != NULL)
2156                         ll = ll->next;
2157
2158                 ll->next = ll_dev;
2159         } else {
2160                 *ll_root_addr = ll_dev;
2161         }
2162 }
2163
2164 /*
2165  * Remove an entry from a used linked list. The entry must then be added to
2166  * the free linked list using put_data_ll_free_entry().
2167  */
2168 static void
2169 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2170         struct virtio_net_data_ll *ll_dev,
2171         struct virtio_net_data_ll *ll_dev_last)
2172 {
2173         struct virtio_net_data_ll *ll = *ll_root_addr;
2174
2175         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2176                 return;
2177
2178         if (ll_dev == ll)
2179                 *ll_root_addr = ll_dev->next;
2180         else
2181                 if (likely(ll_dev_last != NULL))
2182                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2183                 else
2184                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2185 }
2186
2187 /*
2188  * Find and return an entry from the free linked list.
2189  */
2190 static struct virtio_net_data_ll *
2191 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2192 {
2193         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2194         struct virtio_net_data_ll *ll_dev;
2195
2196         if (ll_free == NULL)
2197                 return NULL;
2198
2199         ll_dev = ll_free;
2200         *ll_root_addr = ll_free->next;
2201
2202         return ll_dev;
2203 }
2204
2205 /*
2206  * Place an entry back on to the free linked list.
2207  */
2208 static void
2209 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2210         struct virtio_net_data_ll *ll_dev)
2211 {
2212         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2213
2214         if (ll_dev == NULL)
2215                 return;
2216
2217         ll_dev->next = ll_free;
2218         *ll_root_addr = ll_dev;
2219 }
2220
2221 /*
2222  * Creates a linked list of a given size.
2223  */
2224 static struct virtio_net_data_ll *
2225 alloc_data_ll(uint32_t size)
2226 {
2227         struct virtio_net_data_ll *ll_new;
2228         uint32_t i;
2229
2230         /* Malloc and then chain the linked list. */
2231         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2232         if (ll_new == NULL) {
2233                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2234                 return NULL;
2235         }
2236
2237         for (i = 0; i < size - 1; i++) {
2238                 ll_new[i].vdev = NULL;
2239                 ll_new[i].next = &ll_new[i+1];
2240         }
2241         ll_new[i].next = NULL;
2242
2243         return (ll_new);
2244 }
2245
2246 /*
2247  * Create the main linked list along with each individual core's linked list. A used and a free list
2248  * are created to manage entries.
2249  */
2250 static int
2251 init_data_ll (void)
2252 {
2253         int lcore;
2254
2255         RTE_LCORE_FOREACH_SLAVE(lcore) {
2256                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2257                 if (lcore_info[lcore].lcore_ll == NULL) {
2258                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2259                         return -1;
2260                 }
2261
2262                 lcore_info[lcore].lcore_ll->device_num = 0;
2263                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2264                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2265                 if (num_devices % num_switching_cores)
2266                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2267                 else
2268                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2269         }
2270
2271         /* Allocate devices up to a maximum of MAX_DEVICES. */
2272         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2273
2274         return 0;
2275 }
2276
2277 /*
2278  * Set virtqueue flags so that we do not receive interrupts.
2279  */
2280 static void
2281 set_irq_status (struct virtio_net *dev)
2282 {
2283         dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2284         dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2285 }
2286
2287 /*
2288  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2289  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2290  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2291  */
2292 static void
2293 destroy_device (volatile struct virtio_net *dev)
2294 {
2295         struct virtio_net_data_ll *ll_lcore_dev_cur;
2296         struct virtio_net_data_ll *ll_main_dev_cur;
2297         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2298         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2299         struct vhost_dev *vdev;
2300         int lcore;
2301
2302         dev->flags &= ~VIRTIO_DEV_RUNNING;
2303
2304         vdev = (struct vhost_dev *)dev->priv;
2305         /* Set the remove flag. */
2306         vdev->remove = 1;
2307         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2308                 rte_pause();
2309         }
2310
2311         /* Search for entry to be removed from lcore ll */
2312         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2313         while (ll_lcore_dev_cur != NULL) {
2314                 if (ll_lcore_dev_cur->vdev == vdev) {
2315                         break;
2316                 } else {
2317                         ll_lcore_dev_last = ll_lcore_dev_cur;
2318                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2319                 }
2320         }
2321
2322         if (ll_lcore_dev_cur == NULL) {
2323                 RTE_LOG(ERR, VHOST_CONFIG,
2324                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2325                         dev->device_fh);
2326                 return;
2327         }
2328
2329         /* Search for entry to be removed from main ll */
2330         ll_main_dev_cur = ll_root_used;
2331         ll_main_dev_last = NULL;
2332         while (ll_main_dev_cur != NULL) {
2333                 if (ll_main_dev_cur->vdev == vdev) {
2334                         break;
2335                 } else {
2336                         ll_main_dev_last = ll_main_dev_cur;
2337                         ll_main_dev_cur = ll_main_dev_cur->next;
2338                 }
2339         }
2340
2341         /* Remove entries from the lcore and main ll. */
2342         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2343         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2344
2345         /* Set the dev_removal_flag on each lcore. */
2346         RTE_LCORE_FOREACH_SLAVE(lcore) {
2347                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2348         }
2349
2350         /*
2351          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2352          * they can no longer access the device removed from the linked lists and that the devices
2353          * are no longer in use.
2354          */
2355         RTE_LCORE_FOREACH_SLAVE(lcore) {
2356                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2357                         rte_pause();
2358                 }
2359         }
2360
2361         /* Add the entries back to the lcore and main free ll.*/
2362         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2363         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2364
2365         /* Decrement number of device on the lcore. */
2366         lcore_info[vdev->coreid].lcore_ll->device_num--;
2367
2368         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2369
2370         if (zero_copy) {
2371                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2372
2373                 /* Stop the RX queue. */
2374                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2375                         LOG_DEBUG(VHOST_CONFIG,
2376                                 "(%"PRIu64") In destroy_device: Failed to stop "
2377                                 "rx queue:%d\n",
2378                                 dev->device_fh,
2379                                 vdev->vmdq_rx_q);
2380                 }
2381
2382                 LOG_DEBUG(VHOST_CONFIG,
2383                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2384                         "mempool back to ring for RX queue: %d\n",
2385                         dev->device_fh, vdev->vmdq_rx_q);
2386
2387                 mbuf_destroy_zcp(vpool);
2388
2389                 /* Stop the TX queue. */
2390                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2391                         LOG_DEBUG(VHOST_CONFIG,
2392                                 "(%"PRIu64") In destroy_device: Failed to "
2393                                 "stop tx queue:%d\n",
2394                                 dev->device_fh, vdev->vmdq_rx_q);
2395                 }
2396
2397                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2398
2399                 LOG_DEBUG(VHOST_CONFIG,
2400                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2401                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2402                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2403                         dev->device_fh);
2404
2405                 mbuf_destroy_zcp(vpool);
2406                 rte_free(vdev->regions_hpa);
2407         }
2408         rte_free(vdev);
2409
2410 }
2411
2412 /*
2413  * Calculate the number of physically contiguous sub-regions within one
2414  * particular region whose vhost virtual address range is contiguous. The
2415  * region starts at vva_start and is 'size' bytes long.
2416  */
2417 static uint32_t
2418 check_hpa_regions(uint64_t vva_start, uint64_t size)
2419 {
2420         uint32_t i, nregions = 0, page_size = getpagesize();
2421         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2422         if (vva_start % page_size) {
2423                 LOG_DEBUG(VHOST_CONFIG,
2424                         "in check_continuous: vva start(%p) mod page_size(%d) "
2425                         "has remainder\n",
2426                         (void *)(uintptr_t)vva_start, page_size);
2427                 return 0;
2428         }
2429         if (size % page_size) {
2430                 LOG_DEBUG(VHOST_CONFIG,
2431                         "in check_continuous: "
2432                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2433                         size, page_size);
2434                 return 0;
2435         }
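        /*
         * Walk the region one page at a time and count every point where
         * host physical contiguity breaks; each break starts a new sub-region.
         */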
2436         for (i = 0; i < size - page_size; i = i + page_size) {
2437                 cur_phys_addr
2438                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2439                 next_phys_addr = rte_mem_virt2phy(
2440                         (void *)(uintptr_t)(vva_start + i + page_size));
2441                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2442                         ++nregions;
2443                         LOG_DEBUG(VHOST_CONFIG,
2444                                 "in check_continuous: hva addr:(%p) is not "
2445                                 "continuous with hva addr:(%p), diff:%d\n",
2446                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2447                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2448                                 + page_size), page_size);
2449                         LOG_DEBUG(VHOST_CONFIG,
2450                                 "in check_continuous: hpa addr:(%p) is not "
2451                                 "continuous with hpa addr:(%p), "
2452                                 "diff:(%"PRIu64")\n",
2453                                 (void *)(uintptr_t)cur_phys_addr,
2454                                 (void *)(uintptr_t)next_phys_addr,
2455                                 (next_phys_addr-cur_phys_addr));
2456                 }
2457         }
2458         return nregions;
2459 }
2460
2461 /*
2462  * Divide each region whose vhost virtual address range is contiguous into
2463  * sub-regions, making sure the physical addresses within each sub-region are
2464  * contiguous, and fill the offset (to GPA), size and other information of
2465  * each sub-region into regions_hpa.
2466  */
2467 static uint32_t
2468 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2469 {
2470         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2471         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2472
2473         if (mem_region_hpa == NULL)
2474                 return 0;
2475
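        /*
         * host_phys_addr_offset is stored as (HPA - GPA), so a guest physical
         * address inside a sub-region translates to host physical by simple
         * addition of this offset.
         */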
2476         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2477                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2478                         virtio_memory->regions[regionidx].address_offset;
2479                 mem_region_hpa[regionidx_hpa].guest_phys_address
2480                         = virtio_memory->regions[regionidx].guest_phys_address;
2481                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2482                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2483                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2484                 LOG_DEBUG(VHOST_CONFIG,
2485                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2486                         regionidx_hpa,
2487                         (void *)(uintptr_t)
2488                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2489                 LOG_DEBUG(VHOST_CONFIG,
2490                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2491                         regionidx_hpa,
2492                         (void *)(uintptr_t)
2493                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2494                 for (i = 0, k = 0;
2495                         i < virtio_memory->regions[regionidx].memory_size -
2496                                 page_size;
2497                         i += page_size) {
2498                         cur_phys_addr = rte_mem_virt2phy(
2499                                         (void *)(uintptr_t)(vva_start + i));
2500                         next_phys_addr = rte_mem_virt2phy(
2501                                         (void *)(uintptr_t)(vva_start +
2502                                         i + page_size));
2503                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2504                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2505                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2506                                         k + page_size;
2507                                 mem_region_hpa[regionidx_hpa].memory_size
2508                                         = k + page_size;
2509                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2510                                         "phys addr end  [%d]:(%p)\n",
2511                                         regionidx_hpa,
2512                                         (void *)(uintptr_t)
2513                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2514                                 LOG_DEBUG(VHOST_CONFIG,
2515                                         "in fill_hpa_regions: guest phys addr "
2516                                         "size [%d]:(%p)\n",
2517                                         regionidx_hpa,
2518                                         (void *)(uintptr_t)
2519                                         (mem_region_hpa[regionidx_hpa].memory_size));
2520                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2521                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2522                                 ++regionidx_hpa;
2523                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2524                                         next_phys_addr -
2525                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2526                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2527                                         " phys addr start[%d]:(%p)\n",
2528                                         regionidx_hpa,
2529                                         (void *)(uintptr_t)
2530                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2531                                 LOG_DEBUG(VHOST_CONFIG,
2532                                         "in fill_hpa_regions: host  phys addr "
2533                                         "start[%d]:(%p)\n",
2534                                         regionidx_hpa,
2535                                         (void *)(uintptr_t)
2536                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2537                                 k = 0;
2538                         } else {
2539                                 k += page_size;
2540                         }
2541                 }
2542                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2543                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2544                         + k + page_size;
2545                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2546                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2547                         "[%d]:(%p)\n", regionidx_hpa,
2548                         (void *)(uintptr_t)
2549                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2550                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2551                         "[%d]:(%p)\n", regionidx_hpa,
2552                         (void *)(uintptr_t)
2553                         (mem_region_hpa[regionidx_hpa].memory_size));
2554                 ++regionidx_hpa;
2555         }
2556         return regionidx_hpa;
2557 }
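
/*
 * Illustrative sketch only (not called by this example): once regions_hpa
 * has been filled in as above, a guest physical address can be translated
 * to a host physical address with a linear scan over the sub-regions. The
 * helper name gpa_to_hpa_sketch is hypothetical; the fields are the ones
 * populated by fill_hpa_memory_regions().
 */
static inline uint64_t
gpa_to_hpa_sketch(struct virtio_memory_regions_hpa *regions,
        uint32_t nregions, uint64_t guest_pa)
{
        uint32_t i;

        for (i = 0; i < nregions; i++) {
                /*
                 * Each sub-region covers [guest_phys_address,
                 * guest_phys_address_end) with one constant GPA-to-HPA
                 * offset, so translation is a single addition.
                 */
                if (guest_pa >= regions[i].guest_phys_address &&
                        guest_pa < regions[i].guest_phys_address_end)
                        return guest_pa + regions[i].host_phys_addr_offset;
        }
        return 0; /* no matching sub-region found */
}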
2558
2559 /*
2560  * A new device is added to a data core. First the device is added to the main linked list
2561  * and then allocated to a specific data core.
2562  */
2563 static int
2564 new_device(struct virtio_net *dev)
2565 {
2566         struct virtio_net_data_ll *ll_dev;
2567         int lcore, core_add = 0;
2568         uint32_t device_num_min = num_devices;
2569         struct vhost_dev *vdev;
2570         uint32_t regionidx;
2571
2572         vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2573         if (vdev == NULL) {
2574                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2575                         dev->device_fh);
2576                 return -1;
2577         }
2578         vdev->dev = dev;
2579         dev->priv = vdev;
2580
2581         if (zero_copy) {
2582                 vdev->nregions_hpa = dev->mem->nregions;
2583                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2584                         vdev->nregions_hpa
2585                                 += check_hpa_regions(
2586                                         dev->mem->regions[regionidx].guest_phys_address
2587                                         + dev->mem->regions[regionidx].address_offset,
2588                                         dev->mem->regions[regionidx].memory_size);
2589
2590                 }
2591
2592                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2593                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2594                         CACHE_LINE_SIZE);
2595                 if (vdev->regions_hpa == NULL) {
2596                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2597                         rte_free(vdev);
2598                         return -1;
2599                 }
2600
2601                 if (fill_hpa_memory_regions(vdev->regions_hpa, dev->mem)
2602                         != vdev->nregions_hpa) {
2606                         RTE_LOG(ERR, VHOST_CONFIG,
2607                                 "hpa memory regions number mismatch: "
2608                                 "[%d]\n", vdev->nregions_hpa);
2609                         rte_free(vdev->regions_hpa);
2610                         rte_free(vdev);
2611                         return -1;
2612                 }
2613         }
2614
2616         /* Add device to main ll */
2617         ll_dev = get_data_ll_free_entry(&ll_root_free);
2618         if (ll_dev == NULL) {
2619                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2620                         "of %d devices per core has been reached\n",
2621                         dev->device_fh, num_devices);
2622                 if (vdev->regions_hpa)
2623                         rte_free(vdev->regions_hpa);
2624                 rte_free(vdev);
2625                 return -1;
2626         }
2627         ll_dev->vdev = vdev;
2628         add_data_ll_entry(&ll_root_used, ll_dev);
2629         vdev->vmdq_rx_q
2630                 = dev->device_fh * (num_queues / num_devices);
2631
2632         if (zero_copy) {
2633                 uint32_t index = vdev->vmdq_rx_q;
2634                 uint32_t count_in_ring, i;
2635                 struct mbuf_table *tx_q;
2636
2637                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2638
2639                 LOG_DEBUG(VHOST_CONFIG,
2640                         "(%"PRIu64") in new_device: mbuf count in mempool "
2641                         "before attach is: %d\n",
2642                         dev->device_fh,
2643                         rte_mempool_count(vpool_array[index].pool));
2644                 LOG_DEBUG(VHOST_CONFIG,
2645                         "(%"PRIu64") in new_device: mbuf count in ring "
2646                         "before attach is: %d\n",
2647                         dev->device_fh, count_in_ring);
2648
2649                 /*
2650                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2651                  */
2652                 for (i = 0; i < count_in_ring; i++)
2653                         attach_rxmbuf_zcp(dev);
2654
2655                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2656                         "mempool after attach is: %d\n",
2657                         dev->device_fh,
2658                         rte_mempool_count(vpool_array[index].pool));
2659                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2660                         "ring after attach is: %d\n",
2661                         dev->device_fh,
2662                         rte_ring_count(vpool_array[index].ring));
2663
2664                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2665                 tx_q->txq_id = vdev->vmdq_rx_q;
2666
2667                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2668                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2669
2670                         LOG_DEBUG(VHOST_CONFIG,
2671                                 "(%"PRIu64") In new_device: Failed to start "
2672                                 "tx queue:%d\n",
2673                                 dev->device_fh, vdev->vmdq_rx_q);
2674
2675                         mbuf_destroy_zcp(vpool);
2676                         rte_free(vdev->regions_hpa);
2677                         rte_free(vdev);
2678                         return -1;
2679                 }
2680
2681                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2682                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2683
2684                         LOG_DEBUG(VHOST_CONFIG,
2685                                 "(%"PRIu64") In new_device: Failed to start "
2686                                 "rx queue:%d\n",
2687                                 dev->device_fh, vdev->vmdq_rx_q);
2688
2689                         /* Stop the TX queue. */
2690                         if (rte_eth_dev_tx_queue_stop(ports[0],
2691                                 vdev->vmdq_rx_q) != 0) {
2692                                 LOG_DEBUG(VHOST_CONFIG,
2693                                         "(%"PRIu64") In new_device: Failed to "
2694                                         "stop tx queue:%d\n",
2695                                         dev->device_fh, vdev->vmdq_rx_q);
2696                         }
2697
2698                         mbuf_destroy_zcp(vpool);
2699                         rte_free(vdev->regions_hpa);
2700                         rte_free(vdev);
2701                         return -1;
2702                 }
2703
2704         }
2705
2706         /*reset ready flag*/
2707         vdev->ready = DEVICE_MAC_LEARNING;
2708         vdev->remove = 0;
2709
2710         /* Find a suitable lcore to add the device. */
2711         RTE_LCORE_FOREACH_SLAVE(lcore) {
2712                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2713                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2714                         core_add = lcore;
2715                 }
2716         }
2717         /* Add device to lcore ll */
2718         ll_dev->dev->coreid = core_add;
2719         ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2720         if (ll_dev == NULL) {
2721                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2722                 vdev->ready = DEVICE_SAFE_REMOVE;
2723                 destroy_device(dev);
2724                 if (vdev->regions_hpa)
2725                         rte_free(vdev->regions_hpa);
2726                 rte_free(vdev);
2727                 return -1;
2728         }
2729         ll_dev->vdev = vdev;
2730         vdev->coreid = core_add;
2731
2732         add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2733
2734         /* Initialize device stats */
2735         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2736
2737         /* Disable notifications. */
2738         set_irq_status(dev);
2739         lcore_info[vdev->coreid].lcore_ll->device_num++;
2740         dev->flags |= VIRTIO_DEV_RUNNING;
2741
2742         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2743
2744         return 0;
2745 }
2746
2747 /*
2748  * These callbacks allow devices to be added to the data core when
2749  * configuration has been fully completed.
2750  */
2751 static const struct virtio_net_device_ops virtio_net_device_ops =
2752 {
2753         .new_device =  new_device,
2754         .destroy_device = destroy_device,
2755 };
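
/*
 * Minimal usage sketch (the same sequence appears in MAIN below): the ops
 * table is handed to the vhost library, which then invokes new_device()
 * and destroy_device() as guests come and go:
 *
 *	rte_vhost_driver_register((char *)&dev_basename);
 *	rte_vhost_driver_callback_register(&virtio_net_device_ops);
 *	rte_vhost_driver_session_start();
 */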
2756
2757 /*
2758  * This thread wakes up periodically to print statistics, if the user has
2759  * enabled them.
2760  */
2761 static void
2762 print_stats(void)
2763 {
2764         struct virtio_net_data_ll *dev_ll;
2765         uint64_t tx_dropped, rx_dropped;
2766         uint64_t tx, tx_total, rx, rx_total;
2767         uint32_t device_fh;
2768         const char clr[] = { 27, '[', '2', 'J', '\0' }; /* ANSI clear screen: ESC [2J */
2769         const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' }; /* ANSI cursor to top-left: ESC [1;1H */
2770
2771         while (1) {
2772                 sleep(enable_stats);
2773
2774                 /* Clear screen and move to top left */
2775                 printf("%s%s", clr, top_left);
2776
2777                 printf("\nDevice statistics ====================================");
2778
2779                 dev_ll = ll_root_used;
2780                 while (dev_ll != NULL) {
2781                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2782                         tx_total = dev_statistics[device_fh].tx_total;
2783                         tx = dev_statistics[device_fh].tx;
2784                         tx_dropped = tx_total - tx;
2785                         if (zero_copy == 0) {
2786                                 rx_total = rte_atomic64_read(
2787                                         &dev_statistics[device_fh].rx_total_atomic);
2788                                 rx = rte_atomic64_read(
2789                                         &dev_statistics[device_fh].rx_atomic);
2790                         } else {
2791                                 rx_total = dev_statistics[device_fh].rx_total;
2792                                 rx = dev_statistics[device_fh].rx;
2793                         }
2794                         rx_dropped = rx_total - rx;
2795
2796                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2797                                         "\nTX total:            %"PRIu64""
2798                                         "\nTX dropped:          %"PRIu64""
2799                                         "\nTX successful:               %"PRIu64""
2800                                         "\nRX total:            %"PRIu64""
2801                                         "\nRX dropped:          %"PRIu64""
2802                                         "\nRX successful:               %"PRIu64"",
2803                                         device_fh,
2804                                         tx_total,
2805                                         tx_dropped,
2806                                         tx,
2807                                         rx_total,
2808                                         rx_dropped,
2809                                         rx);
2810
2811                         dev_ll = dev_ll->next;
2812                 }
2813                 printf("\n======================================================\n");
2814         }
2815 }
2816
2817 static void
2818 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2819         char *ring_name, uint32_t nb_mbuf)
2820 {
2821         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2822         vpool_array[index].pool
2823                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2824                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2825                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2826                 rte_pktmbuf_init, NULL, socket, 0);
2827         if (vpool_array[index].pool != NULL) {
2828                 vpool_array[index].ring
2829                         = rte_ring_create(ring_name,
2830                                 rte_align32pow2(nb_mbuf + 1),
2831                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2832                 if (likely(vpool_array[index].ring != NULL)) {
2833                         LOG_DEBUG(VHOST_CONFIG,
2834                                 "in setup_mempool_tbl: mbuf count in "
2835                                 "mempool is: %d\n",
2836                                 rte_mempool_count(vpool_array[index].pool));
2837                         LOG_DEBUG(VHOST_CONFIG,
2838                                 "in setup_mempool_tbl: mbuf count in "
2839                                 "ring   is: %d\n",
2840                                 rte_ring_count(vpool_array[index].ring));
2841                 } else {
2842                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2843                                 ring_name);
2844                 }
2845
2846                 /* Need to consider headroom. */
2847                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2848         } else {
2849                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2850         }
2851 }
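
/*
 * Usage note: in the zero-copy path MAIN below builds one pool/ring pair
 * per queue; indices 0..MAX_QUEUES-1 hold the RX mbuf pools and indices
 * MAX_QUEUES..2*MAX_QUEUES-1 the TX mbuf pools, e.g.
 *
 *	setup_mempool_tbl(rte_socket_id(), queue_id,
 *		pool_name, ring_name, nb_mbuf);             (RX pools)
 *	setup_mempool_tbl(rte_socket_id(), queue_id + MAX_QUEUES,
 *		pool_name, ring_name, nb_mbuf);             (TX pools)
 */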
2852
2854 /*
2855  * Main function: performs initialisation and calls the per-lcore functions.
2856  * The CUSE device is also registered here to handle the IOCTLs.
2857  */
2858 int
2859 MAIN(int argc, char *argv[])
2860 {
2861         struct rte_mempool *mbuf_pool = NULL;
2862         unsigned lcore_id, core_id = 0;
2863         unsigned nb_ports, valid_num_ports;
2864         int ret;
2865         uint8_t portid, queue_id = 0;
2866         static pthread_t tid;
2867
2868         /* init EAL */
2869         ret = rte_eal_init(argc, argv);
2870         if (ret < 0)
2871                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2872         argc -= ret;
2873         argv += ret;
2874
2875         /* parse app arguments */
2876         ret = us_vhost_parse_args(argc, argv);
2877         if (ret < 0)
2878                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2879
2880         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2881                 if (rte_lcore_is_enabled(lcore_id))
2882                         lcore_ids[core_id++] = lcore_id;
2883
2884         if (rte_lcore_count() > RTE_MAX_LCORE)
2885                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2886
2887         /* set the number of switching cores available */
2888         num_switching_cores = rte_lcore_count()-1;
2889
2890         /* Get the number of physical ports. */
2891         nb_ports = rte_eth_dev_count();
2892         if (nb_ports > RTE_MAX_ETHPORTS)
2893                 nb_ports = RTE_MAX_ETHPORTS;
2894
2895         /*
2896          * Update the global var NUM_PORTS and the global array PORTS,
2897          * and get the value of VALID_NUM_PORTS according to the number of system ports.
2898          */
2899         valid_num_ports = check_ports_num(nb_ports);
2900
2901         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2902                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2903                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2904                 return -1;
2905         }
2906
2907         if (zero_copy == 0) {
2908                 /* Create the mbuf pool. */
2909                 mbuf_pool = rte_mempool_create(
2910                                 "MBUF_POOL",
2911                                 NUM_MBUFS_PER_PORT
2912                                 * valid_num_ports,
2913                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2914                                 sizeof(struct rte_pktmbuf_pool_private),
2915                                 rte_pktmbuf_pool_init, NULL,
2916                                 rte_pktmbuf_init, NULL,
2917                                 rte_socket_id(), 0);
2918                 if (mbuf_pool == NULL)
2919                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2920
2921                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2922                         vpool_array[queue_id].pool = mbuf_pool;
2923
2924                 if (vm2vm_mode == VM2VM_HARDWARE) {
2925                         /* Enable VT loop back to let L2 switch to do it. */
2926                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2927                         LOG_DEBUG(VHOST_CONFIG,
2928                                 "Enable loop back for L2 switch in vmdq.\n");
2929                 }
2930         } else {
2931                 uint32_t nb_mbuf;
2932                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2933                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2934
2935                 /*
2936                  * Zero copy defers queue RX/TX start to the time when guest
2937                  * finishes its startup and packet buffers from that guest are
2938                  * available.
2939                  */
2940                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2941                 rx_conf_default.rx_drop_en = 0;
2942                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
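                /*
                 * The deferred queues are started later, per device, in
                 * new_device() via rte_eth_dev_rx_queue_start() and
                 * rte_eth_dev_tx_queue_start().
                 */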
2943                 nb_mbuf = num_rx_descriptor
2944                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2945                         + num_switching_cores * MAX_PKT_BURST;
2946
2947                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2948                         snprintf(pool_name, sizeof(pool_name),
2949                                 "rxmbuf_pool_%u", queue_id);
2950                         snprintf(ring_name, sizeof(ring_name),
2951                                 "rxmbuf_ring_%u", queue_id);
2952                         setup_mempool_tbl(rte_socket_id(), queue_id,
2953                                 pool_name, ring_name, nb_mbuf);
2954                 }
2955
2956                 nb_mbuf = num_tx_descriptor
2957                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2958                                 + num_switching_cores * MAX_PKT_BURST;
2959
2960                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2961                         snprintf(pool_name, sizeof(pool_name),
2962                                 "txmbuf_pool_%u", queue_id);
2963                         snprintf(ring_name, sizeof(ring_name),
2964                                 "txmbuf_ring_%u", queue_id);
2965                         setup_mempool_tbl(rte_socket_id(),
2966                                 (queue_id + MAX_QUEUES),
2967                                 pool_name, ring_name, nb_mbuf);
2968                 }
2969
2970                 if (vm2vm_mode == VM2VM_HARDWARE) {
2971                         /* Enable VT loop back to let L2 switch to do it. */
2972                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2973                         LOG_DEBUG(VHOST_CONFIG,
2974                                 "Enable loop back for L2 switch in vmdq.\n");
2975                 }
2976         }
2977         /* Set log level. */
2978         rte_set_log_level(LOG_LEVEL);
2979
2980         /* initialize all ports */
2981         for (portid = 0; portid < nb_ports; portid++) {
2982                 /* skip ports that are not enabled */
2983                 if ((enabled_port_mask & (1 << portid)) == 0) {
2984                         RTE_LOG(INFO, VHOST_PORT,
2985                                 "Skipping disabled port %d\n", portid);
2986                         continue;
2987                 }
2988                 if (port_init(portid) != 0)
2989                         rte_exit(EXIT_FAILURE,
2990                                 "Cannot initialize network ports\n");
2991         }
2992
2993         /* Initialise all linked lists. */
2994         if (init_data_ll() == -1)
2995                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2996
2997         /* Initialize device stats */
2998         memset(&dev_statistics, 0, sizeof(dev_statistics));
2999
3000         /* Enable stats if the user option is set. */
3001         if (enable_stats)
3002                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3003
3004         /* Launch all data cores. */
3005         if (zero_copy == 0) {
3006                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3007                         rte_eal_remote_launch(switch_worker,
3008                                 mbuf_pool, lcore_id);
3009                 }
3010         } else {
3011                 uint32_t count_in_mempool, index, i;
3012                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3013                         /* For all RX and TX queues. */
3014                         count_in_mempool
3015                                 = rte_mempool_count(vpool_array[index].pool);
3016
3017                         /*
3018                          * Transfer all un-attached mbufs from vpool.pool
3019                          * to vpool.ring.
3020                          */
3021                         for (i = 0; i < count_in_mempool; i++) {
3022                                 struct rte_mbuf *mbuf
3023                                         = __rte_mbuf_raw_alloc(
3024                                                 vpool_array[index].pool);
3025                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3026                                                 (void *)mbuf);
3027                         }
3028
3029                         LOG_DEBUG(VHOST_CONFIG,
3030                                 "in MAIN: mbuf count in mempool at initial "
3031                                 "is: %d\n", count_in_mempool);
3032                         LOG_DEBUG(VHOST_CONFIG,
3033                                 "in MAIN: mbuf count in ring at initial is: "
3034                                 "%d\n",
3035                                 rte_ring_count(vpool_array[index].ring));
3036                 }
3037
3038                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3039                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3040                                 lcore_id);
3041         }
3042
3043         /* Register CUSE device to handle IOCTLs. */
3044         ret = rte_vhost_driver_register((char *)&dev_basename);
3045         if (ret != 0)
3046                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3047
3048         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3049
3050         /* Start CUSE session. */
3051         rte_vhost_driver_session_start();
3052         return 0;
3054 }
3055