examples/vhost: allow mergeable packets with vector ixgbe
[dpdk.git] examples/vhost/main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 128
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +  \
65                             (num_switching_cores*MAX_PKT_BURST) +     \
66                             (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
67                             (num_switching_cores*MBUF_CACHE_SIZE))
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
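/*
 * Each mbuf thus provides 2048 bytes of frame data after the rte_mbuf
 * metadata and RTE_PKTMBUF_HEADROOM (128 bytes by default), so one buffer
 * comfortably holds a standard 1518-byte Ethernet frame.
 */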
71
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers, and
75  * vhost uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79         + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100
101 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
103
104 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
106
107 #define JUMBO_FRAME_MAX_SIZE    0x2600
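/* 0x2600 is 9728 bytes, the largest frame size the 82599 (ixgbe) can receive. */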
108
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX           1
112 #define DEVICE_SAFE_REMOVE  2
113
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121
122 /*
123  * These two macros need refinement for the legacy and DPDK-based front
124  * ends: take the max vring avail descriptors/entries from the guest,
125  * subtract MAX_PKT_BURST, then round to a power of 2.
126  */
127 /*
128  * For the legacy front end: 128 descriptors,
129  * half for virtio headers, the other half for mbufs.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136                 + sizeof(struct rte_mbuf)))
137
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
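/* Example: POWEROF2(64) is true and POWEROF2(48) is false; note that 0 also passes this test. */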
140
141 #define INVALID_PORT_ID 0xFF
142
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
157
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
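/*
 * On targets with 64-byte cache lines this evaluates to 4, since
 * struct vring_desc is 16 bytes (8-byte addr, 4-byte len, 2-byte flags,
 * 2-byte next).
 */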
160
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163
164 /* Number of switching cores enabled. */
165 static uint32_t num_switching_cores = 0;
166
167 /* Number of devices/queues to support. */
168 static uint32_t num_queues = 0;
169 static uint32_t num_devices;
170
171 /*
172  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
173  * descriptors. Disabled by default.
174  */
175 static uint32_t zero_copy;
176 static int mergeable;
177
178 /* Number of descriptors to use. */
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181
182 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
183 #define MAX_RING_DESC 4096
184
185 struct vpool {
186         struct rte_mempool *pool;
187         struct rte_ring *ring;
188         uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
190
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193         VM2VM_DISABLED = 0,
194         VM2VM_SOFTWARE = 1,
195         VM2VM_HARDWARE = 2,
196         VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202         PHYS_ADDR_CONTINUOUS = 0,
203         PHYS_ADDR_CROSS_SUBREG = 1,
204         PHYS_ADDR_INVALID = 2,
205         PHYS_ADDR_LAST
206 } hpa_type;
207
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in microseconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219
220
221 /* Default configuration for rx and tx thresholds etc. */
222 static struct rte_eth_rxconf rx_conf_default = {
223         .rx_thresh = {
224                 .pthresh = RX_PTHRESH,
225                 .hthresh = RX_HTHRESH,
226                 .wthresh = RX_WTHRESH,
227         },
228         .rx_drop_en = 1,
229 };
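/*
 * rx_drop_en makes the NIC drop packets when no RX descriptors are
 * available instead of stalling; with VMDQ this helps keep one slow pool's
 * full queue from backing up the others.
 */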
230
231 /*
232  * These default values are optimized for use with the Intel(R) 82599 10 GbE
233  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
234  * network controllers and/or network drivers.
235  */
236 static struct rte_eth_txconf tx_conf_default = {
237         .tx_thresh = {
238                 .pthresh = TX_PTHRESH,
239                 .hthresh = TX_HTHRESH,
240                 .wthresh = TX_WTHRESH,
241         },
242         .tx_free_thresh = 0, /* Use PMD default values */
243         .tx_rs_thresh = 0, /* Use PMD default values */
244 };
245
246 /* Empty VMDQ configuration structure. Filled in programmatically. */
247 static struct rte_eth_conf vmdq_conf_default = {
248         .rxmode = {
249                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
250                 .split_hdr_size = 0,
251                 .header_split   = 0, /**< Header Split disabled */
252                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
253                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
254                 /*
255                  * Necessary for 1G NICs such as the I350: this fixes a bug
256                  * where IPv4 forwarding in the guest could not forward
257                  * packets from one virtio device to another.
258                  */
259                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
260                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
261                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
262         },
263
264         .txmode = {
265                 .mq_mode = ETH_MQ_TX_NONE,
266         },
267         .rx_adv_conf = {
268                 /*
269                  * should be overridden separately in code with
270                  * appropriate values
271                  */
272                 .vmdq_rx_conf = {
273                         .nb_queue_pools = ETH_8_POOLS,
274                         .enable_default_pool = 0,
275                         .default_pool = 0,
276                         .nb_pool_maps = 0,
277                         .pool_map = {{0, 0},},
278                 },
279         },
280 };
281
282 static unsigned lcore_ids[RTE_MAX_LCORE];
283 static uint8_t ports[RTE_MAX_ETHPORTS];
284 static unsigned num_ports = 0; /**< The number of ports specified in command line */
285
286 static const uint16_t external_pkt_default_vlan_tag = 2000;
287 const uint16_t vlan_tags[] = {
288         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
289         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
290         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
291         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
292         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
293         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
294         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
295         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
296 };
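/*
 * Each virtio device is assigned vlan_tags[device_fh], i.e. device 0 gets
 * VLAN 1000, device 1 gets VLAN 1001, and so on (see link_vmdq()).
 */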
297
298 /* ethernet addresses of ports */
299 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
300
301 /* heads for the main used and free linked lists for the data path. */
302 static struct virtio_net_data_ll *ll_root_used = NULL;
303 static struct virtio_net_data_ll *ll_root_free = NULL;
304
305 /* Array of data core structures containing information on individual core linked lists. */
306 static struct lcore_info lcore_info[RTE_MAX_LCORE];
307
308 /* Used for queueing bursts of TX packets. */
309 struct mbuf_table {
310         unsigned len;
311         unsigned txq_id;
312         struct rte_mbuf *m_table[MAX_PKT_BURST];
313 };
314
315 /* TX queue for each data core. */
316 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
317
318 /* TX queue for each virtio device for zero copy. */
319 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
320
321 /* Vlan header struct used to insert vlan tags on TX. */
322 struct vlan_ethhdr {
323         unsigned char   h_dest[ETH_ALEN];
324         unsigned char   h_source[ETH_ALEN];
325         __be16          h_vlan_proto;
326         __be16          h_vlan_TCI;
327         __be16          h_vlan_encapsulated_proto;
328 };
329
330 /* IPv4 Header */
331 struct ipv4_hdr {
332         uint8_t  version_ihl;           /**< version and header length */
333         uint8_t  type_of_service;       /**< type of service */
334         uint16_t total_length;          /**< length of packet */
335         uint16_t packet_id;             /**< packet ID */
336         uint16_t fragment_offset;       /**< fragmentation offset */
337         uint8_t  time_to_live;          /**< time to live */
338         uint8_t  next_proto_id;         /**< protocol ID */
339         uint16_t hdr_checksum;          /**< header checksum */
340         uint32_t src_addr;              /**< source address */
341         uint32_t dst_addr;              /**< destination address */
342 } __attribute__((__packed__));
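/*
 * version_ihl packs the IP version into the high nibble and the header
 * length (in 32-bit words) into the low nibble, e.g. 0x45 for a plain
 * 20-byte IPv4 header.
 */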
343
344 /* Header lengths. */
345 #define VLAN_HLEN       4
346 #define VLAN_ETH_HLEN   18
347
348 /* Per-device statistics struct */
349 struct device_statistics {
350         uint64_t tx_total;
351         rte_atomic64_t rx_total_atomic;
352         uint64_t rx_total;
353         uint64_t tx;
354         rte_atomic64_t rx_atomic;
355         uint64_t rx;
356 } __rte_cache_aligned;
357 struct device_statistics dev_statistics[MAX_DEVICES];
358
359 /*
360  * Builds up the correct configuration for VMDQ VLAN pool map
361  * according to the pool & queue limits.
362  */
363 static inline int
364 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
365 {
366         struct rte_eth_vmdq_rx_conf conf;
367         unsigned i;
368
369         memset(&conf, 0, sizeof(conf));
370         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
371         conf.nb_pool_maps = num_devices;
372         conf.enable_loop_back =
373                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
374
375         for (i = 0; i < conf.nb_pool_maps; i++) {
376                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
377                 conf.pool_map[i].pools = (1UL << i);
378         }
379
380         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
381         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
382                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
383         return 0;
384 }
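/*
 * The resulting map is one pool per device: VLAN 1000 steers to pool 0
 * (bit 0x1), VLAN 1001 to pool 1 (bit 0x2), and so on.
 */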
385
386 /*
387  * Validate the device number against the max pool number obtained from
388  * dev_info. If the device number is invalid, print an error message and
389  * return -1. Each device must have its own pool.
390  */
391 static inline int
392 validate_num_devices(uint32_t max_nb_devices)
393 {
394         if (num_devices > max_nb_devices) {
395                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
396                 return -1;
397         }
398         return 0;
399 }
400
401 /*
402  * Initialises a given port using global settings and with the rx buffers
403  * coming from the mbuf_pool passed as parameter
404  */
405 static inline int
406 port_init(uint8_t port)
407 {
408         struct rte_eth_dev_info dev_info;
409         struct rte_eth_conf port_conf;
410         uint16_t rx_rings, tx_rings;
411         uint16_t rx_ring_size, tx_ring_size;
412         int retval;
413         uint16_t q;
414
415         /* The max pool number from dev_info is used to validate the pool number specified on the command line */
416         rte_eth_dev_info_get(port, &dev_info);
417
418         /* Configure the number of supported virtio devices based on VMDQ limits */
419         num_devices = dev_info.max_vmdq_pools;
420         num_queues = dev_info.max_rx_queues;
421
422         if (zero_copy) {
423                 rx_ring_size = num_rx_descriptor;
424                 tx_ring_size = num_tx_descriptor;
425                 tx_rings = dev_info.max_tx_queues;
426         } else {
427                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
428                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
429                 tx_rings = (uint16_t)rte_lcore_count();
430         }
431
432         retval = validate_num_devices(MAX_DEVICES);
433         if (retval < 0)
434                 return retval;
435
436         /* Get port configuration. */
437         retval = get_eth_conf(&port_conf, num_devices);
438         if (retval < 0)
439                 return retval;
440
441         if (port >= rte_eth_dev_count()) return -1;
442
443         rx_rings = (uint16_t)num_queues;
444         /* Configure ethernet device. */
445         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
446         if (retval != 0)
447                 return retval;
448
449         /* Setup the queues. */
450         for (q = 0; q < rx_rings; q ++) {
451                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452                                                 rte_eth_dev_socket_id(port), &rx_conf_default,
453                                                 vpool_array[q].pool);
454                 if (retval < 0)
455                         return retval;
456         }
457         for (q = 0; q < tx_rings; q ++) {
458                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
459                                                 rte_eth_dev_socket_id(port), &tx_conf_default);
460                 if (retval < 0)
461                         return retval;
462         }
463
464         /* Start the device. */
465         retval  = rte_eth_dev_start(port);
466         if (retval < 0) {
467                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
468                 return retval;
469         }
470
471         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
472         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
473         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
474                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
475                         (unsigned)port,
476                         vmdq_ports_eth_addr[port].addr_bytes[0],
477                         vmdq_ports_eth_addr[port].addr_bytes[1],
478                         vmdq_ports_eth_addr[port].addr_bytes[2],
479                         vmdq_ports_eth_addr[port].addr_bytes[3],
480                         vmdq_ports_eth_addr[port].addr_bytes[4],
481                         vmdq_ports_eth_addr[port].addr_bytes[5]);
482
483         return 0;
484 }
485
486 /*
487  * Set character device basename.
488  */
489 static int
490 us_vhost_parse_basename(const char *q_arg)
491 {
492         /* Check that the basename fits within the buffer. */
493
494         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
495                 return -1;
496         else
497                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
498
499         return 0;
500 }
501
502 /*
503  * Parse the portmask provided at run time.
504  */
505 static int
506 parse_portmask(const char *portmask)
507 {
508         char *end = NULL;
509         unsigned long pm;
510
511         errno = 0;
512
513         /* parse hexadecimal string */
514         pm = strtoul(portmask, &end, 16);
515         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
516                 return -1;
517
518         if (pm == 0)
519                 return -1;
520
521         return pm;
522
523 }
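/* Example: "-p 0x1" enables only port 0; "-p 0x3" would enable ports 0 and 1. */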
524
525 /*
526  * Parse num options at run time.
527  */
528 static int
529 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
530 {
531         char *end = NULL;
532         unsigned long num;
533
534         errno = 0;
535
536         /* parse unsigned int string */
537         num = strtoul(q_arg, &end, 10);
538         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
539                 return -1;
540
541         if (num > max_valid_value)
542                 return -1;
543
544         return num;
545
546 }
547
548 /*
549  * Display usage
550  */
551 static void
552 us_vhost_usage(const char *prgname)
553 {
554         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
555         "               --vm2vm [0|1|2]\n"
556         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
557         "               --dev-basename <name>\n"
558         "               --nb-devices ND\n"
559         "               -p PORTMASK: Set mask for ports to be used by application\n"
560         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
561         "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
562         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective if retries on RX are enabled\n"
563         "               --rx-retry-num [0-N]: the number of retries on RX. Only effective if retries on RX are enabled\n"
564         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
565         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
566         "               --dev-basename: The basename to be used for the character device.\n"
567         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
568                         "zero copy\n"
569         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
570                         "used only when zero copy is enabled.\n"
571         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
572                         "used only when zero copy is enabled.\n",
573                prgname);
574 }
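/*
 * Illustrative invocation (assuming the example builds as vhost-switch):
 *
 *   ./build/vhost-switch -c f -n 4 -- -p 0x1 --mergeable 1 --stats 2
 *
 * forwards on port 0 with mergeable RX buffers and prints stats every 2 s.
 */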
575
576 /*
577  * Parse the arguments given in the command line of the application.
578  */
579 static int
580 us_vhost_parse_args(int argc, char **argv)
581 {
582         int opt, ret;
583         int option_index;
584         unsigned i;
585         const char *prgname = argv[0];
586         static struct option long_option[] = {
587                 {"vm2vm", required_argument, NULL, 0},
588                 {"rx-retry", required_argument, NULL, 0},
589                 {"rx-retry-delay", required_argument, NULL, 0},
590                 {"rx-retry-num", required_argument, NULL, 0},
591                 {"mergeable", required_argument, NULL, 0},
592                 {"stats", required_argument, NULL, 0},
593                 {"dev-basename", required_argument, NULL, 0},
594                 {"zero-copy", required_argument, NULL, 0},
595                 {"rx-desc-num", required_argument, NULL, 0},
596                 {"tx-desc-num", required_argument, NULL, 0},
597                 {NULL, 0, 0, 0},
598         };
599
600         /* Parse command line */
601         while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
602                 switch (opt) {
603                 /* Portmask */
604                 case 'p':
605                         enabled_port_mask = parse_portmask(optarg);
606                         if (enabled_port_mask == 0) {
607                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608                                 us_vhost_usage(prgname);
609                                 return -1;
610                         }
611                         break;
612
613                 case 0:
614                         /* Enable/disable vm2vm comms. */
615                         if (!strncmp(long_option[option_index].name, "vm2vm",
616                                 MAX_LONG_OPT_SZ)) {
617                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
618                                 if (ret == -1) {
619                                         RTE_LOG(INFO, VHOST_CONFIG,
620                                                 "Invalid argument for "
621                                                 "vm2vm [0|1|2]\n");
622                                         us_vhost_usage(prgname);
623                                         return -1;
624                                 } else {
625                                         vm2vm_mode = (vm2vm_type)ret;
626                                 }
627                         }
628
629                         /* Enable/disable retries on RX. */
630                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
631                                 ret = parse_num_opt(optarg, 1);
632                                 if (ret == -1) {
633                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
634                                         us_vhost_usage(prgname);
635                                         return -1;
636                                 } else {
637                                         enable_retry = ret;
638                                 }
639                         }
640
641                         /* Specify the retry delay time (in microseconds) on RX. */
642                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
643                                 ret = parse_num_opt(optarg, INT32_MAX);
644                                 if (ret == -1) {
645                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
646                                         us_vhost_usage(prgname);
647                                         return -1;
648                                 } else {
649                                         burst_rx_delay_time = ret;
650                                 }
651                         }
652
653                         /* Specify the retries number on RX. */
654                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
655                                 ret = parse_num_opt(optarg, INT32_MAX);
656                                 if (ret == -1) {
657                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
658                                         us_vhost_usage(prgname);
659                                         return -1;
660                                 } else {
661                                         burst_rx_retry_num = ret;
662                                 }
663                         }
664
665                         /* Enable/disable RX mergeable buffers. */
666                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
667                                 ret = parse_num_opt(optarg, 1);
668                                 if (ret == -1) {
669                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
670                                         us_vhost_usage(prgname);
671                                         return -1;
672                                 } else {
673                                         mergeable = !!ret;
674                                         if (ret) {
675                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
676                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
677                                                         = JUMBO_FRAME_MAX_SIZE;
678                                         }
679                                 }
680                         }
681
682                         /* Enable/disable stats. */
683                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
684                                 ret = parse_num_opt(optarg, INT32_MAX);
685                                 if (ret == -1) {
686                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0-N]\n");
687                                         us_vhost_usage(prgname);
688                                         return -1;
689                                 } else {
690                                         enable_stats = ret;
691                                 }
692                         }
693
694                         /* Set character device basename. */
695                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
696                                 if (us_vhost_parse_basename(optarg) == -1) {
697                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
698                                         us_vhost_usage(prgname);
699                                         return -1;
700                                 }
701                         }
702
703                         /* Enable/disable rx/tx zero copy. */
704                         if (!strncmp(long_option[option_index].name,
705                                 "zero-copy", MAX_LONG_OPT_SZ)) {
706                                 ret = parse_num_opt(optarg, 1);
707                                 if (ret == -1) {
708                                         RTE_LOG(INFO, VHOST_CONFIG,
709                                                 "Invalid argument"
710                                                 " for zero-copy [0|1]\n");
711                                         us_vhost_usage(prgname);
712                                         return -1;
713                                 } else
714                                         zero_copy = ret;
715
716                                 if (zero_copy) {
717 #ifdef RTE_MBUF_REFCNT
718                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
719                                         "zero copy vhost APP, please "
720                                         "disable RTE_MBUF_REFCNT\n"
721                                         "in config file and then rebuild DPDK "
722                                         "core lib!\n"
723                                         "Otherwise please disable zero copy "
724                                         "flag in command line!\n");
725                                         return -1;
726 #endif
727                                 }
728                         }
729
730                         /* Specify the descriptor number on RX. */
731                         if (!strncmp(long_option[option_index].name,
732                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
733                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
734                                 if ((ret == -1) || (!POWEROF2(ret))) {
735                                         RTE_LOG(INFO, VHOST_CONFIG,
736                                         "Invalid argument for rx-desc-num [0-N], "
737                                         "power of 2 required.\n");
738                                         us_vhost_usage(prgname);
739                                         return -1;
740                                 } else {
741                                         num_rx_descriptor = ret;
742                                 }
743                         }
744
745                         /* Specify the descriptor number on TX. */
746                         if (!strncmp(long_option[option_index].name,
747                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
748                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
749                                 if ((ret == -1) || (!POWEROF2(ret))) {
750                                         RTE_LOG(INFO, VHOST_CONFIG,
751                                         "Invalid argument for tx-desc-num [0-N], "
752                                         "power of 2 required.\n");
753                                         us_vhost_usage(prgname);
754                                         return -1;
755                                 } else {
756                                         num_tx_descriptor = ret;
757                                 }
758                         }
759
760                         break;
761
762                         /* Invalid option - print options. */
763                 default:
764                         us_vhost_usage(prgname);
765                         return -1;
766                 }
767         }
768
769         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
770                 if (enabled_port_mask & (1 << i))
771                         ports[num_ports++] = (uint8_t)i;
772         }
773
774         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
775                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
776                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
777                 return -1;
778         }
779
780         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
781                 RTE_LOG(INFO, VHOST_PORT,
782                         "Vhost zero copy doesn't support software vm2vm, "
783                         "please specify '--vm2vm 2' to use hardware vm2vm.\n");
784                 return -1;
785         }
786
787         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
788                 RTE_LOG(INFO, VHOST_PORT,
789                         "Vhost zero copy doesn't support jumbo frames, "
790                         "please specify '--mergeable 0' to disable the "
791                         "mergeable feature.\n");
792                 return -1;
793         }
794
795         return 0;
796 }
797
798 /*
799  * Update the global variable num_ports and the ports array according to the
800  * number of system ports, and return the number of valid ports.
801  */
802 static unsigned check_ports_num(unsigned nb_ports)
803 {
804         unsigned valid_num_ports = num_ports;
805         unsigned portid;
806
807         if (num_ports > nb_ports) {
808                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
809                         num_ports, nb_ports);
810                 num_ports = nb_ports;
811         }
812
813         for (portid = 0; portid < num_ports; portid ++) {
814                 if (ports[portid] >= nb_ports) {
815                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
816                                 ports[portid], (nb_ports - 1));
817                         ports[portid] = INVALID_PORT_ID;
818                         valid_num_ports--;
819                 }
820         }
821         return valid_num_ports;
822 }
823
824 /*
825  * Macro to print out packet contents. Wrapped in debug define so that the
826  * data path is not affected when debug is disabled.
827  */
828 #ifdef DEBUG
829 #define PRINT_PACKET(device, addr, size, header) do {                  \
830         char *pkt_addr = (char *)(addr);                                \
831         unsigned int index;                                             \
832         char packet[MAX_PRINT_BUFF];                                    \
833                                                                         \
834         if ((header))                                                   \
835                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
836         else                                                            \
837                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
838         for (index = 0; index < (size); index++) {                      \
839                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
840                         "%02hhx ", pkt_addr[index]);                    \
841         }                                                               \
842         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
843                                                                         \
844         LOG_DEBUG(VHOST_DATA, "%s", packet);                            \
845 } while (0)
846 #else
847 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
848 #endif
849
850 /*
851  * Function to convert guest physical addresses to vhost physical addresses.
852  * This is used to convert virtio buffer addresses.
853  */
854 static inline uint64_t __attribute__((always_inline))
855 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
856         uint32_t buf_len, hpa_type *addr_type)
857 {
858         struct virtio_memory_regions_hpa *region;
859         uint32_t regionidx;
860         uint64_t vhost_pa = 0;
861
862         *addr_type = PHYS_ADDR_INVALID;
863
864         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
865                 region = &vdev->regions_hpa[regionidx];
866                 if ((guest_pa >= region->guest_phys_address) &&
867                         (guest_pa <= region->guest_phys_address_end)) {
868                         vhost_pa = region->host_phys_addr_offset + guest_pa;
869                         if (likely((guest_pa + buf_len - 1)
870                                 <= region->guest_phys_address_end))
871                                 *addr_type = PHYS_ADDR_CONTINUOUS;
872                         else
873                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
874                         break;
875                 }
876         }
877
878         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
879                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
880                 (void *)(uintptr_t)vhost_pa);
881
882         return vhost_pa;
883 }
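/*
 * Usage sketch: callers translate a guest buffer address and then check the
 * returned type before touching the memory, e.g.
 *
 *   hpa_type t;
 *   uint64_t hpa = gpa_to_hpa(vdev, gpa, len, &t);
 *   if (t != PHYS_ADDR_CONTINUOUS)
 *           the buffer is invalid or crosses a sub-region boundary and
 *           cannot be used as one physically continuous block;
 */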
884
885 /*
886  * Compares a packet destination MAC address to a device MAC address.
887  */
888 static inline int __attribute__((always_inline))
889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
890 {
891         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
892 }
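/*
 * Note: this reads 8 bytes from each 6-byte address and relies on
 * MAC_ADDR_CMP (0xFFFFFFFFFFFF) to mask off the two bytes beyond the MAC
 * on little-endian hosts, comparing all six octets in one 64-bit operation.
 */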
893
894 /*
895  * This function learns the MAC address of the device and registers this along with a
896  * vlan tag to a VMDQ.
897  */
898 static int
899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
900 {
901         struct ether_hdr *pkt_hdr;
902         struct virtio_net_data_ll *dev_ll;
903         struct virtio_net *dev = vdev->dev;
904         int i, ret;
905
906         /* Learn MAC address of guest device from packet */
907         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
908
909         dev_ll = ll_root_used;
910
911         while (dev_ll != NULL) {
912                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
913                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
914                         return -1;
915                 }
916                 dev_ll = dev_ll->next;
917         }
918
919         for (i = 0; i < ETHER_ADDR_LEN; i++)
920                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
921
922         /* vlan_tag currently uses the device_id. */
923         vdev->vlan_tag = vlan_tags[dev->device_fh];
924
925         /* Print out VMDQ registration info. */
926         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
927                 dev->device_fh,
928                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
929                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
930                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
931                 vdev->vlan_tag);
932
933         /* Register the MAC address. */
934         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
935         if (ret)
936                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
937                                         dev->device_fh);
938
939         /* Enable stripping of the vlan tag as we handle routing. */
940         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
941
942         /* Set device as ready for RX. */
943         vdev->ready = DEVICE_RX;
944
945         return 0;
946 }
947
948 /*
949  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
950  * queue before disabling RX on the device.
951  */
952 static inline void
953 unlink_vmdq(struct vhost_dev *vdev)
954 {
955         unsigned i = 0;
956         unsigned rx_count;
957         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
958
959         if (vdev->ready == DEVICE_RX) {
960                 /*clear MAC and VLAN settings*/
961                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
962                 for (i = 0; i < ETHER_ADDR_LEN; i++)
963                         vdev->mac_address.addr_bytes[i] = 0;
964
965                 vdev->vlan_tag = 0;
966
967                 /*Clear out the receive buffers*/
968                 rx_count = rte_eth_rx_burst(ports[0],
969                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
970
971                 while (rx_count) {
972                         for (i = 0; i < rx_count; i++)
973                                 rte_pktmbuf_free(pkts_burst[i]);
974
975                         rx_count = rte_eth_rx_burst(ports[0],
976                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
977                 }
978
979                 vdev->ready = DEVICE_MAC_LEARNING;
980         }
981 }
982
983 /*
984  * Check if the packet destination MAC address is for a local device. If so then put
985  * the packet on that devices RX queue. If not then return.
986  */
987 static inline int __attribute__((always_inline))
988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
989 {
990         struct virtio_net_data_ll *dev_ll;
991         struct ether_hdr *pkt_hdr;
992         uint64_t ret = 0;
993         struct virtio_net *dev = vdev->dev;
994         struct virtio_net *tdev; /* destination virtio device */
995
996         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
997
998         /*get the used devices list*/
999         dev_ll = ll_root_used;
1000
1001         while (dev_ll != NULL) {
1002                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1003                                           &dev_ll->vdev->mac_address)) {
1004
1005                         /* Drop the packet if the TX packet is destined for the TX device. */
1006                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1007                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1008                                                         dev->device_fh);
1009                                 return 0;
1010                         }
1011                         tdev = dev_ll->vdev->dev;
1012
1013
1014                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1015
1016                         if (unlikely(dev_ll->vdev->remove)) {
1017                                 /*drop the packet if the device is marked for removal*/
1018                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1019                         } else {
1020                                 /*send the packet to the local virtio device*/
1021                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1022                                 if (enable_stats) {
1023                                         rte_atomic64_add(
1024                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1025                                         1);
1026                                         rte_atomic64_add(
1027                                         &dev_statistics[tdev->device_fh].rx_atomic,
1028                                         ret);
1029                                         dev_statistics[tdev->device_fh].tx_total++;
1030                                         dev_statistics[tdev->device_fh].tx += ret;
1031                                 }
1032                         }
1033
1034                         return 0;
1035                 }
1036                 dev_ll = dev_ll->next;
1037         }
1038
1039         return -1;
1040 }
1041
1042 /*
1043  * This function routes the TX packet to the correct interface. This may be a local device
1044  * or the physical port.
1045  */
1046 static inline void __attribute__((always_inline))
1047 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1048 {
1049         struct mbuf_table *tx_q;
1050         struct rte_mbuf **m_table;
1051         unsigned len, ret, offset = 0;
1052         const uint16_t lcore_id = rte_lcore_id();
1053         struct virtio_net_data_ll *dev_ll = ll_root_used;
1054         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1055         struct virtio_net *dev = vdev->dev;
1056
1057         /*check if destination is local VM*/
1058         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1059                 rte_pktmbuf_free(m);
1060                 return;
1061         }
1062
1063         if (vm2vm_mode == VM2VM_HARDWARE) {
1064                 while (dev_ll != NULL) {
1065                         if ((dev_ll->vdev->ready == DEVICE_RX)
1066                                 && ether_addr_cmp(&(pkt_hdr->d_addr),
1067                                 &dev_ll->vdev->mac_address)) {
1068                                 /*
1069                                  * Drop the packet if the TX packet is
1070                                  * destined for the TX device.
1071                                  */
1072                                 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1073                                         LOG_DEBUG(VHOST_DATA,
1074                                         "(%"PRIu64") TX: Source and destination"
1075                                         " MAC addresses are the same. Dropping "
1076                                         "packet.\n",
1077                                         dev_ll->vdev->dev->device_fh);
1078                                         rte_pktmbuf_free(m);
1079                                         return;
1080                                 }
1081                                 offset = 4;
1082                                 vlan_tag =
1083                                 (uint16_t)
1084                                 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1085
1086                                 LOG_DEBUG(VHOST_DATA,
1087                                 "(%"PRIu64") TX: pkt to local VM device id:"
1088                                 "(%"PRIu64") vlan tag: %d.\n",
1089                                 dev->device_fh, dev_ll->vdev->dev->device_fh,
1090                                 vlan_tag);
1091
1092                                 break;
1093                         }
1094                         dev_ll = dev_ll->next;
1095                 }
1096         }
1097
1098         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1099
1100         /*Add packet to the port tx queue*/
1101         tx_q = &lcore_tx_queue[lcore_id];
1102         len = tx_q->len;
1103
1104         m->ol_flags = PKT_TX_VLAN_PKT;
1105         /*FIXME: offset*/
1106         m->data_len += offset;
1107         m->vlan_tci = vlan_tag;
1108
1109         tx_q->m_table[len] = m;
1110         len++;
1111         if (enable_stats) {
1112                 dev_statistics[dev->device_fh].tx_total++;
1113                 dev_statistics[dev->device_fh].tx++;
1114         }
1115
1116         if (unlikely(len == MAX_PKT_BURST)) {
1117                 m_table = (struct rte_mbuf **)tx_q->m_table;
1118                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1119                 /* Free any buffers not handled by TX and update the port stats. */
1120                 if (unlikely(ret < len)) {
1121                         do {
1122                                 rte_pktmbuf_free(m_table[ret]);
1123                         } while (++ret < len);
1124                 }
1125
1126                 len = 0;
1127         }
1128
1129         tx_q->len = len;
1130         return;
1131 }
1132 /*
1133  * This function is called by each data core. It handles all RX/TX registered with the
1134  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1135  * with all devices in the main linked list.
1136  */
1137 static int
1138 switch_worker(void *arg)
1139 {
1140         struct rte_mempool *mbuf_pool = arg;
1141         struct virtio_net *dev = NULL;
1142         struct vhost_dev *vdev = NULL;
1143         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1144         struct virtio_net_data_ll *dev_ll;
1145         struct mbuf_table *tx_q;
1146         volatile struct lcore_ll_info *lcore_ll;
1147         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
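        /*
         * drain_tsc converts BURST_TX_DRAIN_US into TSC cycles using a
         * ceiling division of the TSC rate, e.g. roughly 220,000 cycles
         * on a 2.2 GHz core.
         */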
1148         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1149         unsigned ret, i;
1150         const uint16_t lcore_id = rte_lcore_id();
1151         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1152         uint16_t rx_count = 0;
1153         uint16_t tx_count;
1154         uint32_t retry = 0;
1155
1156         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1157         lcore_ll = lcore_info[lcore_id].lcore_ll;
1158         prev_tsc = 0;
1159
1160         tx_q = &lcore_tx_queue[lcore_id];
1161         for (i = 0; i < num_cores; i ++) {
1162                 if (lcore_ids[i] == lcore_id) {
1163                         tx_q->txq_id = i;
1164                         break;
1165                 }
1166         }
1167
1168         while(1) {
1169                 cur_tsc = rte_rdtsc();
1170                 /*
1171                  * TX burst queue drain
1172                  */
1173                 diff_tsc = cur_tsc - prev_tsc;
1174                 if (unlikely(diff_tsc > drain_tsc)) {
1175
1176                         if (tx_q->len) {
1177                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);
1178
1179                                 /*Tx any packets in the queue*/
1180                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1181                                                                            (struct rte_mbuf **)tx_q->m_table,
1182                                                                            (uint16_t)tx_q->len);
1183                                 if (unlikely(ret < tx_q->len)) {
1184                                         do {
1185                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1186                                         } while (++ret < tx_q->len);
1187                                 }
1188
1189                                 tx_q->len = 0;
1190                         }
1191
1192                         prev_tsc = cur_tsc;
1193
1194                 }
1195
1196                 rte_prefetch0(lcore_ll->ll_root_used);
1197                 /*
1198                  * Inform the configuration core that we have exited the linked list and that no devices are
1199                  * in use if requested.
1200                  */
1201                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1202                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1203
1204                 /*
1205                  * Process devices
1206                  */
1207                 dev_ll = lcore_ll->ll_root_used;
1208
1209                 while (dev_ll != NULL) {
1210                         /*get virtio device ID*/
1211                         vdev = dev_ll->vdev;
1212                         dev = vdev->dev;
1213
1214                         if (unlikely(vdev->remove)) {
1215                                 dev_ll = dev_ll->next;
1216                                 unlink_vmdq(vdev);
1217                                 vdev->ready = DEVICE_SAFE_REMOVE;
1218                                 continue;
1219                         }
1220                         if (likely(vdev->ready == DEVICE_RX)) {
1221                                 /*Handle guest RX*/
1222                                 rx_count = rte_eth_rx_burst(ports[0],
1223                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1224
1225                                 if (rx_count) {
1226                                         /*
1227                                          * If retry is enabled and the queue is full, wait and retry to avoid packet loss.
1228                                          * Here MAX_PKT_BURST must be less than the virtio queue size.
1229                                          */
1230                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1231                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1232                                                         rte_delay_us(burst_rx_delay_time);
1233                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1234                                                                 break;
1235                                                 }
1236                                         }
1237                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1238                                         if (enable_stats) {
1239                                                 rte_atomic64_add(
1240                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1241                                                 rx_count);
1242                                                 rte_atomic64_add(
1243                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1244                                         }
1245                                         while (likely(rx_count)) {
1246                                                 rx_count--;
1247                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1248                                         }
1249
1250                                 }
1251                         }
1252
1253                         if (likely(!vdev->remove)) {
1254                                 /* Handle guest TX*/
1255                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1256                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1257                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1258                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1259                                                 while (tx_count--)
1260                                                         rte_pktmbuf_free(pkts_burst[tx_count]);
1261                                         }
1262                                 }
1263                                 while (tx_count)
1264                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1265                         }
1266
1267                         /* Move to the next device in the list. */
1268                         dev_ll = dev_ll->next;
1269                 }
1270         }
1271
1272         return 0;
1273 }
1274
1275 /*
1276  * This function gets the number of available ring entries for zero copy RX.
1277  * Only one thread will call this function for a particular virtio device,
1278  * so it is designed as a non-thread-safe function.
1279  */
1280 static inline uint32_t __attribute__((always_inline))
1281 get_available_ring_num_zcp(struct virtio_net *dev)
1282 {
1283         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1284         uint16_t avail_idx;
1285
1286         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1287         return (uint32_t)(uint16_t)(avail_idx - vq->last_used_idx_res);
1288 }
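
/*
 * The 16-bit subtraction above is performed modulo 2^16, so the result
 * stays correct when avail->idx wraps around: e.g. with avail_idx == 2
 * after a wrap and last_used_idx_res == 65534, (uint16_t)(2 - 65534)
 * == 4, the real number of entries the guest has made available. The
 * explicit truncation keeps the widening cast from turning the
 * negative intermediate int into a huge uint32_t.
 */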
1289
1290 /*
1291  * This function gets the available ring index for zero copy RX; it will
1292  * retry 'burst_rx_retry_num' times until it gets enough ring entries.
1293  * Only one thread will call this function for a particular virtio device,
1294  * so it is designed as a non-thread-safe function.
1295  */
1296 static inline uint32_t __attribute__((always_inline))
1297 get_available_ring_index_zcp(struct virtio_net *dev,
1298         uint16_t *res_base_idx, uint32_t count)
1299 {
1300         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1301         uint16_t avail_idx;
1302         uint32_t retry = 0;
1303         uint16_t free_entries;
1304
1305         *res_base_idx = vq->last_used_idx_res;
1306         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1307         free_entries = (avail_idx - *res_base_idx);
1308
1309         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1310                         "avail idx: %d, "
1311                         "res base idx:%d, free entries:%d\n",
1312                         dev->device_fh, avail_idx, *res_base_idx,
1313                         free_entries);
1314
1315         /*
1316          * If retry is enabled and the queue is full then we wait
1317          * and retry to avoid packet loss.
1318          */
1319         if (enable_retry && unlikely(count > free_entries)) {
1320                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1321                         rte_delay_us(burst_rx_delay_time);
1322                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1323                         free_entries = (avail_idx - *res_base_idx);
1324                         if (count <= free_entries)
1325                                 break;
1326                 }
1327         }
1328
1329         /* Check that we have enough buffers. */
1330         if (unlikely(count > free_entries))
1331                 count = free_entries;
1332
1333         if (unlikely(count == 0)) {
1334                 LOG_DEBUG(VHOST_DATA,
1335                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1336                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1337                         dev->device_fh, avail_idx,
1338                         *res_base_idx, free_entries);
1339                 return 0;
1340         }
1341
1342         vq->last_used_idx_res = *res_base_idx + count;
1343
1344         return count;
1345 }
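
/*
 * A sketch of the reservation protocol above: suppose
 * last_used_idx_res == 100 and the guest's avail->idx == 103. A call
 * with count == 2 returns res_base_idx == 100 and advances
 * last_used_idx_res to 102, so the caller owns avail ring slots 100
 * and 101 (masked by vq->size - 1) until it publishes them to the
 * used ring. Since only one thread services a given device, the
 * reservation needs no atomic operation.
 */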
1346
1347 /*
1348  * This function puts a descriptor back on the used list.
1349  */
1350 static inline void __attribute__((always_inline))
1351 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1352 {
1353         uint16_t res_cur_idx = vq->last_used_idx;
1354         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1355         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1356         rte_compiler_barrier();
1357         *(volatile uint16_t *)&vq->used->idx += 1;
1358         vq->last_used_idx += 1;
1359
1360         /* Kick the guest if necessary. */
1361         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1362                 eventfd_write((int)vq->kickfd, 1);
1363 }
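
/*
 * The (vq->size - 1) mask above relies on the virtio requirement that
 * ring sizes are powers of two: e.g. with vq->size == 256, a
 * last_used_idx of 300 maps to used->ring[44]. The compiler barrier
 * makes sure the ring entry is written before the used index is
 * published, as the guest may poll used->idx at any time.
 */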
1364
1365 /*
1366  * This function gets an available descriptor from the virtio vring and an
1367  * unattached mbuf from vpool->ring, and then attaches them together. It must
1368  * adjust the offsets of buff_addr and phys_addr according to the PMD
1369  * implementation, otherwise the frame data may be placed at the wrong location in the mbuf.
1370  */
1371 static inline void __attribute__((always_inline))
1372 attach_rxmbuf_zcp(struct virtio_net *dev)
1373 {
1374         uint16_t res_base_idx, desc_idx;
1375         uint64_t buff_addr, phys_addr;
1376         struct vhost_virtqueue *vq;
1377         struct vring_desc *desc;
1378         struct rte_mbuf *mbuf = NULL;
1379         struct vpool *vpool;
1380         hpa_type addr_type;
1381         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1382
1383         vpool = &vpool_array[vdev->vmdq_rx_q];
1384         vq = dev->virtqueue[VIRTIO_RXQ];
1385
1386         do {
1387                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1388                                 1) != 1))
1389                         return;
1390                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1391
1392                 desc = &vq->desc[desc_idx];
1393                 if (desc->flags & VRING_DESC_F_NEXT) {
1394                         desc = &vq->desc[desc->next];
1395                         buff_addr = gpa_to_vva(dev, desc->addr);
1396                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1397                                         &addr_type);
1398                 } else {
1399                         buff_addr = gpa_to_vva(dev,
1400                                         desc->addr + vq->vhost_hlen);
1401                         phys_addr = gpa_to_hpa(vdev,
1402                                         desc->addr + vq->vhost_hlen,
1403                                         desc->len, &addr_type);
1404                 }
1405
1406                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1407                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1408                                 " address found when attaching RX frame buffer"
1409                                 " address!\n", dev->device_fh);
1410                         put_desc_to_used_list_zcp(vq, desc_idx);
1411                         continue;
1412                 }
1413
1414                 /*
1415                  * Check whether the frame buffer address from the guest
1416                  * crosses a sub-region boundary.
1417                  */
1418                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1419                         RTE_LOG(ERR, VHOST_DATA,
1420                                 "(%"PRIu64") Frame buffer address crossing a "
1421                                 "sub-region found when attaching RX frame "
1422                                 "buffer address!\n",
1423                                 dev->device_fh);
1424                         put_desc_to_used_list_zcp(vq, desc_idx);
1425                         continue;
1426                 }
1427         } while (unlikely(phys_addr == 0));
1428
1429         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1430         if (unlikely(mbuf == NULL)) {
1431                 LOG_DEBUG(VHOST_DATA,
1432                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1433                         "ring_sc_dequeue fail.\n",
1434                         dev->device_fh);
1435                 put_desc_to_used_list_zcp(vq, desc_idx);
1436                 return;
1437         }
1438
1439         if (unlikely(vpool->buf_size > desc->len)) {
1440                 LOG_DEBUG(VHOST_DATA,
1441                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1442                         "length(%d) of descriptor idx: %d less than room "
1443                         "size required: %d\n",
1444                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1445                 put_desc_to_used_list_zcp(vq, desc_idx);
1446                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1447                 return;
1448         }
1449
1450         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1451         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1452         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1453         mbuf->data_len = desc->len;
1454         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1455
1456         LOG_DEBUG(VHOST_DATA,
1457                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1458                 "descriptor idx:%d\n",
1459                 dev->device_fh, res_base_idx, desc_idx);
1460
1461         __rte_mbuf_raw_free(mbuf);
1462
1463         return;
1464 }
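
/*
 * After a successful attach, the mbuf is only a descriptor for guest
 * memory: buf_addr/buf_physaddr point RTE_PKTMBUF_HEADROOM bytes
 * before the guest frame buffer, so the data area resolves to the
 * guest buffer itself and the NIC can DMA straight into guest memory.
 * The descriptor index is stashed in the mbuf headroom via
 * MBUF_HEADROOM_UINT32() so it can be returned to the used ring later.
 */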
1465
1466 /*
1467  * Detach an attached packet mbuf -
1468  *  - restore original mbuf address and length values.
1469  *  - reset pktmbuf data and data_len to their default values.
1470  *  All other fields of the given packet mbuf will be left intact.
1471  *
1472  * @param m
1473  *   The attached packet mbuf.
1474  */
1475 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1476 {
1477         const struct rte_mempool *mp = m->pool;
1478         void *buf = RTE_MBUF_TO_BADDR(m);
1479         uint32_t buf_ofs;
1480         uint32_t buf_len = mp->elt_size - sizeof(*m);
1481         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1482
1483         m->buf_addr = buf;
1484         m->buf_len = (uint16_t)buf_len;
1485
1486         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1487                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1488         m->data_off = buf_ofs;
1489
1490         m->data_len = 0;
1491 }
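
/*
 * pktmbuf_detach_zcp() is the inverse of attach_rxmbuf_zcp(): it
 * re-derives the mbuf's own buffer address from its mempool element
 * (RTE_MBUF_TO_BADDR) and resets buf_physaddr, data_off and data_len,
 * so an mbuf that pointed into guest memory once again points at its
 * private buffer and can safely be recycled through vpool->ring.
 */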
1492
1493 /*
1494  * This function is called after packets have been transmitted. It fetches
1495  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1496  * also updates the used index and kicks the guest if necessary.
1497  */
1498 static inline uint32_t __attribute__((always_inline))
1499 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1500 {
1501         struct rte_mbuf *mbuf;
1502         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1503         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1504         uint32_t index = 0;
1505         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1506
1507         LOG_DEBUG(VHOST_DATA,
1508                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1509                 "clean is: %d\n",
1510                 dev->device_fh, mbuf_count);
1511         LOG_DEBUG(VHOST_DATA,
1512                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1513                 "clean is: %d\n",
1514                 dev->device_fh, rte_ring_count(vpool->ring));
1515
1516         for (index = 0; index < mbuf_count; index++) {
1517                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1518                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1519                         pktmbuf_detach_zcp(mbuf);
1520                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1521
1522                 /* Update used index buffer information. */
1523                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1524                 vq->used->ring[used_idx].len = 0;
1525
1526                 used_idx = (used_idx + 1) & (vq->size - 1);
1527         }
1528
1529         LOG_DEBUG(VHOST_DATA,
1530                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1531                 "clean is: %d\n",
1532                 dev->device_fh, rte_mempool_count(vpool->pool));
1533         LOG_DEBUG(VHOST_DATA,
1534                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1535                 "clean is: %d\n",
1536                 dev->device_fh, rte_ring_count(vpool->ring));
1537         LOG_DEBUG(VHOST_DATA,
1538                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1539                 "vq->last_used_idx:%d\n",
1540                 dev->device_fh, vq->last_used_idx);
1541
1542         vq->last_used_idx += mbuf_count;
1543
1544         LOG_DEBUG(VHOST_DATA,
1545                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1546                 "vq->last_used_idx:%d\n",
1547                 dev->device_fh, vq->last_used_idx);
1548
1549         rte_compiler_barrier();
1550
1551         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1552
1553         /* Kick guest if required. */
1554         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1555                 eventfd_write((int)vq->kickfd, 1);
1556
1557         return 0;
1558 }
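
/*
 * Every mbuf that has completed transmission carries its descriptor
 * index in its headroom, so draining the mempool above returns exactly
 * one used-ring entry per transmitted frame. As in
 * put_desc_to_used_list_zcp(), the ring entries are written first and
 * used->idx is only published after the compiler barrier.
 */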
1559
1560 /*
1561  * This function is called when a virtio device is destroyed. It fetches
1562  * mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1563  */
1564 static void mbuf_destroy_zcp(struct vpool *vpool)
1565 {
1566         struct rte_mbuf *mbuf = NULL;
1567         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1568
1569         LOG_DEBUG(VHOST_CONFIG,
1570                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1571                 "mbuf_destroy_zcp is: %d\n",
1572                 mbuf_count);
1573         LOG_DEBUG(VHOST_CONFIG,
1574                 "in mbuf_destroy_zcp: mbuf count in ring before "
1575                 "mbuf_destroy_zcp is: %d\n",
1576                 rte_ring_count(vpool->ring));
1577
1578         for (index = 0; index < mbuf_count; index++) {
1579                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1580                 if (likely(mbuf != NULL)) {
1581                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1582                                 pktmbuf_detach_zcp(mbuf);
1583                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1584                 }
1585         }
1586
1587         LOG_DEBUG(VHOST_CONFIG,
1588                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1589                 "mbuf_destroy_zcp is: %d\n",
1590                 rte_mempool_count(vpool->pool));
1591         LOG_DEBUG(VHOST_CONFIG,
1592                 "in mbuf_destroy_zcp: mbuf count in ring after "
1593                 "mbuf_destroy_zcp is: %d\n",
1594                 rte_ring_count(vpool->ring));
1595 }
1596
1597 /*
1598  * This function fills the used ring for zero copy RX: it writes the virtio header for each packet, updates the used entries and kicks the guest if necessary.
1599  */
1600 static inline uint32_t __attribute__((always_inline))
1601 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1602         uint32_t count)
1603 {
1604         struct vhost_virtqueue *vq;
1605         struct vring_desc *desc;
1606         struct rte_mbuf *buff;
1607         /* The virtio_hdr is initialised to 0. */
1608         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1609                 = {{0, 0, 0, 0, 0, 0}, 0};
1610         uint64_t buff_hdr_addr = 0;
1611         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1612         uint32_t head_idx, packet_success = 0;
1613         uint16_t res_cur_idx;
1614
1615         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1616
1617         if (count == 0)
1618                 return 0;
1619
1620         vq = dev->virtqueue[VIRTIO_RXQ];
1621         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1622
1623         res_cur_idx = vq->last_used_idx;
1624         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1625                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1626
1627         /* Retrieve all of the head indexes first to avoid caching issues. */
1628         for (head_idx = 0; head_idx < count; head_idx++)
1629                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1630
1631         /* Prefetch descriptor index. */
1632         rte_prefetch0(&vq->desc[head[packet_success]]);
1633
1634         while (packet_success != count) {
1635                 /* Get descriptor from available ring */
1636                 desc = &vq->desc[head[packet_success]];
1637
1638                 buff = pkts[packet_success];
1639                 LOG_DEBUG(VHOST_DATA,
1640                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1641                         "pkt[%d] descriptor idx: %d\n",
1642                         dev->device_fh, packet_success,
1643                         MBUF_HEADROOM_UINT32(buff));
1644
1645                 PRINT_PACKET(dev,
1646                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1647                         + RTE_PKTMBUF_HEADROOM),
1648                         rte_pktmbuf_data_len(buff), 0);
1649
1650                 /* Buffer address translation for virtio header. */
1651                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1652                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1653
1654                 /*
1655                  * If the descriptors are chained the header and data are
1656                  * placed in separate buffers.
1657                  */
1658                 if (desc->flags & VRING_DESC_F_NEXT) {
1659                         desc->len = vq->vhost_hlen;
1660                         desc = &vq->desc[desc->next];
1661                         desc->len = rte_pktmbuf_data_len(buff);
1662                 } else {
1663                         desc->len = packet_len;
1664                 }
1665
1666                 /* Update used ring with desc information */
1667                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1668                         = head[packet_success];
1669                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1670                         = packet_len;
1671                 res_cur_idx++;
1672                 packet_success++;
1673
1674                 /* A header is required per buffer. */
1675                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1676                         (const void *)&virtio_hdr, vq->vhost_hlen);
1677
1678                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1679
1680                 if (likely(packet_success < count)) {
1681                         /* Prefetch descriptor index. */
1682                         rte_prefetch0(&vq->desc[head[packet_success]]);
1683                 }
1684         }
1685
1686         rte_compiler_barrier();
1687
1688         LOG_DEBUG(VHOST_DATA,
1689                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1690                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1691                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1692
1693         *(volatile uint16_t *)&vq->used->idx += count;
1694         vq->last_used_idx += count;
1695
1696         LOG_DEBUG(VHOST_DATA,
1697                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1698                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1699                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1700
1701         /* Kick the guest if necessary. */
1702         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1703                 eventfd_write((int)vq->kickfd, 1);
1704
1705         return count;
1706 }
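
/*
 * Buffer layout written by the loop above, assuming a 10-byte
 * virtio_net_hdr as vq->vhost_hlen: for a chained descriptor pair the
 * header occupies the first buffer (desc->len = 10) and the frame the
 * second, while for a single descriptor the header and frame share one
 * buffer. Either way the guest sees one used-ring entry of length
 * rte_pktmbuf_data_len(buff) + vq->vhost_hlen per packet.
 */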
1707
1708 /*
1709  * This function routes the TX packet to the correct interface.
1710  * This may be a local device or the physical port.
1711  */
1712 static inline void __attribute__((always_inline))
1713 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1714         uint32_t desc_idx, uint8_t need_copy)
1715 {
1716         struct mbuf_table *tx_q;
1717         struct rte_mbuf **m_table;
1718         struct rte_mbuf *mbuf = NULL;
1719         unsigned len, ret, offset = 0;
1720         struct vpool *vpool;
1721         struct virtio_net_data_ll *dev_ll = ll_root_used;
1722         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1723         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1724         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1725
1726         /*Add packet to the port tx queue*/
1727         tx_q = &tx_queue_zcp[vmdq_rx_q];
1728         len = tx_q->len;
1729
1730         /* Allocate an mbuf and populate the structure. */
1731         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1732         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1733         if (unlikely(mbuf == NULL)) {
1734                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1735                 RTE_LOG(ERR, VHOST_DATA,
1736                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1737                         dev->device_fh);
1738                 put_desc_to_used_list_zcp(vq, desc_idx);
1739                 return;
1740         }
1741
1742         if (vm2vm_mode == VM2VM_HARDWARE) {
1743                 /* Avoid using a VLAN tag from any VM for an external packet,
1744                  * e.g. vlan_tags[dev->device_fh]; otherwise it conflicts with
1745                  * pool selection: the MAC address marks it as an external
1746                  * packet that should go out to the network, while the VLAN tag
1747                  * marks it as a VM2VM packet to be forwarded to another VM.
1748                  * The hardware cannot resolve this ambiguity, so the packet would be lost.
1749                  */
1750                 vlan_tag = external_pkt_default_vlan_tag;
1751                 while (dev_ll != NULL) {
1752                         if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1753                                 ether_addr_cmp(&(pkt_hdr->d_addr),
1754                                 &dev_ll->vdev->mac_address)) {
1755
1756                                 /*
1757                                  * Drop the packet if the TX packet is destined
1758                                  * for the TX device.
1759                                  */
1760                                 if (unlikely(dev_ll->vdev->dev->device_fh
1761                                         == dev->device_fh)) {
1762                                         LOG_DEBUG(VHOST_DATA,
1763                                         "(%"PRIu64") TX: Source and destination "
1764                                         "MAC addresses are the same. Dropping "
1765                                         "packet.\n",
1766                                         dev_ll->vdev->dev->device_fh);
1767                                         MBUF_HEADROOM_UINT32(mbuf)
1768                                                 = (uint32_t)desc_idx;
1769                                         __rte_mbuf_raw_free(mbuf);
1770                                         return;
1771                                 }
1772
1773                                 /*
1774                                  * Offset the packet length by 4 bytes to compensate
1775                                  * for HW VLAN stripping when the packet is L2-switched back.
1776                                  */
1777                                 offset = 4;
1778                                 vlan_tag =
1779                                 (uint16_t)
1780                                 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1781
1782                                 LOG_DEBUG(VHOST_DATA,
1783                                 "(%"PRIu64") TX: pkt to local VM device id:"
1784                                 "(%"PRIu64") vlan tag: %d.\n",
1785                                 dev->device_fh, dev_ll->vdev->dev->device_fh,
1786                                 vlan_tag);
1787
1788                                 break;
1789                         }
1790                         dev_ll = dev_ll->next;
1791                 }
1792         }
1793
1794         mbuf->nb_segs = m->nb_segs;
1795         mbuf->next = m->next;
1796         mbuf->data_len = m->data_len + offset;
1797         mbuf->pkt_len = mbuf->data_len;
1798         if (unlikely(need_copy)) {
1799                 /* Copy the packet contents to the mbuf. */
1800                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1801                         rte_pktmbuf_mtod(m, void *),
1802                         m->data_len);
1803         } else {
1804                 mbuf->data_off = m->data_off;
1805                 mbuf->buf_physaddr = m->buf_physaddr;
1806                 mbuf->buf_addr = m->buf_addr;
1807         }
1808         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1809         mbuf->vlan_tci = vlan_tag;
1810         mbuf->l2_len = sizeof(struct ether_hdr);
1811         mbuf->l3_len = sizeof(struct ipv4_hdr);
1812         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1813
1814         tx_q->m_table[len] = mbuf;
1815         len++;
1816
1817         LOG_DEBUG(VHOST_DATA,
1818                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1819                 dev->device_fh,
1820                 mbuf->nb_segs,
1821                 (mbuf->next == NULL) ? "null" : "non-null");
1822
1823         if (enable_stats) {
1824                 dev_statistics[dev->device_fh].tx_total++;
1825                 dev_statistics[dev->device_fh].tx++;
1826         }
1827
1828         if (unlikely(len == MAX_PKT_BURST)) {
1829                 m_table = (struct rte_mbuf **)tx_q->m_table;
1830                 ret = rte_eth_tx_burst(ports[0],
1831                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1832
1833                 /*
1834                  * Free any buffers not handled by TX and update
1835                  * the port stats.
1836                  */
1837                 if (unlikely(ret < len)) {
1838                         do {
1839                                 rte_pktmbuf_free(m_table[ret]);
1840                         } while (++ret < len);
1841                 }
1842
1843                 len = 0;
1844                 txmbuf_clean_zcp(dev, vpool);
1845         }
1846
1847         tx_q->len = len;
1848
1849         return;
1850 }
1851
1852 /*
1853  * This function transmits all available packets in the virtio TX queue for
1854  * one virtio-net device. On the first packet it learns the MAC address and
1855  * sets up VMDQ.
1856  */
1857 static inline void __attribute__((always_inline))
1858 virtio_dev_tx_zcp(struct virtio_net *dev)
1859 {
1860         struct rte_mbuf m;
1861         struct vhost_virtqueue *vq;
1862         struct vring_desc *desc;
1863         uint64_t buff_addr = 0, phys_addr;
1864         uint32_t head[MAX_PKT_BURST];
1865         uint32_t i;
1866         uint16_t free_entries, packet_success = 0;
1867         uint16_t avail_idx;
1868         uint8_t need_copy = 0;
1869         hpa_type addr_type;
1870         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1871
1872         vq = dev->virtqueue[VIRTIO_TXQ];
1873         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1874
1875         /* If there are no available buffers then return. */
1876         if (vq->last_used_idx_res == avail_idx)
1877                 return;
1878
1879         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1880
1881         /* Prefetch available ring to retrieve head indexes. */
1882         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1883
1884         /* Get the number of free entries in the ring */
1885         free_entries = (avail_idx - vq->last_used_idx_res);
1886
1887         /* Limit to MAX_PKT_BURST. */
1888         free_entries
1889                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1890
1891         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1892                 dev->device_fh, free_entries);
1893
1894         /* Retrieve all of the head indexes first to avoid caching issues. */
1895         for (i = 0; i < free_entries; i++)
1896                 head[i]
1897                         = vq->avail->ring[(vq->last_used_idx_res + i)
1898                         & (vq->size - 1)];
1899
1900         vq->last_used_idx_res += free_entries;
1901
1902         /* Prefetch descriptor index. */
1903         rte_prefetch0(&vq->desc[head[packet_success]]);
1904         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1905
1906         while (packet_success < free_entries) {
1907                 desc = &vq->desc[head[packet_success]];
1908
1909                 /* Discard first buffer as it is the virtio header */
1910                 desc = &vq->desc[desc->next];
1911
1912                 /* Buffer address translation. */
1913                 buff_addr = gpa_to_vva(dev, desc->addr);
1914                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1915
1916                 if (likely(packet_success < (free_entries - 1)))
1917                         /* Prefetch descriptor index. */
1918                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1919
1920                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1921                         RTE_LOG(ERR, VHOST_DATA,
1922                                 "(%"PRIu64") Invalid frame buffer address found "
1923                                 "when transmitting packets!\n",
1924                                 dev->device_fh);
1925                         packet_success++;
1926                         continue;
1927                 }
1928
1929                 /* Prefetch buffer address. */
1930                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1931
1932                 /*
1933                  * Setup dummy mbuf. This is copied to a real mbuf if
1934                  * transmitted out the physical port.
1935                  */
1936                 m.data_len = desc->len;
1937                 m.nb_segs = 1;
1938                 m.next = NULL;
1939                 m.data_off = 0;
1940                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1941                 m.buf_physaddr = phys_addr;
1942
1943                 /*
1944                  * Check whether the frame buffer address from the guest
1945                  * crosses a sub-region boundary.
1946                  */
1947                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1948                         RTE_LOG(ERR, VHOST_DATA,
1949                                 "(%"PRIu64") Frame buffer address crossing a "
1950                                 "sub-region found when attaching TX frame "
1951                                 "buffer address!\n",
1952                                 dev->device_fh);
1953                         need_copy = 1;
1954                 } else
1955                         need_copy = 0;
1956
1957                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1958
1959                 /*
1960                  * If this is the first received packet we need to learn
1961                  * the MAC and setup VMDQ
1962                  */
1963                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1964                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1965                                 /*
1966                                  * Discard frame if device is scheduled for
1967                                  * removal or a duplicate MAC address is found.
1968                                  */
1969                                 packet_success += free_entries;
1970                                 vq->last_used_idx += packet_success;
1971                                 break;
1972                         }
1973                 }
1974
1975                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1976                 packet_success++;
1977         }
1978 }
1979
1980 /*
1981  * This function is called by each data core. It handles all RX/TX registered
1982  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1983  * addresses are compared with all devices in the main linked list.
1984  */
1985 static int
1986 switch_worker_zcp(__attribute__((unused)) void *arg)
1987 {
1988         struct virtio_net *dev = NULL;
1989         struct vhost_dev  *vdev = NULL;
1990         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1991         struct virtio_net_data_ll *dev_ll;
1992         struct mbuf_table *tx_q;
1993         volatile struct lcore_ll_info *lcore_ll;
1994         const uint64_t drain_tsc
1995                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1996                 * BURST_TX_DRAIN_US;
1997         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1998         unsigned ret;
1999         const uint16_t lcore_id = rte_lcore_id();
2000         uint16_t count_in_ring, rx_count = 0;
2001
2002         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2003
2004         lcore_ll = lcore_info[lcore_id].lcore_ll;
2005         prev_tsc = 0;
2006
2007         while (1) {
2008                 cur_tsc = rte_rdtsc();
2009
2010                 /* TX burst queue drain */
2011                 diff_tsc = cur_tsc - prev_tsc;
2012                 if (unlikely(diff_tsc > drain_tsc)) {
2013                         /*
2014                          * Get mbuf from vpool.pool and detach mbuf and
2015                          * put back into vpool.ring.
2016                          */
2017                         dev_ll = lcore_ll->ll_root_used;
2018                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2019                                 /* Get virtio device ID */
2020                                 vdev = dev_ll->vdev;
2021                                 dev = vdev->dev;
2022
2023                                 if (likely(!vdev->remove)) {
2024                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2025                                         if (tx_q->len) {
2026                                                 LOG_DEBUG(VHOST_DATA,
2027                                                 "TX queue drained after timeout"
2028                                                 " with burst size %u\n",
2029                                                 tx_q->len);
2030
2031                                                 /*
2032                                                  * Tx any packets in the queue
2033                                                  */
2034                                                 ret = rte_eth_tx_burst(
2035                                                         ports[0],
2036                                                         (uint16_t)tx_q->txq_id,
2037                                                         (struct rte_mbuf **)
2038                                                         tx_q->m_table,
2039                                                         (uint16_t)tx_q->len);
2040                                                 if (unlikely(ret < tx_q->len)) {
2041                                                         do {
2042                                                                 rte_pktmbuf_free(
2043                                                                         tx_q->m_table[ret]);
2044                                                         } while (++ret < tx_q->len);
2045                                                 }
2046                                                 tx_q->len = 0;
2047
2048                                                 txmbuf_clean_zcp(dev,
2049                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2050                                         }
2051                                 }
2052                                 dev_ll = dev_ll->next;
2053                         }
2054                         prev_tsc = cur_tsc;
2055                 }
2056
2057                 rte_prefetch0(lcore_ll->ll_root_used);
2058
2059                 /*
2060                  * Inform the configuration core that we have exited the linked
2061                  * list and that no devices are in use if requested.
2062                  */
2063                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2064                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2065
2066                 /* Process devices */
2067                 dev_ll = lcore_ll->ll_root_used;
2068
2069                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2070                         vdev = dev_ll->vdev;
2071                         dev  = vdev->dev;
2072                         if (unlikely(vdev->remove)) {
2073                                 dev_ll = dev_ll->next;
2074                                 unlink_vmdq(vdev);
2075                                 vdev->ready = DEVICE_SAFE_REMOVE;
2076                                 continue;
2077                         }
2078
2079                         if (likely(vdev->ready == DEVICE_RX)) {
2080                                 uint32_t index = vdev->vmdq_rx_q;
2081                                 uint16_t i;
2082                                 count_in_ring
2083                                 = rte_ring_count(vpool_array[index].ring);
2084                                 uint16_t free_entries
2085                                 = (uint16_t)get_available_ring_num_zcp(dev);
2086
2087                                 /*
2088                                  * Attach all mbufs in vpool.ring and put back
2089                                  * into vpool.pool.
2090                                  */
2091                                 for (i = 0;
2092                                 i < RTE_MIN(free_entries,
2093                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2094                                 i++)
2095                                         attach_rxmbuf_zcp(dev);
2096
2097                                 /* Handle guest RX */
2098                                 rx_count = rte_eth_rx_burst(ports[0],
2099                                         vdev->vmdq_rx_q, pkts_burst,
2100                                         MAX_PKT_BURST);
2101
2102                                 if (rx_count) {
2103                                         ret_count = virtio_dev_rx_zcp(dev,
2104                                                         pkts_burst, rx_count);
2105                                         if (enable_stats) {
2106                                                 dev_statistics[dev->device_fh].rx_total
2107                                                         += rx_count;
2108                                                 dev_statistics[dev->device_fh].rx
2109                                                         += ret_count;
2110                                         }
2111                                         while (likely(rx_count)) {
2112                                                 rx_count--;
2113                                                 pktmbuf_detach_zcp(
2114                                                         pkts_burst[rx_count]);
2115                                                 rte_ring_sp_enqueue(
2116                                                         vpool_array[index].ring,
2117                                                         (void *)pkts_burst[rx_count]);
2118                                         }
2119                                 }
2120                         }
2121
2122                         if (likely(!vdev->remove))
2123                                 /* Handle guest TX */
2124                                 virtio_dev_tx_zcp(dev);
2125
2126                         /* Move to the next device in the list */
2127                         dev_ll = dev_ll->next;
2128                 }
2129         }
2130
2131         return 0;
2132 }
2133
2134
2135 /*
2136  * Add an entry to a used linked list. A free entry must first be found
2137  * in the free linked list using get_data_ll_free_entry();
2138  */
2139 static void
2140 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2141         struct virtio_net_data_ll *ll_dev)
2142 {
2143         struct virtio_net_data_ll *ll = *ll_root_addr;
2144
2145         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2146         ll_dev->next = NULL;
2147         rte_compiler_barrier();
2148
2149         /* If ll == NULL then this is the first device. */
2150         if (ll) {
2151                 /* Increment to the tail of the linked list. */
2152                 while (ll->next != NULL)
2153                         ll = ll->next;
2154
2155                 ll->next = ll_dev;
2156         } else {
2157                 *ll_root_addr = ll_dev;
2158         }
2159 }
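
/*
 * Typical usage (see new_device()): take an entry off the free list,
 * fill in ll_dev->vdev, then publish it with this function:
 *
 *     ll_dev = get_data_ll_free_entry(&lcore_ll->ll_root_free);
 *     ll_dev->vdev = vdev;
 *     add_data_ll_entry(&lcore_ll->ll_root_used, ll_dev);
 *
 * The barrier above makes the NULL next pointer visible before the
 * entry is linked in, so a data core walking the list concurrently
 * never follows a stale next pointer.
 */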
2160
2161 /*
2162  * Remove an entry from a used linked list. The entry must then be added to
2163  * the free linked list using put_data_ll_free_entry().
2164  */
2165 static void
2166 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2167         struct virtio_net_data_ll *ll_dev,
2168         struct virtio_net_data_ll *ll_dev_last)
2169 {
2170         struct virtio_net_data_ll *ll = *ll_root_addr;
2171
2172         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2173                 return;
2174
2175         if (ll_dev == ll)
2176                 *ll_root_addr = ll_dev->next;
2177         else
2178                 if (likely(ll_dev_last != NULL))
2179                         ll_dev_last->next = ll_dev->next;
2180                 else
2181                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2182 }
2183
2184 /*
2185  * Find and return an entry from the free linked list.
2186  */
2187 static struct virtio_net_data_ll *
2188 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2189 {
2190         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2191         struct virtio_net_data_ll *ll_dev;
2192
2193         if (ll_free == NULL)
2194                 return NULL;
2195
2196         ll_dev = ll_free;
2197         *ll_root_addr = ll_free->next;
2198
2199         return ll_dev;
2200 }
2201
2202 /*
2203  * Place an entry back on to the free linked list.
2204  */
2205 static void
2206 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2207         struct virtio_net_data_ll *ll_dev)
2208 {
2209         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2210
2211         if (ll_dev == NULL)
2212                 return;
2213
2214         ll_dev->next = ll_free;
2215         *ll_root_addr = ll_dev;
2216 }
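
/*
 * The free lists are only manipulated by the configuration core (at
 * init time and in the new_device()/destroy_device() callbacks), so a
 * plain LIFO push with no locking or barrier is sufficient here,
 * unlike add_data_ll_entry() above whose list is read concurrently by
 * the data cores.
 */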
2217
2218 /*
2219  * Creates a linked list of a given size.
2220  */
2221 static struct virtio_net_data_ll *
2222 alloc_data_ll(uint32_t size)
2223 {
2224         struct virtio_net_data_ll *ll_new;
2225         uint32_t i;
2226
2227         /* Malloc and then chain the linked list. */
2228         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2229         if (ll_new == NULL) {
2230                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2231                 return NULL;
2232         }
2233
2234         for (i = 0; i < size - 1; i++) {
2235                 ll_new[i].vdev = NULL;
2236                 ll_new[i].next = &ll_new[i+1];
2237         }
2238         ll_new[i].next = NULL;
2239
2240         return ll_new;
2241 }
2242
2243 /*
2244  * Create the main linked list along with each individual core's linked list. A used and a free list
2245  * are created to manage entries.
2246  */
2247 static int
2248 init_data_ll (void)
2249 {
2250         int lcore;
2251
2252         RTE_LCORE_FOREACH_SLAVE(lcore) {
2253                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2254                 if (lcore_info[lcore].lcore_ll == NULL) {
2255                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2256                         return -1;
2257                 }
2258
2259                 lcore_info[lcore].lcore_ll->device_num = 0;
2260                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2261                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2262                 if (num_devices % num_switching_cores)
2263                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2264                 else
2265                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2266         }
2267
2268         /* Allocate devices up to a maximum of MAX_DEVICES. */
2269         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2270
2271         return 0;
2272 }
2273
2274 /*
2275  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2276  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid reordering
2277  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
2278  */
2279 static void
2280 destroy_device (volatile struct virtio_net *dev)
2281 {
2282         struct virtio_net_data_ll *ll_lcore_dev_cur;
2283         struct virtio_net_data_ll *ll_main_dev_cur;
2284         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2285         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2286         struct vhost_dev *vdev;
2287         int lcore;
2288
2289         dev->flags &= ~VIRTIO_DEV_RUNNING;
2290
2291         vdev = (struct vhost_dev *)dev->priv;
2292         /* Set the remove flag. */
2293         vdev->remove = 1;
2294         while (vdev->ready != DEVICE_SAFE_REMOVE) {
2295                 rte_pause();
2296         }
2297
2298         /* Search for entry to be removed from lcore ll */
2299         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2300         while (ll_lcore_dev_cur != NULL) {
2301                 if (ll_lcore_dev_cur->vdev == vdev) {
2302                         break;
2303                 } else {
2304                         ll_lcore_dev_last = ll_lcore_dev_cur;
2305                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2306                 }
2307         }
2308
2309         if (ll_lcore_dev_cur == NULL) {
2310                 RTE_LOG(ERR, VHOST_CONFIG,
2311                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2312                         dev->device_fh);
2313                 return;
2314         }
2315
2316         /* Search for entry to be removed from main ll */
2317         ll_main_dev_cur = ll_root_used;
2318         ll_main_dev_last = NULL;
2319         while (ll_main_dev_cur != NULL) {
2320                 if (ll_main_dev_cur->vdev == vdev) {
2321                         break;
2322                 } else {
2323                         ll_main_dev_last = ll_main_dev_cur;
2324                         ll_main_dev_cur = ll_main_dev_cur->next;
2325                 }
2326         }
2327
2328         /* Remove entries from the lcore and main ll. */
2329         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2330         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2331
2332         /* Set the dev_removal_flag on each lcore. */
2333         RTE_LCORE_FOREACH_SLAVE(lcore) {
2334                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2335         }
2336
2337         /*
2338          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2339          * they can no longer access the device removed from the linked lists and that the devices
2340          * are no longer in use.
2341          */
2342         RTE_LCORE_FOREACH_SLAVE(lcore) {
2343                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2344                         rte_pause();
2345                 }
2346         }
2347
2348         /* Add the entries back to the lcore and main free ll.*/
2349         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2350         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2351
2352         /* Decrement number of device on the lcore. */
2353         lcore_info[vdev->coreid].lcore_ll->device_num--;
2354
2355         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2356
2357         if (zero_copy) {
2358                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2359
2360                 /* Stop the RX queue. */
2361                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2362                         LOG_DEBUG(VHOST_CONFIG,
2363                                 "(%"PRIu64") In destroy_device: Failed to stop "
2364                                 "rx queue:%d\n",
2365                                 dev->device_fh,
2366                                 vdev->vmdq_rx_q);
2367                 }
2368
2369                 LOG_DEBUG(VHOST_CONFIG,
2370                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2371                         "mempool back to ring for RX queue: %d\n",
2372                         dev->device_fh, vdev->vmdq_rx_q);
2373
2374                 mbuf_destroy_zcp(vpool);
2375
2376                 /* Stop the TX queue. */
2377                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2378                         LOG_DEBUG(VHOST_CONFIG,
2379                                 "(%"PRIu64") In destroy_device: Failed to "
2380                                 "stop tx queue:%d\n",
2381                                 dev->device_fh, vdev->vmdq_rx_q);
2382                 }
2383
2384                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2385
2386                 LOG_DEBUG(VHOST_CONFIG,
2387                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2388                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2389                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2390                         dev->device_fh);
2391
2392                 mbuf_destroy_zcp(vpool);
2393                 rte_free(vdev->regions_hpa);
2394         }
2395         rte_free(vdev);
2396
2397 }
2398
2399 /*
2400  * Calculate the number of physically contiguous sub-regions within one
2401  * particular region whose vhost virtual address range is contiguous. The
2402  * region starts at vva_start and is 'size' bytes long.
2403  */
2404 static uint32_t
2405 check_hpa_regions(uint64_t vva_start, uint64_t size)
2406 {
2407         uint32_t i, nregions = 0, page_size = getpagesize();
2408         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2409         if (vva_start % page_size) {
2410                 LOG_DEBUG(VHOST_CONFIG,
2411                 "in check_continuous: vva start(%p) mod page_size(%d) "
2412                         "has remainder\n",
2413                         (void *)(uintptr_t)vva_start, page_size);
2414                 return 0;
2415         }
2416         if (size % page_size) {
2417                 LOG_DEBUG(VHOST_CONFIG,
2418                 "in check_continuous: "
2419                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2420                         size, page_size);
2421                 return 0;
2422         }
2423         for (i = 0; i < size - page_size; i = i + page_size) {
2424                 cur_phys_addr
2425                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2426                 next_phys_addr = rte_mem_virt2phy(
2427                         (void *)(uintptr_t)(vva_start + i + page_size));
2428                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2429                         ++nregions;
2430                         LOG_DEBUG(VHOST_CONFIG,
2431                                 "in check_continuous: hva addr:(%p) is not "
2432                                 "continuous with hva addr:(%p), diff:%d\n",
2433                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2434                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2435                                 + page_size), page_size);
2436                         LOG_DEBUG(VHOST_CONFIG,
2437                                 "in check_continuous: hpa addr:(%p) is not "
2438                                 "continuous with hpa addr:(%p), "
2439                                 "diff:(%"PRIu64")\n",
2440                                 (void *)(uintptr_t)cur_phys_addr,
2441                                 (void *)(uintptr_t)next_phys_addr,
2442                                 (next_phys_addr-cur_phys_addr));
2443                 }
2444         }
2445         return nregions;
2446 }
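
/*
 * Worked example: for a 2 MB region backed by 4 KB pages whose
 * physical mapping is split exactly in half, the loop above detects
 * one discontinuity (at the 1 MB boundary) and returns 1, so the
 * region must be described by 1 + 1 sub-regions in
 * fill_hpa_memory_regions() below. With hugepage-backed guest memory
 * the usual result is 0, i.e. one physically contiguous block per
 * region.
 */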
2447
2448 /*
2449  * Divide each region whose vhost virtual address range is contiguous into
2450  * sub-regions such that the physical addresses within each sub-region are
2451  * contiguous, and fill the offset (to GPA), size and other information of
2452  * each sub-region into regions_hpa.
2453  */
2454 static uint32_t
2455 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2456 {
2457         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2458         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2459
2460         if (mem_region_hpa == NULL)
2461                 return 0;
2462
2463         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2464                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2465                         virtio_memory->regions[regionidx].address_offset;
2466                 mem_region_hpa[regionidx_hpa].guest_phys_address
2467                         = virtio_memory->regions[regionidx].guest_phys_address;
2468                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2469                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2470                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2471                 LOG_DEBUG(VHOST_CONFIG,
2472                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2473                         regionidx_hpa,
2474                         (void *)(uintptr_t)
2475                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2476                 LOG_DEBUG(VHOST_CONFIG,
2477                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2478                         regionidx_hpa,
2479                         (void *)(uintptr_t)
2480                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2481                 for (i = 0, k = 0;
2482                         i < virtio_memory->regions[regionidx].memory_size -
2483                                 page_size;
2484                         i += page_size) {
2485                         cur_phys_addr = rte_mem_virt2phy(
2486                                         (void *)(uintptr_t)(vva_start + i));
2487                         next_phys_addr = rte_mem_virt2phy(
2488                                         (void *)(uintptr_t)(vva_start +
2489                                         i + page_size));
2490                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2491                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2492                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2493                                         k + page_size;
2494                                 mem_region_hpa[regionidx_hpa].memory_size
2495                                         = k + page_size;
2496                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2497                                         "phys addr end  [%d]:(%p)\n",
2498                                         regionidx_hpa,
2499                                         (void *)(uintptr_t)
2500                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2501                                 LOG_DEBUG(VHOST_CONFIG,
2502                                         "in fill_hpa_regions: guest phys addr "
2503                                         "size [%d]:(%p)\n",
2504                                         regionidx_hpa,
2505                                         (void *)(uintptr_t)
2506                                         (mem_region_hpa[regionidx_hpa].memory_size));
2507                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2508                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2509                                 ++regionidx_hpa;
2510                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2511                                         next_phys_addr -
2512                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2513                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2514                                         " phys addr start[%d]:(%p)\n",
2515                                         regionidx_hpa,
2516                                         (void *)(uintptr_t)
2517                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2518                                 LOG_DEBUG(VHOST_CONFIG,
2519                                         "in fill_hpa_regions: host  phys addr "
2520                                         "start[%d]:(%p)\n",
2521                                         regionidx_hpa,
2522                                         (void *)(uintptr_t)
2523                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2524                                 k = 0;
2525                         } else {
2526                                 k += page_size;
2527                         }
2528                 }
2529                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2530                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2531                         + k + page_size;
2532                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2533                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2534                         "[%d]:(%p)\n", regionidx_hpa,
2535                         (void *)(uintptr_t)
2536                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2537                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2538                         "[%d]:(%p)\n", regionidx_hpa,
2539                         (void *)(uintptr_t)
2540                         (mem_region_hpa[regionidx_hpa].memory_size));
2541                 ++regionidx_hpa;
2542         }
2543         return regionidx_hpa;
2544 }
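/*
 * Illustrative sketch (not part of the original example): once the table
 * above has been filled in, translating a guest physical address into a
 * host physical address reduces to a range lookup plus an offset add.
 * Assuming a hypothetical helper name, it could look like:
 *
 *	static uint64_t
 *	gpa_to_hpa_sketch(struct virtio_memory_regions_hpa *tbl,
 *		uint32_t nregions, uint64_t gpa)
 *	{
 *		uint32_t i;
 *
 *		for (i = 0; i < nregions; i++) {
 *			if (gpa >= tbl[i].guest_phys_address &&
 *				gpa < tbl[i].guest_phys_address_end)
 *				return gpa + tbl[i].host_phys_addr_offset;
 *		}
 *		return 0;
 *	}
 */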
2545
2546 /*
2547  * A new device is added to a data core. First the device is added to the
2548  * main linked list and then allocated to a specific data core.
2549  */
2550 static int
2551 new_device (struct virtio_net *dev)
2552 new_device(struct virtio_net *dev)
2553         struct virtio_net_data_ll *ll_dev;
2554         int lcore, core_add = 0;
2555         uint32_t device_num_min = num_devices;
2556         struct vhost_dev *vdev;
2557         uint32_t regionidx;
2558
2559         vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2560         if (vdev == NULL) {
2561                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2562                         dev->device_fh);
2563                 return -1;
2564         }
2565         vdev->dev = dev;
2566         dev->priv = vdev;
2567
2568         if (zero_copy) {
2569                 vdev->nregions_hpa = dev->mem->nregions;
2570                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2571                         vdev->nregions_hpa
2572                                 += check_hpa_regions(
2573                                         dev->mem->regions[regionidx].guest_phys_address
2574                                         + dev->mem->regions[regionidx].address_offset,
2575                                         dev->mem->regions[regionidx].memory_size);
2576
2577                 }
2578
2579                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2580                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2581                         CACHE_LINE_SIZE);
2582                 if (vdev->regions_hpa == NULL) {
2583                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2584                         rte_free(vdev);
2585                         return -1;
2586                 }
2587
2588
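		/*
		 * fill_hpa_memory_regions() is expected to produce exactly the
		 * number of sub-regions predicted above via check_hpa_regions();
		 * any mismatch indicates an inconsistent region table.
		 */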
2589                 if (fill_hpa_memory_regions(
2590                         vdev->regions_hpa, dev->mem
2591                         ) != vdev->nregions_hpa) {
2592
2593                         RTE_LOG(ERR, VHOST_CONFIG,
2594                                 "hpa memory regions number mismatch: "
2595                                 "[%d]\n", vdev->nregions_hpa);
2596                         rte_free(vdev->regions_hpa);
2597                         rte_free(vdev);
2598                         return -1;
2599                 }
2600         }
2601
2602
2603         /* Add device to main ll */
2604         ll_dev = get_data_ll_free_entry(&ll_root_free);
2605         if (ll_dev == NULL) {
2606                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2607                         "of %d devices per core has been reached\n",
2608                         dev->device_fh, num_devices);
2609                 if (vdev->regions_hpa)
2610                         rte_free(vdev->regions_hpa);
2611                 rte_free(vdev);
2612                 return -1;
2613         }
2614         ll_dev->vdev = vdev;
2615         add_data_ll_entry(&ll_root_used, ll_dev);
2616         vdev->vmdq_rx_q
2617                 = dev->device_fh * (num_queues / num_devices);
2618
2619         if (zero_copy) {
2620                 uint32_t index = vdev->vmdq_rx_q;
2621                 uint32_t count_in_ring, i;
2622                 struct mbuf_table *tx_q;
2623
2624                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2625
2626                 LOG_DEBUG(VHOST_CONFIG,
2627                         "(%"PRIu64") in new_device: mbuf count in mempool "
2628                         "before attach is: %d\n",
2629                         dev->device_fh,
2630                         rte_mempool_count(vpool_array[index].pool));
2631                 LOG_DEBUG(VHOST_CONFIG,
2632                         "(%"PRIu64") in new_device: mbuf count in ring "
2633                         "before attach is: %d\n",
2634                         dev->device_fh, count_in_ring);
2635
2636                 /*
2637                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2638                  */
2639                 for (i = 0; i < count_in_ring; i++)
2640                         attach_rxmbuf_zcp(dev);
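                /*
                 * Each attach_rxmbuf_zcp() call takes one mbuf out of the
                 * ring, attaches it to a guest-supplied buffer and returns
                 * it to the pool, which the mempool/ring counts printed
                 * below should reflect.
                 */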
2641
2642                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2643                         "mempool after attach is: %d\n",
2644                         dev->device_fh,
2645                         rte_mempool_count(vpool_array[index].pool));
2646                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2647                         "ring after attach is: %d\n",
2648                         dev->device_fh,
2649                         rte_ring_count(vpool_array[index].ring));
2650
2651                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2652                 tx_q->txq_id = vdev->vmdq_rx_q;
2653
2654                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2655                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2656
2657                         LOG_DEBUG(VHOST_CONFIG,
2658                                 "(%"PRIu64") In new_device: Failed to start "
2659                                 "tx queue:%d\n",
2660                                 dev->device_fh, vdev->vmdq_rx_q);
2661
2662                         mbuf_destroy_zcp(vpool);
2663                         rte_free(vdev->regions_hpa);
2664                         rte_free(vdev);
2665                         return -1;
2666                 }
2667
2668                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2669                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2670
2671                         LOG_DEBUG(VHOST_CONFIG,
2672                                 "(%"PRIu64") In new_device: Failed to start "
2673                                 "rx queue:%d\n",
2674                                 dev->device_fh, vdev->vmdq_rx_q);
2675
2676                         /* Stop the TX queue. */
2677                         if (rte_eth_dev_tx_queue_stop(ports[0],
2678                                 vdev->vmdq_rx_q) != 0) {
2679                                 LOG_DEBUG(VHOST_CONFIG,
2680                                         "(%"PRIu64") In new_device: Failed to "
2681                                         "stop tx queue:%d\n",
2682                                         dev->device_fh, vdev->vmdq_rx_q);
2683                         }
2684
2685                         mbuf_destroy_zcp(vpool);
2686                         rte_free(vdev->regions_hpa);
2687                         rte_free(vdev);
2688                         return -1;
2689                 }
2690
2691         }
2692
2693         /* reset ready flag */
2694         vdev->ready = DEVICE_MAC_LEARNING;
2695         vdev->remove = 0;
2696
2697         /* Find a suitable lcore to add the device. */
2698         RTE_LCORE_FOREACH_SLAVE(lcore) {
2699                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2700                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2701                         core_add = lcore;
2702                 }
2703         }
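        /*
         * For example, if the slave cores currently host 2, 0 and 1 devices
         * respectively, the core with 0 devices is chosen above as the least
         * loaded one.
         */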
2704         /* Add device to lcore ll */
2705         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2706         if (ll_dev == NULL) {
2707                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2708                 vdev->ready = DEVICE_SAFE_REMOVE;
2709                 destroy_device(dev);
2710                 if (vdev->regions_hpa)
2711                         rte_free(vdev->regions_hpa);
2712                 rte_free(vdev);
2713                 return -1;
2714         }
2715         ll_dev->vdev = vdev;
2716         vdev->coreid = core_add;
2717
2718         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2719
2720         /* Initialize device stats */
2721         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2722
2723         /* Disable notifications. */
2724         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2725         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2726         lcore_info[vdev->coreid].lcore_ll->device_num++;
2727         dev->flags |= VIRTIO_DEV_RUNNING;
2728
2729         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2730
2731         return 0;
2732 }
2733
2734 /*
2735  * These callbacks allow devices to be added to or removed from a data core
2736  * when their configuration has fully completed.
2737  */
2738 static const struct virtio_net_device_ops virtio_net_device_ops =
2739 {
2740         .new_device =  new_device,
2741         .destroy_device = destroy_device,
2742 };
2743
2744 /*
2745  * This thread wakes up periodically to print statistics, if the user has
2746  * enabled them.
2747  */
2748 static void
2749 print_stats(void)
2750 {
2751         struct virtio_net_data_ll *dev_ll;
2752         uint64_t tx_dropped, rx_dropped;
2753         uint64_t tx, tx_total, rx, rx_total;
2754         uint32_t device_fh;
2755         const char clr[] = { 27, '[', '2', 'J', '\0' };
2756         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2757
2758         while(1) {
2759                 sleep(enable_stats);
2760
2761                 /* Clear screen and move to top left */
2762                 printf("%s%s", clr, top_left);
2763
2764                 printf("\nDevice statistics ====================================");
2765
2766                 dev_ll = ll_root_used;
2767                 while (dev_ll != NULL) {
2768                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2769                         tx_total = dev_statistics[device_fh].tx_total;
2770                         tx = dev_statistics[device_fh].tx;
2771                         tx_dropped = tx_total - tx;
2772                         if (zero_copy == 0) {
2773                                 rx_total = rte_atomic64_read(
2774                                         &dev_statistics[device_fh].rx_total_atomic);
2775                                 rx = rte_atomic64_read(
2776                                         &dev_statistics[device_fh].rx_atomic);
2777                         } else {
2778                                 rx_total = dev_statistics[device_fh].rx_total;
2779                                 rx = dev_statistics[device_fh].rx;
2780                         }
2781                         rx_dropped = rx_total - rx;
2782
2783                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2784                                         "\nTX total:            %"PRIu64""
2785                                         "\nTX dropped:          %"PRIu64""
2786                                         "\nTX successful:               %"PRIu64""
2787                                         "\nRX total:            %"PRIu64""
2788                                         "\nRX dropped:          %"PRIu64""
2789                                         "\nRX successful:               %"PRIu64"",
2790                                         device_fh,
2791                                         tx_total,
2792                                         tx_dropped,
2793                                         tx,
2794                                         rx_total,
2795                                         rx_dropped,
2796                                         rx);
2797
2798                         dev_ll = dev_ll->next;
2799                 }
2800                 printf("\n======================================================\n");
2801         }
2802 }
2803
2804 static void
2805 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2806         char *ring_name, uint32_t nb_mbuf)
2807 {
2808         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2809         vpool_array[index].pool
2810                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2811                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2812                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2813                 rte_pktmbuf_init, NULL, socket, 0);
2814         if (vpool_array[index].pool != NULL) {
2815                 vpool_array[index].ring
2816                         = rte_ring_create(ring_name,
2817                                 rte_align32pow2(nb_mbuf + 1),
2818                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2819                 if (likely(vpool_array[index].ring != NULL)) {
2820                         LOG_DEBUG(VHOST_CONFIG,
2821                                 "in setup_mempool_tbl: mbuf count in "
2822                                 "mempool is: %d\n",
2823                                 rte_mempool_count(vpool_array[index].pool));
2824                         LOG_DEBUG(VHOST_CONFIG,
2825                                 "in setup_mempool_tbl: mbuf count in "
2826                                 "ring   is: %d\n",
2827                                 rte_ring_count(vpool_array[index].ring));
2828                 } else {
2829                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2830                                 ring_name);
2831                 }
2832
2833                 /* The mbuf headroom has to be taken into account. */
2834                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2835         } else {
2836                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2837         }
2838 }
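/*
 * Each zero-copy queue therefore owns a (mempool, ring) pair: the mempool
 * holds the mbuf headers, while the ring is used to park the mbufs that are
 * currently not attached to any guest buffer.
 */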
2839
2840
2841 /*
2842  * Main function: performs initialisation and launches the per-lcore
2843  * functions. The CUSE device is also registered here to handle the IOCTLs.
2844  */
2845 int
2846 MAIN(int argc, char *argv[])
2847 {
2848         struct rte_mempool *mbuf_pool = NULL;
2849         unsigned lcore_id, core_id = 0;
2850         unsigned nb_ports, valid_num_ports;
2851         int ret;
2852         uint8_t portid, queue_id = 0;
2853         static pthread_t tid;
2854
2855         /* init EAL */
2856         ret = rte_eal_init(argc, argv);
2857         if (ret < 0)
2858                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
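        /*
         * rte_eal_init() returns the number of arguments it consumed, so
         * strip the EAL arguments and leave only the application options.
         */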
2859         argc -= ret;
2860         argv += ret;
2861
2862         /* parse app arguments */
2863         ret = us_vhost_parse_args(argc, argv);
2864         if (ret < 0)
2865                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2866
2867         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2868                 if (rte_lcore_is_enabled(lcore_id))
2869                         lcore_ids[core_id ++] = lcore_id;
2870
2871         if (rte_lcore_count() > RTE_MAX_LCORE)
2872                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2873
2874         /* set the number of switching cores available */
2875         num_switching_cores = rte_lcore_count() - 1;
2876
2877         /* Get the number of physical ports. */
2878         nb_ports = rte_eth_dev_count();
2879         if (nb_ports > RTE_MAX_ETHPORTS)
2880                 nb_ports = RTE_MAX_ETHPORTS;
2881
2882         /*
2883          * Update the global variable num_ports and the global array ports[],
2884          * and derive the value of valid_num_ports from the number of system ports.
2885          */
2886         valid_num_ports = check_ports_num(nb_ports);
2887
2888         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2889                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2890                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2891                 return -1;
2892         }
2893
2894         if (zero_copy == 0) {
2895                 /* Create the mbuf pool. */
2896                 mbuf_pool = rte_mempool_create(
2897                                 "MBUF_POOL",
2898                                 NUM_MBUFS_PER_PORT
2899                                 * valid_num_ports,
2900                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2901                                 sizeof(struct rte_pktmbuf_pool_private),
2902                                 rte_pktmbuf_pool_init, NULL,
2903                                 rte_pktmbuf_init, NULL,
2904                                 rte_socket_id(), 0);
2905                 if (mbuf_pool == NULL)
2906                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2907
2908                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2909                         vpool_array[queue_id].pool = mbuf_pool;
2910
2911                 if (vm2vm_mode == VM2VM_HARDWARE) {
2912                         /* Enable VT loop back to let L2 switch to do it. */
2913                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2914                         LOG_DEBUG(VHOST_CONFIG,
2915                                 "Enable loop back for L2 switch in vmdq.\n");
2916                 }
2917         } else {
2918                 uint32_t nb_mbuf;
2919                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2920                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2921
2922                 /*
2923                  * Zero copy defers queue RX/TX start to the time when guest
2924                  * finishes its startup and packet buffers from that guest are
2925                  * available.
2926                  */
2927                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2928                 rx_conf_default.rx_drop_en = 0;
2929                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2930                 nb_mbuf = num_rx_descriptor
2931                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2932                         + num_switching_cores * MAX_PKT_BURST;
2933
2934                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2935                         snprintf(pool_name, sizeof(pool_name),
2936                                 "rxmbuf_pool_%u", queue_id);
2937                         snprintf(ring_name, sizeof(ring_name),
2938                                 "rxmbuf_ring_%u", queue_id);
2939                         setup_mempool_tbl(rte_socket_id(), queue_id,
2940                                 pool_name, ring_name, nb_mbuf);
2941                 }
2942
2943                 nb_mbuf = num_tx_descriptor
2944                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2945                                 + num_switching_cores * MAX_PKT_BURST;
2946
2947                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2948                         snprintf(pool_name, sizeof(pool_name),
2949                                 "txmbuf_pool_%u", queue_id);
2950                         snprintf(ring_name, sizeof(ring_name),
2951                                 "txmbuf_ring_%u", queue_id);
2952                         setup_mempool_tbl(rte_socket_id(),
2953                                 (queue_id + MAX_QUEUES),
2954                                 pool_name, ring_name, nb_mbuf);
2955                 }
2956
2957                 if (vm2vm_mode == VM2VM_HARDWARE) {
2958                         /* Enable VT loop back to let L2 switch to do it. */
2959                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2960                         LOG_DEBUG(VHOST_CONFIG,
2961                                 "Enable loop back for L2 switch in vmdq.\n");
2962                 }
2963         }
2964         /* Set log level. */
2965         rte_set_log_level(LOG_LEVEL);
2966
2967         /* initialize all ports */
2968         for (portid = 0; portid < nb_ports; portid++) {
2969                 /* skip ports that are not enabled */
2970                 if ((enabled_port_mask & (1 << portid)) == 0) {
2971                         RTE_LOG(INFO, VHOST_PORT,
2972                                 "Skipping disabled port %d\n", portid);
2973                         continue;
2974                 }
2975                 if (port_init(portid) != 0)
2976                         rte_exit(EXIT_FAILURE,
2977                                 "Cannot initialize network ports\n");
2978         }
2979
2980         /* Initialise all linked lists. */
2981         if (init_data_ll() == -1)
2982                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2983
2984         /* Initialize device stats */
2985         memset(&dev_statistics, 0, sizeof(dev_statistics));
2986
2987         /* Enable stats if the user option is set. */
2988         if (enable_stats)
2989                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
2990
2991         /* Launch all data cores. */
2992         if (zero_copy == 0) {
2993                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2994                         rte_eal_remote_launch(switch_worker,
2995                                 mbuf_pool, lcore_id);
2996                 }
2997         } else {
2998                 uint32_t count_in_mempool, index, i;
2999                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3000                         /* For all RX and TX queues. */
3001                         count_in_mempool
3002                                 = rte_mempool_count(vpool_array[index].pool);
3003
3004                         /*
3005                          * Transfer all un-attached mbufs from vpool.pool
3006                          * to vpool.ring.
3007                          */
3008                         for (i = 0; i < count_in_mempool; i++) {
3009                                 struct rte_mbuf *mbuf
3010                                         = __rte_mbuf_raw_alloc(
3011                                                 vpool_array[index].pool);
3012                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3013                                                 (void *)mbuf);
3014                         }
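                        /*
                         * The mbufs now sit in the ring until a guest device
                         * shows up; new_device() then attaches each of them
                         * to a guest buffer and moves it back to the pool.
                         */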
3015
3016                         LOG_DEBUG(VHOST_CONFIG,
3017                                 "in MAIN: mbuf count in mempool at initial "
3018                                 "is: %d\n", count_in_mempool);
3019                         LOG_DEBUG(VHOST_CONFIG,
3020                                 "in MAIN: mbuf count in ring at initial is:"
3021                                 " %d\n",
3022                                 rte_ring_count(vpool_array[index].ring));
3023                 }
3024
3025                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3026                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3027                                 lcore_id);
3028         }
3029
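        /*
         * VIRTIO_NET_F_MRG_RXBUF allows several receive buffers to be merged
         * for a single packet; with the feature disabled, each packet must
         * fit into one guest buffer.
         */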
3030         if (mergeable == 0)
3031                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3032
3033         /* Register CUSE device to handle IOCTLs. */
3034         ret = rte_vhost_driver_register((char *)&dev_basename);
3035         if (ret != 0)
3036                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3037
3038         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3039
3040         /* Start CUSE session. */
3041         rte_vhost_driver_session_start();
3042         return 0;
3043
3044 }
3045