examples/vhost: check offset with vlan
examples/vhost/main.c (dpdk.git)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 128
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +             \
65                                                         (num_switching_cores*MAX_PKT_BURST) +                   \
66                                                         (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67                                                         (num_switching_cores*MBUF_CACHE_SIZE))
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79         + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100
101 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
103
104 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
106
107 #define JUMBO_FRAME_MAX_SIZE    0x2600
108
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX                       1
112 #define DEVICE_SAFE_REMOVE      2
113
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121
122 /*
123  * These two macros need refining for the legacy and DPDK based front ends:
124  * take the max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
125  * and then adjust to a power of 2.
126  */
127 /*
128  * For the legacy front end: 128 descriptors,
129  * half for virtio headers, the other half for mbufs.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133
134 /* Get the first 4 bytes in the mbuf headroom; the zero copy path uses this to stash per-mbuf state. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136                 + sizeof(struct rte_mbuf)))
137
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
140
141 #define INVALID_PORT_ID 0xFF
142
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154
156 /* Mask used to compare MAC addresses: only the low 6 bytes (48 bits) of an 8-byte load are significant. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
157
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
160
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163
164 /*Number of switching cores enabled*/
165 static uint32_t num_switching_cores = 0;
166
167 /* number of devices/queues to support*/
168 static uint32_t num_queues = 0;
169 static uint32_t num_devices;
170
171 /*
172  * Enable zero copy: packet buffers are DMA'd directly to/from the HW descriptors;
173  * disabled by default.
174  */
175 static uint32_t zero_copy;
176 static int mergeable;
177
178 /* Number of RX/TX descriptors to use (zero copy defaults). */
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181
182 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
183 #define MAX_RING_DESC 4096
184
185 struct vpool {
186         struct rte_mempool *pool;
187         struct rte_ring *ring;
188         uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
190
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193         VM2VM_DISABLED = 0,
194         VM2VM_SOFTWARE = 1,
195         VM2VM_HARDWARE = 2,
196         VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202         PHYS_ADDR_CONTINUOUS = 0,
203         PHYS_ADDR_CROSS_SUBREG = 1,
204         PHYS_ADDR_INVALID = 2,
205         PHYS_ADDR_LAST
206 } hpa_type;
207
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in useconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219
220
221 /* Default configuration for rx and tx thresholds etc. */
222 static struct rte_eth_rxconf rx_conf_default = {
223         .rx_thresh = {
224                 .pthresh = RX_PTHRESH,
225                 .hthresh = RX_HTHRESH,
226                 .wthresh = RX_WTHRESH,
227         },
228         .rx_drop_en = 1,
229 };
230
231 /*
232  * These default values are optimized for use with the Intel(R) 82599 10 GbE
233  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
234  * network controllers and/or network drivers.
235  */
236 static struct rte_eth_txconf tx_conf_default = {
237         .tx_thresh = {
238                 .pthresh = TX_PTHRESH,
239                 .hthresh = TX_HTHRESH,
240                 .wthresh = TX_WTHRESH,
241         },
242         .tx_free_thresh = 0, /* Use PMD default values */
243         .tx_rs_thresh = 0, /* Use PMD default values */
244 };
245
246 /* Empty VMDQ configuration structure. Filled in programmatically. */
247 static struct rte_eth_conf vmdq_conf_default = {
248         .rxmode = {
249                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
250                 .split_hdr_size = 0,
251                 .header_split   = 0, /**< Header Split disabled */
252                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
253                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
254                 /*
255                  * This is necessary for 1G NICs such as the I350; it fixes a bug
256                  * where IPv4 forwarding in the guest could not forward packets
257                  * from one virtio dev to another virtio dev.
258                  */
259                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
260                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
261                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
262         },
263
264         .txmode = {
265                 .mq_mode = ETH_MQ_TX_NONE,
266         },
267         .rx_adv_conf = {
268                 /*
269                  * should be overridden separately in code with
270                  * appropriate values
271                  */
272                 .vmdq_rx_conf = {
273                         .nb_queue_pools = ETH_8_POOLS,
274                         .enable_default_pool = 0,
275                         .default_pool = 0,
276                         .nb_pool_maps = 0,
277                         .pool_map = {{0, 0},},
278                 },
279         },
280 };
281
282 static unsigned lcore_ids[RTE_MAX_LCORE];
283 static uint8_t ports[RTE_MAX_ETHPORTS];
284 static unsigned num_ports = 0; /**< The number of ports specified in command line */
285
286 static const uint16_t external_pkt_default_vlan_tag = 2000;
287 const uint16_t vlan_tags[] = {
288         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
289         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
290         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
291         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
292         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
293         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
294         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
295         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
296 };
297
298 /* ethernet addresses of ports */
299 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
300
301 /* heads for the main used and free linked lists for the data path. */
302 static struct virtio_net_data_ll *ll_root_used = NULL;
303 static struct virtio_net_data_ll *ll_root_free = NULL;
304
305 /* Array of data core structures containing information on individual core linked lists. */
306 static struct lcore_info lcore_info[RTE_MAX_LCORE];
307
308 /* Used for queueing bursts of TX packets. */
309 struct mbuf_table {
310         unsigned len;
311         unsigned txq_id;
312         struct rte_mbuf *m_table[MAX_PKT_BURST];
313 };
314
315 /* TX queue for each data core. */
316 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
317
318 /* TX queue for each virtio device for zero copy. */
319 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
320
321 /* Vlan header struct used to insert vlan tags on TX. */
322 struct vlan_ethhdr {
323         unsigned char   h_dest[ETH_ALEN];
324         unsigned char   h_source[ETH_ALEN];
325         __be16          h_vlan_proto;
326         __be16          h_vlan_TCI;
327         __be16          h_vlan_encapsulated_proto;
328 };
329
330 /* IPv4 Header */
331 struct ipv4_hdr {
332         uint8_t  version_ihl;           /**< version and header length */
333         uint8_t  type_of_service;       /**< type of service */
334         uint16_t total_length;          /**< length of packet */
335         uint16_t packet_id;             /**< packet ID */
336         uint16_t fragment_offset;       /**< fragmentation offset */
337         uint8_t  time_to_live;          /**< time to live */
338         uint8_t  next_proto_id;         /**< protocol ID */
339         uint16_t hdr_checksum;          /**< header checksum */
340         uint32_t src_addr;              /**< source address */
341         uint32_t dst_addr;              /**< destination address */
342 } __attribute__((__packed__));
343
344 /* Header lengths. */
345 #define VLAN_HLEN       4
346 #define VLAN_ETH_HLEN   18
347
348 /* Per-device statistics struct */
349 struct device_statistics {
350         uint64_t tx_total;
351         rte_atomic64_t rx_total_atomic;
352         uint64_t rx_total;
353         uint64_t tx;
354         rte_atomic64_t rx_atomic;
355         uint64_t rx;
356 } __rte_cache_aligned;
357 struct device_statistics dev_statistics[MAX_DEVICES];
358
359 /*
360  * Builds up the correct configuration for VMDQ VLAN pool map
361  * according to the pool & queue limits.
362  */
363 static inline int
364 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
365 {
366         struct rte_eth_vmdq_rx_conf conf;
367         unsigned i;
368
369         memset(&conf, 0, sizeof(conf));
370         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
371         conf.nb_pool_maps = num_devices;
372         conf.enable_loop_back =
373                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
374
375         for (i = 0; i < conf.nb_pool_maps; i++) {
376                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
377                 conf.pool_map[i].pools = (1UL << i);
378         }
379
380         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
381         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
382                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
383         return 0;
384 }
385
386 /*
387  * Validate the device number against the max pool number obtained from
388  * dev_info. If the device number is invalid, print an error message and
389  * return -1. Each device must have its own pool.
390  */
391 static inline int
392 validate_num_devices(uint32_t max_nb_devices)
393 {
394         if (num_devices > max_nb_devices) {
395                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
396                 return -1;
397         }
398         return 0;
399 }
400
401 /*
402  * Initialises a given port using global settings and with the rx buffers
403  * coming from the mempools in vpool_array.
404  */
405 static inline int
406 port_init(uint8_t port)
407 {
408         struct rte_eth_dev_info dev_info;
409         struct rte_eth_conf port_conf;
410         uint16_t rx_rings, tx_rings;
411         uint16_t rx_ring_size, tx_ring_size;
412         int retval;
413         uint16_t q;
414
415         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
416         rte_eth_dev_info_get (port, &dev_info);
417
418         /*configure the number of supported virtio devices based on VMDQ limits */
419         num_devices = dev_info.max_vmdq_pools;
420         num_queues = dev_info.max_rx_queues;
421
422         if (zero_copy) {
423                 rx_ring_size = num_rx_descriptor;
424                 tx_ring_size = num_tx_descriptor;
425                 tx_rings = dev_info.max_tx_queues;
426         } else {
427                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
428                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
429                 tx_rings = (uint16_t)rte_lcore_count();
430         }
431
432         retval = validate_num_devices(MAX_DEVICES);
433         if (retval < 0)
434                 return retval;
435
436         /* Get port configuration. */
437         retval = get_eth_conf(&port_conf, num_devices);
438         if (retval < 0)
439                 return retval;
440
441         if (port >= rte_eth_dev_count()) return -1;
442
443         rx_rings = (uint16_t)num_queues;
444         /* Configure ethernet device. */
445         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
446         if (retval != 0)
447                 return retval;
448
449         /* Setup the queues. */
450         for (q = 0; q < rx_rings; q ++) {
451                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452                                                 rte_eth_dev_socket_id(port), &rx_conf_default,
453                                                 vpool_array[q].pool);
454                 if (retval < 0)
455                         return retval;
456         }
457         for (q = 0; q < tx_rings; q ++) {
458                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
459                                                 rte_eth_dev_socket_id(port), &tx_conf_default);
460                 if (retval < 0)
461                         return retval;
462         }
463
464         /* Start the device. */
465         retval  = rte_eth_dev_start(port);
466         if (retval < 0) {
467                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
468                 return retval;
469         }
470
471         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
472         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
473         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
474                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
475                         (unsigned)port,
476                         vmdq_ports_eth_addr[port].addr_bytes[0],
477                         vmdq_ports_eth_addr[port].addr_bytes[1],
478                         vmdq_ports_eth_addr[port].addr_bytes[2],
479                         vmdq_ports_eth_addr[port].addr_bytes[3],
480                         vmdq_ports_eth_addr[port].addr_bytes[4],
481                         vmdq_ports_eth_addr[port].addr_bytes[5]);
482
483         return 0;
484 }
485
486 /*
487  * Set character device basename.
488  */
489 static int
490 us_vhost_parse_basename(const char *q_arg)
491 {
492         /* Reject basenames that do not fit in the buffer (including the NUL). */
493
494         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
495                 return -1;
496         else
497                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
498
499         return 0;
500 }
501
502 /*
503  * Parse the portmask provided at run time.
504  */
505 static int
506 parse_portmask(const char *portmask)
507 {
508         char *end = NULL;
509         unsigned long pm;
510
511         errno = 0;
512
513         /* parse hexadecimal string */
514         pm = strtoul(portmask, &end, 16);
515         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
516                 return -1;
517
518         if (pm == 0)
519                 return -1;
520
521         return pm;
522
523 }
524
525 /*
526  * Parse num options at run time.
527  */
528 static int
529 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
530 {
531         char *end = NULL;
532         unsigned long num;
533
534         errno = 0;
535
536         /* parse unsigned int string */
537         num = strtoul(q_arg, &end, 10);
538         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
539                 return -1;
540
541         if (num > max_valid_value)
542                 return -1;
543
544         return num;
545
546 }
547
548 /*
549  * Display usage
550  */
551 static void
552 us_vhost_usage(const char *prgname)
553 {
554         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
555         "               --vm2vm [0|1|2]\n"
556         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
557         "               --dev-basename <name>\n"
558         "               --nb-devices ND\n"
559         "               -p PORTMASK: Set mask for ports to be used by application\n"
560         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
561         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
562         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if rx retries are enabled\n"
563         "               --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if rx retries are enabled\n"
564         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
565         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
566         "               --dev-basename: The basename to be used for the character device.\n"
567         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
568                         "zero copy\n"
569         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
570                         "used only when zero copy is enabled.\n"
571         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
572                         "used only when zero copy is enabled.\n",
573                prgname);
574 }
575
576 /*
577  * Parse the arguments given in the command line of the application.
578  */
579 static int
580 us_vhost_parse_args(int argc, char **argv)
581 {
582         int opt, ret;
583         int option_index;
584         unsigned i;
585         const char *prgname = argv[0];
586         static struct option long_option[] = {
587                 {"vm2vm", required_argument, NULL, 0},
588                 {"rx-retry", required_argument, NULL, 0},
589                 {"rx-retry-delay", required_argument, NULL, 0},
590                 {"rx-retry-num", required_argument, NULL, 0},
591                 {"mergeable", required_argument, NULL, 0},
592                 {"stats", required_argument, NULL, 0},
593                 {"dev-basename", required_argument, NULL, 0},
594                 {"zero-copy", required_argument, NULL, 0},
595                 {"rx-desc-num", required_argument, NULL, 0},
596                 {"tx-desc-num", required_argument, NULL, 0},
597                 {NULL, 0, 0, 0},
598         };
599
600         /* Parse command line */
601         while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
602                 switch (opt) {
603                 /* Portmask */
604                 case 'p':
605                         enabled_port_mask = parse_portmask(optarg);
606                         if (enabled_port_mask == 0) {
607                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608                                 us_vhost_usage(prgname);
609                                 return -1;
610                         }
611                         break;
612
613                 case 0:
614                         /* Enable/disable vm2vm comms. */
615                         if (!strncmp(long_option[option_index].name, "vm2vm",
616                                 MAX_LONG_OPT_SZ)) {
617                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
618                                 if (ret == -1) {
619                                         RTE_LOG(INFO, VHOST_CONFIG,
620                                                 "Invalid argument for "
621                                                 "vm2vm [0|1|2]\n");
622                                         us_vhost_usage(prgname);
623                                         return -1;
624                                 } else {
625                                         vm2vm_mode = (vm2vm_type)ret;
626                                 }
627                         }
628
629                         /* Enable/disable retries on RX. */
630                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
631                                 ret = parse_num_opt(optarg, 1);
632                                 if (ret == -1) {
633                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
634                                         us_vhost_usage(prgname);
635                                         return -1;
636                                 } else {
637                                         enable_retry = ret;
638                                 }
639                         }
640
641                         /* Specify the retry delay time (in microseconds) on RX. */
642                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
643                                 ret = parse_num_opt(optarg, INT32_MAX);
644                                 if (ret == -1) {
645                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
646                                         us_vhost_usage(prgname);
647                                         return -1;
648                                 } else {
649                                         burst_rx_delay_time = ret;
650                                 }
651                         }
652
653                         /* Specify the retries number on RX. */
654                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
655                                 ret = parse_num_opt(optarg, INT32_MAX);
656                                 if (ret == -1) {
657                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
658                                         us_vhost_usage(prgname);
659                                         return -1;
660                                 } else {
661                                         burst_rx_retry_num = ret;
662                                 }
663                         }
664
665                         /* Enable/disable RX mergeable buffers. */
666                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
667                                 ret = parse_num_opt(optarg, 1);
668                                 if (ret == -1) {
669                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
670                                         us_vhost_usage(prgname);
671                                         return -1;
672                                 } else {
673                                         mergeable = !!ret;
674                                         if (ret) {
675                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
676                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
677                                                         = JUMBO_FRAME_MAX_SIZE;
678                                         }
679                                 }
680                         }
681
682                         /* Enable/disable stats. */
683                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
684                                 ret = parse_num_opt(optarg, INT32_MAX);
685                                 if (ret == -1) {
686                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
687                                         us_vhost_usage(prgname);
688                                         return -1;
689                                 } else {
690                                         enable_stats = ret;
691                                 }
692                         }
693
694                         /* Set character device basename. */
695                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
696                                 if (us_vhost_parse_basename(optarg) == -1) {
697                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
698                                         us_vhost_usage(prgname);
699                                         return -1;
700                                 }
701                         }
702
703                         /* Enable/disable rx/tx zero copy. */
704                         if (!strncmp(long_option[option_index].name,
705                                 "zero-copy", MAX_LONG_OPT_SZ)) {
706                                 ret = parse_num_opt(optarg, 1);
707                                 if (ret == -1) {
708                                         RTE_LOG(INFO, VHOST_CONFIG,
709                                                 "Invalid argument"
710                                                 " for zero-copy [0|1]\n");
711                                         us_vhost_usage(prgname);
712                                         return -1;
713                                 } else
714                                         zero_copy = ret;
715
716                                 if (zero_copy) {
717 #ifdef RTE_MBUF_REFCNT
718                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
719                                         "zero copy vhost APP, please "
720                                         "disable RTE_MBUF_REFCNT\n"
721                                         "in config file and then rebuild DPDK "
722                                         "core lib!\n"
723                                         "Otherwise please disable zero copy "
724                                         "flag in command line!\n");
725                                         return -1;
726 #endif
727                                 }
728                         }
729
730                         /* Specify the descriptor number on RX. */
731                         if (!strncmp(long_option[option_index].name,
732                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
733                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
734                                 if ((ret == -1) || (!POWEROF2(ret))) {
735                                         RTE_LOG(INFO, VHOST_CONFIG,
736                                         "Invalid argument for rx-desc-num [0-N], "
737                                         "power of 2 required.\n");
738                                         us_vhost_usage(prgname);
739                                         return -1;
740                                 } else {
741                                         num_rx_descriptor = ret;
742                                 }
743                         }
744
745                         /* Specify the descriptor number on TX. */
746                         if (!strncmp(long_option[option_index].name,
747                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
748                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
749                                 if ((ret == -1) || (!POWEROF2(ret))) {
750                                         RTE_LOG(INFO, VHOST_CONFIG,
751                                         "Invalid argument for tx-desc-num [0-N], "
752                                         "power of 2 required.\n");
753                                         us_vhost_usage(prgname);
754                                         return -1;
755                                 } else {
756                                         num_tx_descriptor = ret;
757                                 }
758                         }
759
760                         break;
761
762                         /* Invalid option - print options. */
763                 default:
764                         us_vhost_usage(prgname);
765                         return -1;
766                 }
767         }
768
769         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
770                 if (enabled_port_mask & (1 << i))
771                         ports[num_ports++] = (uint8_t)i;
772         }
773
774         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
775                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
776                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
777                 return -1;
778         }
779
780         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
781                 RTE_LOG(INFO, VHOST_PORT,
782                         "Vhost zero copy doesn't support software vm2vm, "
783                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
784                 return -1;
785         }
786
787         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
788                 RTE_LOG(INFO, VHOST_PORT,
789                         "Vhost zero copy doesn't support jumbo frame, "
790                         "please specify '--mergeable 0' to disable the "
791                         "mergeable feature.\n");
792                 return -1;
793         }
794
795         return 0;
796 }
797
798 /*
799  * Update the global variable num_ports and the array ports according to the
800  * number of system ports, and return the number of valid ports.
801  */
802 static unsigned check_ports_num(unsigned nb_ports)
803 {
804         unsigned valid_num_ports = num_ports;
805         unsigned portid;
806
807         if (num_ports > nb_ports) {
808                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
809                         num_ports, nb_ports);
810                 num_ports = nb_ports;
811         }
812
813         for (portid = 0; portid < num_ports; portid ++) {
814                 if (ports[portid] >= nb_ports) {
815                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
816                                 ports[portid], (nb_ports - 1));
817                         ports[portid] = INVALID_PORT_ID;
818                         valid_num_ports--;
819                 }
820         }
821         return valid_num_ports;
822 }
823
824 /*
825  * Macro to print out packet contents. Wrapped in debug define so that the
826  * data path is not affected when debug is disabled.
827  */
828 #ifdef DEBUG
829 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
830         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
831         unsigned int index;                                                                                                                                                                                             \
832         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
833                                                                                                                                                                                                                                         \
834         if ((header))                                                                                                                                                                                                   \
835                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
836         else                                                                                                                                                                                                                    \
837                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
838         for (index = 0; index < (size); index++) {                                                                                                                                              \
839                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
840                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
841         }                                                                                                                                                                                                                               \
842         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
843                                                                                                                                                                                                                                         \
844         LOG_DEBUG(VHOST_DATA, "%s", packet);                                                                                                                                                                    \
845 } while(0)
846 #else
847 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
848 #endif
849
850 /*
851  * Function to convert guest physical addresses to vhost physical addresses.
852  * This is used to convert virtio buffer addresses.
853  */
854 static inline uint64_t __attribute__((always_inline))
855 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
856         uint32_t buf_len, hpa_type *addr_type)
857 {
858         struct virtio_memory_regions_hpa *region;
859         uint32_t regionidx;
860         uint64_t vhost_pa = 0;
861
862         *addr_type = PHYS_ADDR_INVALID;
863
864         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
865                 region = &vdev->regions_hpa[regionidx];
866                 if ((guest_pa >= region->guest_phys_address) &&
867                         (guest_pa <= region->guest_phys_address_end)) {
868                         vhost_pa = region->host_phys_addr_offset + guest_pa;
869                         if (likely((guest_pa + buf_len - 1)
870                                 <= region->guest_phys_address_end))
871                                 *addr_type = PHYS_ADDR_CONTINUOUS;
872                         else
873                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
874                         break;
875                 }
876         }
877
878         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
879                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
880                 (void *)(uintptr_t)vhost_pa);
881
882         return vhost_pa;
883 }
884
885 /*
886  * Compares a packet destination MAC address to a device MAC address.
887  */
888 static inline int __attribute__((always_inline))
889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
890 {
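        /* XOR the two 8-byte loads and keep only the low 6 bytes (the MAC on a little-endian load); equal MACs give zero. */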
891         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
892 }
893
894 /*
895  * This function learns the MAC address of the device and registers this along with a
896  * vlan tag to a VMDQ.
897  */
898 static int
899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
900 {
901         struct ether_hdr *pkt_hdr;
902         struct virtio_net_data_ll *dev_ll;
903         struct virtio_net *dev = vdev->dev;
904         int i, ret;
905
906         /* Learn MAC address of guest device from packet */
907         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
908
909         dev_ll = ll_root_used;
910
911         while (dev_ll != NULL) {
912                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
913                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
914                         return -1;
915                 }
916                 dev_ll = dev_ll->next;
917         }
918
919         for (i = 0; i < ETHER_ADDR_LEN; i++)
920                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
921
922         /* vlan_tag currently uses the device_id. */
923         vdev->vlan_tag = vlan_tags[dev->device_fh];
924
925         /* Print out VMDQ registration info. */
926         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
927                 dev->device_fh,
928                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
929                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
930                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
931                 vdev->vlan_tag);
932
933         /* Register the MAC address. */
934         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
935         if (ret)
936                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
937                                         dev->device_fh);
938
939         /* Enable stripping of the vlan tag as we handle routing. */
940         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
941
942         /* Set device as ready for RX. */
943         vdev->ready = DEVICE_RX;
944
945         return 0;
946 }
947
948 /*
949  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
950  * queue before disabling RX on the device.
951  */
952 static inline void
953 unlink_vmdq(struct vhost_dev *vdev)
954 {
955         unsigned i = 0;
956         unsigned rx_count;
957         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
958
959         if (vdev->ready == DEVICE_RX) {
960                 /*clear MAC and VLAN settings*/
961                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
962                 for (i = 0; i < 6; i++)
963                         vdev->mac_address.addr_bytes[i] = 0;
964
965                 vdev->vlan_tag = 0;
966
967                 /*Clear out the receive buffers*/
968                 rx_count = rte_eth_rx_burst(ports[0],
969                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
970
971                 while (rx_count) {
972                         for (i = 0; i < rx_count; i++)
973                                 rte_pktmbuf_free(pkts_burst[i]);
974
975                         rx_count = rte_eth_rx_burst(ports[0],
976                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
977                 }
978
979                 vdev->ready = DEVICE_MAC_LEARNING;
980         }
981 }
982
983 /*
984  * Check if the packet destination MAC address is for a local device. If so then put
985  * the packet on that device's RX queue. If not then return.
986  */
987 static inline int __attribute__((always_inline))
988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
989 {
990         struct virtio_net_data_ll *dev_ll;
991         struct ether_hdr *pkt_hdr;
992         uint64_t ret = 0;
993         struct virtio_net *dev = vdev->dev;
994         struct virtio_net *tdev; /* destination virtio device */
995
996         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
997
998         /*get the used devices list*/
999         dev_ll = ll_root_used;
1000
1001         while (dev_ll != NULL) {
1002                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1003                                           &dev_ll->vdev->mac_address)) {
1004
1005                         /* Drop the packet if the TX packet is destined for the TX device. */
1006                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1007                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1008                                                         dev->device_fh);
1009                                 return 0;
1010                         }
1011                         tdev = dev_ll->vdev->dev;
1012
1013
1014                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1015
1016                         if (unlikely(dev_ll->vdev->remove)) {
1017                                 /*drop the packet if the device is marked for removal*/
1018                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1019                         } else {
1020                                 /*send the packet to the local virtio device*/
1021                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1022                                 if (enable_stats) {
1023                                         rte_atomic64_add(
1024                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1025                                         1);
1026                                         rte_atomic64_add(
1027                                         &dev_statistics[tdev->device_fh].rx_atomic,
1028                                         ret);
1029                                         dev_statistics[tdev->device_fh].tx_total++;
1030                                         dev_statistics[tdev->device_fh].tx += ret;
1031                                 }
1032                         }
1033
1034                         return 0;
1035                 }
1036                 dev_ll = dev_ll->next;
1037         }
1038
1039         return -1;
1040 }
1041
1042 /*
1043  * Check if the destination MAC of a packet belongs to a local VM,
1044  * and if so get its vlan tag and offset.
1045  */
1046 static inline int __attribute__((always_inline))
1047 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1048         uint32_t *offset, uint16_t *vlan_tag)
1049 {
1050         struct virtio_net_data_ll *dev_ll = ll_root_used;
1051         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1052
1053         while (dev_ll != NULL) {
1054                 if ((dev_ll->vdev->ready == DEVICE_RX)
1055                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1056                 &dev_ll->vdev->mac_address)) {
1057                         /*
1058                          * Drop the packet if the TX packet is
1059                          * destined for the TX device.
1060                          */
1061                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1062                                 LOG_DEBUG(VHOST_DATA,
1063                                 "(%"PRIu64") TX: Source and destination"
1064                                 " MAC addresses are the same. Dropping "
1065                                 "packet.\n",
1066                                 dev_ll->vdev->dev->device_fh);
1067                                 return -1;
1068                         }
1069
1070                         /*
1071                          * HW vlan strip reduces the packet length by the
1072                          * length of the vlan tag, so the packet length
1073                          * needs to be restored by adding it back.
1074                          */
1075                         *offset = VLAN_HLEN;
1076                         *vlan_tag =
1077                         (uint16_t)
1078                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1079
1080                         LOG_DEBUG(VHOST_DATA,
1081                         "(%"PRIu64") TX: pkt to local VM device id:"
1082                         "(%"PRIu64") vlan tag: %d.\n",
1083                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1084                         *vlan_tag);
1085
1086                         break;
1087                 }
1088                 dev_ll = dev_ll->next;
1089         }
1090         return 0;
1091 }
1092
1093 /*
1094  * This function routes the TX packet to the correct interface. This may be a local device
1095  * or the physical port.
1096  */
1097 static inline void __attribute__((always_inline))
1098 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1099 {
1100         struct mbuf_table *tx_q;
1101         struct rte_mbuf **m_table;
1102         unsigned len, ret, offset = 0;
1103         const uint16_t lcore_id = rte_lcore_id();
1104         struct virtio_net *dev = vdev->dev;
1105
1106         /*check if destination is local VM*/
1107         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1108                 rte_pktmbuf_free(m);
1109                 return;
1110         }
1111
1112         if (vm2vm_mode == VM2VM_HARDWARE) {
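                /*
                 * Drop the packet if it is destined for the sending device itself,
                 * or if there is not enough tailroom to restore the VLAN tag length
                 * removed by HW vlan stripping.
                 */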
1113                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
1114                         offset > rte_pktmbuf_tailroom(m)) {
1115                         rte_pktmbuf_free(m);
1116                         return;
1117                 }
1118         }
1119
1120         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1121
1122         /*Add packet to the port tx queue*/
1123         tx_q = &lcore_tx_queue[lcore_id];
1124         len = tx_q->len;
1125
1126         m->ol_flags = PKT_TX_VLAN_PKT;
1127
1128         m->data_len += offset;
1129         m->pkt_len += offset;
1130
1131         m->vlan_tci = vlan_tag;
1132
1133         tx_q->m_table[len] = m;
1134         len++;
1135         if (enable_stats) {
1136                 dev_statistics[dev->device_fh].tx_total++;
1137                 dev_statistics[dev->device_fh].tx++;
1138         }
1139
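        /* Flush the queue to the NIC once a full burst has accumulated. */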
1140         if (unlikely(len == MAX_PKT_BURST)) {
1141                 m_table = (struct rte_mbuf **)tx_q->m_table;
1142                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1143                 /* Free any buffers not handled by TX and update the port stats. */
1144                 if (unlikely(ret < len)) {
1145                         do {
1146                                 rte_pktmbuf_free(m_table[ret]);
1147                         } while (++ret < len);
1148                 }
1149
1150                 len = 0;
1151         }
1152
1153         tx_q->len = len;
1154         return;
1155 }
1156 /*
1157  * This function is called by each data core. It handles all RX/TX registered with the
1158  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1159  * with all devices in the main linked list.
1160  */
1161 static int
1162 switch_worker(void *arg)
1163 {
1164         struct rte_mempool *mbuf_pool = arg;
1165         struct virtio_net *dev = NULL;
1166         struct vhost_dev *vdev = NULL;
1167         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1168         struct virtio_net_data_ll *dev_ll;
1169         struct mbuf_table *tx_q;
1170         volatile struct lcore_ll_info *lcore_ll;
1171         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1172         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1173         unsigned ret, i;
1174         const uint16_t lcore_id = rte_lcore_id();
1175         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1176         uint16_t rx_count = 0;
1177         uint16_t tx_count;
1178         uint32_t retry = 0;
1179
1180         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1181         lcore_ll = lcore_info[lcore_id].lcore_ll;
1182         prev_tsc = 0;
1183
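        /* Map this lcore to its NIC TX queue: the queue id is the core's index in lcore_ids. */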
1184         tx_q = &lcore_tx_queue[lcore_id];
1185         for (i = 0; i < num_cores; i ++) {
1186                 if (lcore_ids[i] == lcore_id) {
1187                         tx_q->txq_id = i;
1188                         break;
1189                 }
1190         }
1191
1192         while(1) {
1193                 cur_tsc = rte_rdtsc();
1194                 /*
1195                  * TX burst queue drain
1196                  */
1197                 diff_tsc = cur_tsc - prev_tsc;
1198                 if (unlikely(diff_tsc > drain_tsc)) {
1199
1200                         if (tx_q->len) {
1201                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1202
1203                                 /*Tx any packets in the queue*/
1204                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1205                                                                            (struct rte_mbuf **)tx_q->m_table,
1206                                                                            (uint16_t)tx_q->len);
1207                                 if (unlikely(ret < tx_q->len)) {
1208                                         do {
1209                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1210                                         } while (++ret < tx_q->len);
1211                                 }
1212
1213                                 tx_q->len = 0;
1214                         }
1215
1216                         prev_tsc = cur_tsc;
1217
1218                 }
1219
1220                 rte_prefetch0(lcore_ll->ll_root_used);
1221                 /*
1222                  * Inform the configuration core that we have exited the linked list and that no devices are
1223                  * in use if requested.
1224                  */
1225                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1226                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1227
1228                 /*
1229                  * Process devices
1230                  */
1231                 dev_ll = lcore_ll->ll_root_used;
1232
1233                 while (dev_ll != NULL) {
1234                         /*get virtio device ID*/
1235                         vdev = dev_ll->vdev;
1236                         dev = vdev->dev;
1237
1238                         if (unlikely(vdev->remove)) {
1239                                 dev_ll = dev_ll->next;
1240                                 unlink_vmdq(vdev);
1241                                 vdev->ready = DEVICE_SAFE_REMOVE;
1242                                 continue;
1243                         }
1244                         if (likely(vdev->ready == DEVICE_RX)) {
1245                                 /*Handle guest RX*/
1246                                 rx_count = rte_eth_rx_burst(ports[0],
1247                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1248
1249                                 if (rx_count) {
1250                                         /*
1251                                         * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1252                                         * Here MAX_PKT_BURST must be less than the virtio queue size.
1253                                         */
1254                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1255                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1256                                                         rte_delay_us(burst_rx_delay_time);
1257                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1258                                                                 break;
1259                                                 }
1260                                         }
1261                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1262                                         if (enable_stats) {
1263                                                 rte_atomic64_add(
1264                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1265                                                 rx_count);
1266                                                 rte_atomic64_add(
1267                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1268                                         }
1269                                         while (likely(rx_count)) {
1270                                                 rx_count--;
1271                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1272                                         }
1273
1274                                 }
1275                         }
1276
1277                         if (likely(!vdev->remove)) {
1278                                 /* Handle guest TX*/
1279                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1280                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1281                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1282                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1283                                                 while (tx_count--)
1284                                                         rte_pktmbuf_free(pkts_burst[tx_count]);
1285                                         }
1286                                 }
1287                                 while (tx_count)
1288                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1289                         }
1290
1291                         /*move to the next device in the list*/
1292                         dev_ll = dev_ll->next;
1293                 }
1294         }
1295
1296         return 0;
1297 }
1298
1299 /*
1300  * This function gets the number of available ring entries for zero copy RX.
1301  * Only one thread will call this function for a particular virtio device,
1302  * so it is designed as a non-thread-safe function.
1303  */
1304 static inline uint32_t __attribute__((always_inline))
1305 get_available_ring_num_zcp(struct virtio_net *dev)
1306 {
1307         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1308         uint16_t avail_idx;
1309
1310         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1311         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1312 }
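/*
 * In other words, this returns how many descriptors the guest has posted to
 * the avail ring that the host has not yet reserved: avail->idx is advanced
 * by the guest, while last_used_idx_res is the host-side reservation cursor
 * advanced in get_available_ring_index_zcp() below.
 */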
1313
1314 /*
1315  * This function gets available ring indexes for zero copy RX,
1316  * retrying 'burst_rx_retry_num' times until it gets enough ring indexes.
1317  * Only one thread will call this function for a particular virtio device,
1318  * so it is designed as a non-thread-safe function.
1319  */
1320 static inline uint32_t __attribute__((always_inline))
1321 get_available_ring_index_zcp(struct virtio_net *dev,
1322         uint16_t *res_base_idx, uint32_t count)
1323 {
1324         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1325         uint16_t avail_idx;
1326         uint32_t retry = 0;
1327         uint16_t free_entries;
1328
1329         *res_base_idx = vq->last_used_idx_res;
1330         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1331         free_entries = (avail_idx - *res_base_idx);
1332
1333         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1334                         "avail idx: %d, "
1335                         "res base idx:%d, free entries:%d\n",
1336                         dev->device_fh, avail_idx, *res_base_idx,
1337                         free_entries);
1338
1339         /*
1340          * If retry is enabled and the queue is full then we wait
1341          * and retry to avoid packet loss.
1342          */
1343         if (enable_retry && unlikely(count > free_entries)) {
1344                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1345                         rte_delay_us(burst_rx_delay_time);
1346                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1347                         free_entries = (avail_idx - *res_base_idx);
1348                         if (count <= free_entries)
1349                                 break;
1350                 }
1351         }
1352
1353         /*check that we have enough buffers*/
1354         if (unlikely(count > free_entries))
1355                 count = free_entries;
1356
1357         if (unlikely(count == 0)) {
1358                 LOG_DEBUG(VHOST_DATA,
1359                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1360                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1361                         dev->device_fh, avail_idx,
1362                         *res_base_idx, free_entries);
1363                 return 0;
1364         }
1365
1366         vq->last_used_idx_res = *res_base_idx + count;
1367
1368         return count;
1369 }
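/*
 * Usage sketch (mirrors the call in attach_rxmbuf_zcp() below); note that the
 * ring arithmetic relies on free_entries being a uint16_t, so the difference
 * wraps correctly when avail->idx overflows. Illustration only, assuming 'dev'
 * and its RX virtqueue are already set up:
 *
 *     uint16_t res_base_idx, desc_idx;
 *     struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
 *
 *     if (get_available_ring_index_zcp(dev, &res_base_idx, 1) == 1)
 *             desc_idx = vq->avail->ring[res_base_idx & (vq->size - 1)];
 */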
1370
1371 /*
1372  * This function puts a descriptor back on the used list.
1373  */
1374 static inline void __attribute__((always_inline))
1375 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1376 {
1377         uint16_t res_cur_idx = vq->last_used_idx;
1378         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1379         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1380         rte_compiler_barrier();
1381         *(volatile uint16_t *)&vq->used->idx += 1;
1382         vq->last_used_idx += 1;
1383
1384         /* Kick the guest if necessary. */
1385         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1386                 eventfd_write((int)vq->kickfd, 1);
1387 }
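/*
 * The rte_compiler_barrier() above is what keeps the used ring entry and the
 * used index in order: the id/len fields must be fully written before
 * used->idx is published, otherwise the guest could observe the new index and
 * read a stale entry. last_used_idx is host-local state and needs no such
 * ordering.
 */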
1388
1389 /*
1390  * This function gets an available descriptor from the virtio vring and an
1391  * un-attached mbuf from vpool->ring, then attaches them together. It needs to
1392  * adjust the offset of buff_addr and phys_addr according to the PMD
1393  * implementation, otherwise the frame data may be put in the wrong location in the mbuf.
1394  */
1395 static inline void __attribute__((always_inline))
1396 attach_rxmbuf_zcp(struct virtio_net *dev)
1397 {
1398         uint16_t res_base_idx, desc_idx;
1399         uint64_t buff_addr, phys_addr;
1400         struct vhost_virtqueue *vq;
1401         struct vring_desc *desc;
1402         struct rte_mbuf *mbuf = NULL;
1403         struct vpool *vpool;
1404         hpa_type addr_type;
1405         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1406
1407         vpool = &vpool_array[vdev->vmdq_rx_q];
1408         vq = dev->virtqueue[VIRTIO_RXQ];
1409
1410         do {
1411                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1412                                 1) != 1))
1413                         return;
1414                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1415
1416                 desc = &vq->desc[desc_idx];
1417                 if (desc->flags & VRING_DESC_F_NEXT) {
1418                         desc = &vq->desc[desc->next];
1419                         buff_addr = gpa_to_vva(dev, desc->addr);
1420                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1421                                         &addr_type);
1422                 } else {
1423                         buff_addr = gpa_to_vva(dev,
1424                                         desc->addr + vq->vhost_hlen);
1425                         phys_addr = gpa_to_hpa(vdev,
1426                                         desc->addr + vq->vhost_hlen,
1427                                         desc->len, &addr_type);
1428                 }
1429
1430                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1431                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1432                                 " address found when attaching RX frame buffer"
1433                                 " address!\n", dev->device_fh);
1434                         put_desc_to_used_list_zcp(vq, desc_idx);
1435                         continue;
1436                 }
1437
1438                 /*
1439                  * Check if the frame buffer address from guest crosses
1440                  * sub-region or not.
1441                  */
1442                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1443                         RTE_LOG(ERR, VHOST_DATA,
1444                                 "(%"PRIu64") Frame buffer address cross "
1445                                 "sub-region found when attaching RX frame "
1446                                 "buffer address!\n",
1447                                 dev->device_fh);
1448                         put_desc_to_used_list_zcp(vq, desc_idx);
1449                         continue;
1450                 }
1451         } while (unlikely(phys_addr == 0));
1452
1453         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1454         if (unlikely(mbuf == NULL)) {
1455                 LOG_DEBUG(VHOST_DATA,
1456                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1457                         "ring_sc_dequeue fail.\n",
1458                         dev->device_fh);
1459                 put_desc_to_used_list_zcp(vq, desc_idx);
1460                 return;
1461         }
1462
1463         if (unlikely(vpool->buf_size > desc->len)) {
1464                 LOG_DEBUG(VHOST_DATA,
1465                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1466                         "length(%d) of descriptor idx: %d less than room "
1467                         "size required: %d\n",
1468                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1469                 put_desc_to_used_list_zcp(vq, desc_idx);
1470                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1471                 return;
1472         }
1473
1474         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1475         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1476         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1477         mbuf->data_len = desc->len;
1478         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1479
1480         LOG_DEBUG(VHOST_DATA,
1481                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1482                 "descriptor idx:%d\n",
1483                 dev->device_fh, res_base_idx, desc_idx);
1484
1485         __rte_mbuf_raw_free(mbuf);
1486
1487         return;
1488 }
1489
1490 /*
1491  * Detach an attached packet mbuf -
1492  *  - restore original mbuf address and length values.
1493  *  - reset pktmbuf data and data_len to their default values.
1494  *  All other fields of the given packet mbuf will be left intact.
1495  *
1496  * @param m
1497  *   The attached packet mbuf.
1498  */
1499 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1500 {
1501         const struct rte_mempool *mp = m->pool;
1502         void *buf = RTE_MBUF_TO_BADDR(m);
1503         uint32_t buf_ofs;
1504         uint32_t buf_len = mp->elt_size - sizeof(*m);
1505         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1506
1507         m->buf_addr = buf;
1508         m->buf_len = (uint16_t)buf_len;
1509
1510         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1511                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1512         m->data_off = buf_ofs;
1513
1514         m->data_len = 0;
1515 }
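/*
 * Recycling sketch (this is how txmbuf_clean_zcp() and the RX path below use
 * it; illustration only, assuming 'm' was allocated from a zero copy vpool):
 *
 *     pktmbuf_detach_zcp(m);                        // restore the mbuf's own buffer
 *     rte_ring_sp_enqueue(vpool->ring, (void *)m);  // make it available for re-attach
 */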
1516
1517 /*
1518  * This function is called after packets have been transmitted. It fetches mbufs
1519  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1520  * the used index and kicks the guest if necessary.
1521  */
1522 static inline uint32_t __attribute__((always_inline))
1523 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1524 {
1525         struct rte_mbuf *mbuf;
1526         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1527         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1528         uint32_t index = 0;
1529         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1530
1531         LOG_DEBUG(VHOST_DATA,
1532                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1533                 "clean is: %d\n",
1534                 dev->device_fh, mbuf_count);
1535         LOG_DEBUG(VHOST_DATA,
1536                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1537                 "clean is: %d\n",
1538                 dev->device_fh, rte_ring_count(vpool->ring));
1539
1540         for (index = 0; index < mbuf_count; index++) {
1541                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1542                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1543                         pktmbuf_detach_zcp(mbuf);
1544                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1545
1546                 /* Update used index buffer information. */
1547                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1548                 vq->used->ring[used_idx].len = 0;
1549
1550                 used_idx = (used_idx + 1) & (vq->size - 1);
1551         }
1552
1553         LOG_DEBUG(VHOST_DATA,
1554                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1555                 "clean is: %d\n",
1556                 dev->device_fh, rte_mempool_count(vpool->pool));
1557         LOG_DEBUG(VHOST_DATA,
1558                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1559                 "clean is: %d\n",
1560                 dev->device_fh, rte_ring_count(vpool->ring));
1561         LOG_DEBUG(VHOST_DATA,
1562                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1563                 "vq->last_used_idx:%d\n",
1564                 dev->device_fh, vq->last_used_idx);
1565
1566         vq->last_used_idx += mbuf_count;
1567
1568         LOG_DEBUG(VHOST_DATA,
1569                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1570                 "vq->last_used_idx:%d\n",
1571                 dev->device_fh, vq->last_used_idx);
1572
1573         rte_compiler_barrier();
1574
1575         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1576
1577         /* Kick guest if required. */
1578         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1579                 eventfd_write((int)vq->kickfd, 1);
1580
1581         return 0;
1582 }
1583
1584 /*
1585  * This function is called when a virtio device is destroyed.
1586  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1587  */
1588 static void mbuf_destroy_zcp(struct vpool *vpool)
1589 {
1590         struct rte_mbuf *mbuf = NULL;
1591         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1592
1593         LOG_DEBUG(VHOST_CONFIG,
1594                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1595                 "mbuf_destroy_zcp is: %d\n",
1596                 mbuf_count);
1597         LOG_DEBUG(VHOST_CONFIG,
1598                 "in mbuf_destroy_zcp: mbuf count in ring before "
1599                 "mbuf_destroy_zcp is: %d\n",
1600                 rte_ring_count(vpool->ring));
1601
1602         for (index = 0; index < mbuf_count; index++) {
1603                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1604                 if (likely(mbuf != NULL)) {
1605                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1606                                 pktmbuf_detach_zcp(mbuf);
1607                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1608                 }
1609         }
1610
1611         LOG_DEBUG(VHOST_CONFIG,
1612                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1613                 "mbuf_destroy_zcp is: %d\n",
1614                 rte_mempool_count(vpool->pool));
1615         LOG_DEBUG(VHOST_CONFIG,
1616                 "in mbuf_destroy_zcp: mbuf count in ring after "
1617                 "mbuf_destroy_zcp is: %d\n",
1618                 rte_ring_count(vpool->ring));
1619 }
1620
1621 /*
1622  * This function performs the zero copy RX: it writes the virtio header and updates the used ring for each packet.
1623  */
1624 static inline uint32_t __attribute__((always_inline))
1625 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1626         uint32_t count)
1627 {
1628         struct vhost_virtqueue *vq;
1629         struct vring_desc *desc;
1630         struct rte_mbuf *buff;
1631         /* The virtio_hdr is initialised to 0. */
1632         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1633                 = {{0, 0, 0, 0, 0, 0}, 0};
1634         uint64_t buff_hdr_addr = 0;
1635         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1636         uint32_t head_idx, packet_success = 0;
1637         uint16_t res_cur_idx;
1638
1639         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1640
1641         if (count == 0)
1642                 return 0;
1643
1644         vq = dev->virtqueue[VIRTIO_RXQ];
1645         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1646
1647         res_cur_idx = vq->last_used_idx;
1648         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1649                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1650
1651         /* Retrieve all of the head indexes first to avoid caching issues. */
1652         for (head_idx = 0; head_idx < count; head_idx++)
1653                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1654
1655         /*Prefetch descriptor index. */
1656         rte_prefetch0(&vq->desc[head[packet_success]]);
1657
1658         while (packet_success != count) {
1659                 /* Get descriptor from available ring */
1660                 desc = &vq->desc[head[packet_success]];
1661
1662                 buff = pkts[packet_success];
1663                 LOG_DEBUG(VHOST_DATA,
1664                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1665                         "pkt[%d] descriptor idx: %d\n",
1666                         dev->device_fh, packet_success,
1667                         MBUF_HEADROOM_UINT32(buff));
1668
1669                 PRINT_PACKET(dev,
1670                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1671                         + RTE_PKTMBUF_HEADROOM),
1672                         rte_pktmbuf_data_len(buff), 0);
1673
1674                 /* Buffer address translation for virtio header. */
1675                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1676                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1677
1678                 /*
1679                  * If the descriptors are chained the header and data are
1680                  * placed in separate buffers.
1681                  */
1682                 if (desc->flags & VRING_DESC_F_NEXT) {
1683                         desc->len = vq->vhost_hlen;
1684                         desc = &vq->desc[desc->next];
1685                         desc->len = rte_pktmbuf_data_len(buff);
1686                 } else {
1687                         desc->len = packet_len;
1688                 }
1689
1690                 /* Update used ring with desc information */
1691                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1692                         = head[packet_success];
1693                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1694                         = packet_len;
1695                 res_cur_idx++;
1696                 packet_success++;
1697
1698                 /* A header is required per buffer. */
1699                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1700                         (const void *)&virtio_hdr, vq->vhost_hlen);
1701
1702                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1703
1704                 if (likely(packet_success < count)) {
1705                         /* Prefetch descriptor index. */
1706                         rte_prefetch0(&vq->desc[head[packet_success]]);
1707                 }
1708         }
1709
1710         rte_compiler_barrier();
1711
1712         LOG_DEBUG(VHOST_DATA,
1713                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1714                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1715                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1716
1717         *(volatile uint16_t *)&vq->used->idx += count;
1718         vq->last_used_idx += count;
1719
1720         LOG_DEBUG(VHOST_DATA,
1721                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1722                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1723                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1724
1725         /* Kick the guest if necessary. */
1726         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1727                 eventfd_write((int)vq->kickfd, 1);
1728
1729         return count;
1730 }
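/*
 * Note: unlike the copy path, which calls rte_vhost_enqueue_burst(), the zero
 * copy RX path above does not copy frame data into guest memory: the guest
 * buffer was already attached to the mbuf by attach_rxmbuf_zcp(), so only the
 * virtio header is written and the used ring is updated here.
 */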
1731
1732 /*
1733  * This function routes the TX packet to the correct interface.
1734  * This may be a local device or the physical port.
1735  */
1736 static inline void __attribute__((always_inline))
1737 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1738         uint32_t desc_idx, uint8_t need_copy)
1739 {
1740         struct mbuf_table *tx_q;
1741         struct rte_mbuf **m_table;
1742         struct rte_mbuf *mbuf = NULL;
1743         unsigned len, ret, offset = 0;
1744         struct vpool *vpool;
1745         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1746         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1747
1748         /*Add packet to the port tx queue*/
1749         tx_q = &tx_queue_zcp[vmdq_rx_q];
1750         len = tx_q->len;
1751
1752         /* Allocate an mbuf and populate the structure. */
1753         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1754         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1755         if (unlikely(mbuf == NULL)) {
1756                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1757                 RTE_LOG(ERR, VHOST_DATA,
1758                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1759                         dev->device_fh);
1760                 put_desc_to_used_list_zcp(vq, desc_idx);
1761                 return;
1762         }
1763
1764         if (vm2vm_mode == VM2VM_HARDWARE) {
1765                 /* Avoid using a VLAN tag from any VM for an external packet, such
1766                  * as vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1767                  * selection: the MAC address identifies it as an external packet
1768                  * that should go to the network, while the VLAN tag identifies it
1769                  * as a VM-to-VM packet that should be forwarded to another VM. The
1770                  * hardware cannot resolve this ambiguity, so the packet would be lost.
1771                  */
1772                 vlan_tag = external_pkt_default_vlan_tag;
1773                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1774                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1775                         __rte_mbuf_raw_free(mbuf);
1776                         return;
1777                 }
1778         }
1779
1780         mbuf->nb_segs = m->nb_segs;
1781         mbuf->next = m->next;
1782         mbuf->data_len = m->data_len + offset;
1783         mbuf->pkt_len = mbuf->data_len;
1784         if (unlikely(need_copy)) {
1785                 /* Copy the packet contents to the mbuf. */
1786                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1787                         rte_pktmbuf_mtod(m, void *),
1788                         m->data_len);
1789         } else {
1790                 mbuf->data_off = m->data_off;
1791                 mbuf->buf_physaddr = m->buf_physaddr;
1792                 mbuf->buf_addr = m->buf_addr;
1793         }
1794         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1795         mbuf->vlan_tci = vlan_tag;
1796         mbuf->l2_len = sizeof(struct ether_hdr);
1797         mbuf->l3_len = sizeof(struct ipv4_hdr);
1798         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1799
1800         tx_q->m_table[len] = mbuf;
1801         len++;
1802
1803         LOG_DEBUG(VHOST_DATA,
1804                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1805                 dev->device_fh,
1806                 mbuf->nb_segs,
1807                 (mbuf->next == NULL) ? "null" : "non-null");
1808
1809         if (enable_stats) {
1810                 dev_statistics[dev->device_fh].tx_total++;
1811                 dev_statistics[dev->device_fh].tx++;
1812         }
1813
1814         if (unlikely(len == MAX_PKT_BURST)) {
1815                 m_table = (struct rte_mbuf **)tx_q->m_table;
1816                 ret = rte_eth_tx_burst(ports[0],
1817                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1818
1819                 /*
1820                  * Free any buffers not handled by TX and update
1821                  * the port stats.
1822                  */
1823                 if (unlikely(ret < len)) {
1824                         do {
1825                                 rte_pktmbuf_free(m_table[ret]);
1826                         } while (++ret < len);
1827                 }
1828
1829                 len = 0;
1830                 txmbuf_clean_zcp(dev, vpool);
1831         }
1832
1833         tx_q->len = len;
1834
1835         return;
1836 }
1837
1838 /*
1839  * This function transmits all available packets in the virtio TX queue for one
1840  * virtio-net device. If it is the first packet, it learns the MAC address and
1841  * sets up VMDQ.
1842  */
1843 static inline void __attribute__((always_inline))
1844 virtio_dev_tx_zcp(struct virtio_net *dev)
1845 {
1846         struct rte_mbuf m;
1847         struct vhost_virtqueue *vq;
1848         struct vring_desc *desc;
1849         uint64_t buff_addr = 0, phys_addr;
1850         uint32_t head[MAX_PKT_BURST];
1851         uint32_t i;
1852         uint16_t free_entries, packet_success = 0;
1853         uint16_t avail_idx;
1854         uint8_t need_copy = 0;
1855         hpa_type addr_type;
1856         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1857
1858         vq = dev->virtqueue[VIRTIO_TXQ];
1859         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1860
1861         /* If there are no available buffers then return. */
1862         if (vq->last_used_idx_res == avail_idx)
1863                 return;
1864
1865         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1866
1867         /* Prefetch available ring to retrieve head indexes. */
1868         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1869
1870         /* Get the number of free entries in the ring */
1871         free_entries = (avail_idx - vq->last_used_idx_res);
1872
1873         /* Limit to MAX_PKT_BURST. */
1874         free_entries
1875                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1876
1877         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1878                 dev->device_fh, free_entries);
1879
1880         /* Retrieve all of the head indexes first to avoid caching issues. */
1881         for (i = 0; i < free_entries; i++)
1882                 head[i]
1883                         = vq->avail->ring[(vq->last_used_idx_res + i)
1884                         & (vq->size - 1)];
1885
1886         vq->last_used_idx_res += free_entries;
1887
1888         /* Prefetch descriptor index. */
1889         rte_prefetch0(&vq->desc[head[packet_success]]);
1890         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1891
1892         while (packet_success < free_entries) {
1893                 desc = &vq->desc[head[packet_success]];
1894
1895                 /* Discard first buffer as it is the virtio header */
1896                 desc = &vq->desc[desc->next];
1897
1898                 /* Buffer address translation. */
1899                 buff_addr = gpa_to_vva(dev, desc->addr);
1900                 /* Need to check the extra VLAN_HLEN size for inserting a VLAN tag */
1901                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1902                         &addr_type);
1903
1904                 if (likely(packet_success < (free_entries - 1)))
1905                         /* Prefetch descriptor index. */
1906                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1907
1908                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1909                         RTE_LOG(ERR, VHOST_DATA,
1910                                 "(%"PRIu64") Invalid frame buffer address found "
1911                                 "when TX packets!\n",
1912                                 dev->device_fh);
1913                         packet_success++;
1914                         continue;
1915                 }
1916
1917                 /* Prefetch buffer address. */
1918                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1919
1920                 /*
1921                  * Setup dummy mbuf. This is copied to a real mbuf if
1922                  * transmitted out the physical port.
1923                  */
1924                 m.data_len = desc->len;
1925                 m.nb_segs = 1;
1926                 m.next = NULL;
1927                 m.data_off = 0;
1928                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1929                 m.buf_physaddr = phys_addr;
1930
1931                 /*
1932                  * Check if the frame buffer address from guest crosses
1933                  * sub-region or not.
1934                  */
1935                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1936                         RTE_LOG(ERR, VHOST_DATA,
1937                                 "(%"PRIu64") Frame buffer address cross "
1938                                 "sub-region found when attaching TX frame "
1939                                 "buffer address!\n",
1940                                 dev->device_fh);
1941                         need_copy = 1;
1942                 } else
1943                         need_copy = 0;
1944
1945                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1946
1947                 /*
1948                  * If this is the first received packet we need to learn
1949                  * the MAC and setup VMDQ
1950                  */
1951                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1952                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1953                                 /*
1954                                  * Discard frame if device is scheduled for
1955                                  * removal or a duplicate MAC address is found.
1956                                  */
1957                                 packet_success += free_entries;
1958                                 vq->last_used_idx += packet_success;
1959                                 break;
1960                         }
1961                 }
1962
1963                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1964                 packet_success++;
1965         }
1966 }
1967
1968 /*
1969  * This function is called by each data core. It handles all RX/TX registered
1970  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1971  * addresses are compared with all devices in the main linked list.
1972  */
1973 static int
1974 switch_worker_zcp(__attribute__((unused)) void *arg)
1975 {
1976         struct virtio_net *dev = NULL;
1977         struct vhost_dev  *vdev = NULL;
1978         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1979         struct virtio_net_data_ll *dev_ll;
1980         struct mbuf_table *tx_q;
1981         volatile struct lcore_ll_info *lcore_ll;
1982         const uint64_t drain_tsc
1983                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1984                 * BURST_TX_DRAIN_US;
1985         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1986         unsigned ret;
1987         const uint16_t lcore_id = rte_lcore_id();
1988         uint16_t count_in_ring, rx_count = 0;
1989
1990         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1991
1992         lcore_ll = lcore_info[lcore_id].lcore_ll;
1993         prev_tsc = 0;
1994
1995         while (1) {
1996                 cur_tsc = rte_rdtsc();
1997
1998                 /* TX burst queue drain */
1999                 diff_tsc = cur_tsc - prev_tsc;
2000                 if (unlikely(diff_tsc > drain_tsc)) {
2001                         /*
2002                          * Get mbuf from vpool.pool and detach mbuf and
2003                          * put back into vpool.ring.
2004                          */
2005                         dev_ll = lcore_ll->ll_root_used;
2006                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2007                                 /* Get virtio device ID */
2008                                 vdev = dev_ll->vdev;
2009                                 dev = vdev->dev;
2010
2011                                 if (likely(!vdev->remove)) {
2012                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2013                                         if (tx_q->len) {
2014                                                 LOG_DEBUG(VHOST_DATA,
2015                                                 "TX queue drained after timeout"
2016                                                 " with burst size %u\n",
2017                                                 tx_q->len);
2018
2019                                                 /*
2020                                                  * Tx any packets in the queue
2021                                                  */
2022                                                 ret = rte_eth_tx_burst(
2023                                                         ports[0],
2024                                                         (uint16_t)tx_q->txq_id,
2025                                                         (struct rte_mbuf **)
2026                                                         tx_q->m_table,
2027                                                         (uint16_t)tx_q->len);
2028                                                 if (unlikely(ret < tx_q->len)) {
2029                                                         do {
2030                                                                 rte_pktmbuf_free(
2031                                                                         tx_q->m_table[ret]);
2032                                                         } while (++ret < tx_q->len);
2033                                                 }
2034                                                 tx_q->len = 0;
2035
2036                                                 txmbuf_clean_zcp(dev,
2037                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2038                                         }
2039                                 }
2040                                 dev_ll = dev_ll->next;
2041                         }
2042                         prev_tsc = cur_tsc;
2043                 }
2044
2045                 rte_prefetch0(lcore_ll->ll_root_used);
2046
2047                 /*
2048                  * Inform the configuration core that we have exited the linked
2049                  * list and that no devices are in use if requested.
2050                  */
2051                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2052                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2053
2054                 /* Process devices */
2055                 dev_ll = lcore_ll->ll_root_used;
2056
2057                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2058                         vdev = dev_ll->vdev;
2059                         dev  = vdev->dev;
2060                         if (unlikely(vdev->remove)) {
2061                                 dev_ll = dev_ll->next;
2062                                 unlink_vmdq(vdev);
2063                                 vdev->ready = DEVICE_SAFE_REMOVE;
2064                                 continue;
2065                         }
2066
2067                         if (likely(vdev->ready == DEVICE_RX)) {
2068                                 uint32_t index = vdev->vmdq_rx_q;
2069                                 uint16_t i;
2070                                 count_in_ring
2071                                 = rte_ring_count(vpool_array[index].ring);
2072                                 uint16_t free_entries
2073                                 = (uint16_t)get_available_ring_num_zcp(dev);
2074
2075                                 /*
2076                                  * Attach all mbufs in vpool.ring and put back
2077                                  * into vpool.pool.
2078                                  */
2079                                 for (i = 0;
2080                                 i < RTE_MIN(free_entries,
2081                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2082                                 i++)
2083                                         attach_rxmbuf_zcp(dev);
2084
2085                                 /* Handle guest RX */
2086                                 rx_count = rte_eth_rx_burst(ports[0],
2087                                         vdev->vmdq_rx_q, pkts_burst,
2088                                         MAX_PKT_BURST);
2089
2090                                 if (rx_count) {
2091                                         ret_count = virtio_dev_rx_zcp(dev,
2092                                                         pkts_burst, rx_count);
2093                                         if (enable_stats) {
2094                                                 dev_statistics[dev->device_fh].rx_total
2095                                                         += rx_count;
2096                                                 dev_statistics[dev->device_fh].rx
2097                                                         += ret_count;
2098                                         }
2099                                         while (likely(rx_count)) {
2100                                                 rx_count--;
2101                                                 pktmbuf_detach_zcp(
2102                                                         pkts_burst[rx_count]);
2103                                                 rte_ring_sp_enqueue(
2104                                                         vpool_array[index].ring,
2105                                                         (void *)pkts_burst[rx_count]);
2106                                         }
2107                                 }
2108                         }
2109
2110                         if (likely(!vdev->remove))
2111                                 /* Handle guest TX */
2112                                 virtio_dev_tx_zcp(dev);
2113
2114                         /* Move to the next device in the list */
2115                         dev_ll = dev_ll->next;
2116                 }
2117         }
2118
2119         return 0;
2120 }
2121
2122
2123 /*
2124  * Add an entry to a used linked list. A free entry must first be found
2125  * in the free linked list using get_data_ll_free_entry();
2126  */
2127 static void
2128 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2129         struct virtio_net_data_ll *ll_dev)
2130 {
2131         struct virtio_net_data_ll *ll = *ll_root_addr;
2132
2133         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2134         ll_dev->next = NULL;
2135         rte_compiler_barrier();
2136
2137         /* If ll == NULL then this is the first device. */
2138         if (ll) {
2139                 /* Increment to the tail of the linked list. */
2140                 while (ll->next != NULL)
2141                         ll = ll->next;
2142
2143                 ll->next = ll_dev;
2144         } else {
2145                 *ll_root_addr = ll_dev;
2146         }
2147 }
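/*
 * Usage sketch of the free/used list pairing described above (illustration
 * only; 'vdev' is assumed to be an already initialised vhost_dev):
 *
 *     struct virtio_net_data_ll *ll_dev;
 *
 *     ll_dev = get_data_ll_free_entry(&ll_root_free);
 *     if (ll_dev != NULL) {
 *             ll_dev->vdev = vdev;
 *             add_data_ll_entry(&ll_root_used, ll_dev);
 *     }
 */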
2148
2149 /*
2150  * Remove an entry from a used linked list. The entry must then be added to
2151  * the free linked list using put_data_ll_free_entry().
2152  */
2153 static void
2154 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2155         struct virtio_net_data_ll *ll_dev,
2156         struct virtio_net_data_ll *ll_dev_last)
2157 {
2158         struct virtio_net_data_ll *ll = *ll_root_addr;
2159
2160         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2161                 return;
2162
2163         if (ll_dev == ll)
2164                 *ll_root_addr = ll_dev->next;
2165         else
2166                 if (likely(ll_dev_last != NULL))
2167                         ll_dev_last->next = ll_dev->next;
2168                 else
2169                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2170 }
2171
2172 /*
2173  * Find and return an entry from the free linked list.
2174  */
2175 static struct virtio_net_data_ll *
2176 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2177 {
2178         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2179         struct virtio_net_data_ll *ll_dev;
2180
2181         if (ll_free == NULL)
2182                 return NULL;
2183
2184         ll_dev = ll_free;
2185         *ll_root_addr = ll_free->next;
2186
2187         return ll_dev;
2188 }
2189
2190 /*
2191  * Place an entry back on to the free linked list.
2192  */
2193 static void
2194 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2195         struct virtio_net_data_ll *ll_dev)
2196 {
2197         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2198
2199         if (ll_dev == NULL)
2200                 return;
2201
2202         ll_dev->next = ll_free;
2203         *ll_root_addr = ll_dev;
2204 }
2205
2206 /*
2207  * Creates a linked list of a given size.
2208  */
2209 static struct virtio_net_data_ll *
2210 alloc_data_ll(uint32_t size)
2211 {
2212         struct virtio_net_data_ll *ll_new;
2213         uint32_t i;
2214
2215         /* Malloc and then chain the linked list. */
2216         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2217         if (ll_new == NULL) {
2218                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2219                 return NULL;
2220         }
2221
2222         for (i = 0; i < size - 1; i++) {
2223                 ll_new[i].vdev = NULL;
2224                 ll_new[i].next = &ll_new[i+1];
2225         }
2226         ll_new[i].next = NULL;
2227
2228         return (ll_new);
2229 }
2230
2231 /*
2232  * Create the main linked list along with each individual core's linked list. A used and a free list
2233  * are created to manage entries.
2234  */
2235 static int
2236 init_data_ll (void)
2237 {
2238         int lcore;
2239
2240         RTE_LCORE_FOREACH_SLAVE(lcore) {
2241                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2242                 if (lcore_info[lcore].lcore_ll == NULL) {
2243                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2244                         return -1;
2245                 }
2246
2247                 lcore_info[lcore].lcore_ll->device_num = 0;
2248                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2249                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2250                 if (num_devices % num_switching_cores)
2251                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2252                 else
2253                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2254         }
2255
2256         /* Allocate devices up to a maximum of MAX_DEVICES. */
2257         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2258
2259         return 0;
2260 }
2261
2262 /*
2263  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2264  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2265  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2266  */
2267 static void
2268 destroy_device (volatile struct virtio_net *dev)
2269 {
2270         struct virtio_net_data_ll *ll_lcore_dev_cur;
2271         struct virtio_net_data_ll *ll_main_dev_cur;
2272         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2273         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2274         struct vhost_dev *vdev;
2275         int lcore;
2276
2277         dev->flags &= ~VIRTIO_DEV_RUNNING;
2278
2279         vdev = (struct vhost_dev *)dev->priv;
2280         /*set the remove flag. */
2281         vdev->remove = 1;
2282         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2283                 rte_pause();
2284         }
2285
2286         /* Search for entry to be removed from lcore ll */
2287         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2288         while (ll_lcore_dev_cur != NULL) {
2289                 if (ll_lcore_dev_cur->vdev == vdev) {
2290                         break;
2291                 } else {
2292                         ll_lcore_dev_last = ll_lcore_dev_cur;
2293                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2294                 }
2295         }
2296
2297         if (ll_lcore_dev_cur == NULL) {
2298                 RTE_LOG(ERR, VHOST_CONFIG,
2299                         "(%"PRIu64") Failed to find the device to be destroyed.\n",
2300                         dev->device_fh);
2301                 return;
2302         }
2303
2304         /* Search for entry to be removed from main ll */
2305         ll_main_dev_cur = ll_root_used;
2306         ll_main_dev_last = NULL;
2307         while (ll_main_dev_cur != NULL) {
2308                 if (ll_main_dev_cur->vdev == vdev) {
2309                         break;
2310                 } else {
2311                         ll_main_dev_last = ll_main_dev_cur;
2312                         ll_main_dev_cur = ll_main_dev_cur->next;
2313                 }
2314         }
2315
2316         /* Remove entries from the lcore and main ll. */
2317         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2318         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2319
2320         /* Set the dev_removal_flag on each lcore. */
2321         RTE_LCORE_FOREACH_SLAVE(lcore) {
2322                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2323         }
2324
2325         /*
2326          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2327          * they can no longer access the device removed from the linked lists and that the devices
2328          * are no longer in use.
2329          */
2330         RTE_LCORE_FOREACH_SLAVE(lcore) {
2331                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2332                         rte_pause();
2333                 }
2334         }
2335
2336         /* Add the entries back to the lcore and main free ll.*/
2337         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2338         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2339
2340         /* Decrement number of device on the lcore. */
2341         lcore_info[vdev->coreid].lcore_ll->device_num--;
2342
2343         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2344
2345         if (zero_copy) {
2346                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2347
2348                 /* Stop the RX queue. */
2349                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2350                         LOG_DEBUG(VHOST_CONFIG,
2351                                 "(%"PRIu64") In destroy_device: Failed to stop "
2352                                 "rx queue:%d\n",
2353                                 dev->device_fh,
2354                                 vdev->vmdq_rx_q);
2355                 }
2356
2357                 LOG_DEBUG(VHOST_CONFIG,
2358                         "(%"PRIu64") in destroy_device: Start putting mbufs from "
2359                         "mempool back to ring for RX queue: %d\n",
2360                         dev->device_fh, vdev->vmdq_rx_q);
2361
2362                 mbuf_destroy_zcp(vpool);
2363
2364                 /* Stop the TX queue. */
2365                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2366                         LOG_DEBUG(VHOST_CONFIG,
2367                                 "(%"PRIu64") In destroy_device: Failed to "
2368                                 "stop tx queue:%d\n",
2369                                 dev->device_fh, vdev->vmdq_rx_q);
2370                 }
2371
2372                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2373
2374                 LOG_DEBUG(VHOST_CONFIG,
2375                         "(%"PRIu64") destroy_device: Start putting mbufs from mempool "
2376                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2377                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2378                         dev->device_fh);
2379
2380                 mbuf_destroy_zcp(vpool);
2381                 rte_free(vdev->regions_hpa);
2382         }
2383         rte_free(vdev);
2384
2385 }
2386
2387 /*
2388  * Calculate the number of physically contiguous sub-regions for one particular
2389  * region whose vhost virtual address is contiguous. The particular region
2390  * starts from vva_start, with a size of 'size' given in the argument.
2391  */
2392 static uint32_t
2393 check_hpa_regions(uint64_t vva_start, uint64_t size)
2394 {
2395         uint32_t i, nregions = 0, page_size = getpagesize();
2396         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2397         if (vva_start % page_size) {
2398                 LOG_DEBUG(VHOST_CONFIG,
2399                         "in check_continuous: vva start(%p) mod page_size(%d) "
2400                         "has remainder\n",
2401                         (void *)(uintptr_t)vva_start, page_size);
2402                 return 0;
2403         }
2404         if (size % page_size) {
2405                 LOG_DEBUG(VHOST_CONFIG,
2406                         "in check_continuous: "
2407                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2408                         size, page_size);
2409                 return 0;
2410         }
2411         for (i = 0; i < size - page_size; i = i + page_size) {
2412                 cur_phys_addr
2413                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2414                 next_phys_addr = rte_mem_virt2phy(
2415                         (void *)(uintptr_t)(vva_start + i + page_size));
2416                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2417                         ++nregions;
2418                         LOG_DEBUG(VHOST_CONFIG,
2419                                 "in check_continuous: hva addr:(%p) is not "
2420                                 "continuous with hva addr:(%p), diff:%d\n",
2421                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2422                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2423                                 + page_size), page_size);
2424                         LOG_DEBUG(VHOST_CONFIG,
2425                                 "in check_continuous: hpa addr:(%p) is not "
2426                                 "continuous with hpa addr:(%p), "
2427                                 "diff:(%"PRIu64")\n",
2428                                 (void *)(uintptr_t)cur_phys_addr,
2429                                 (void *)(uintptr_t)next_phys_addr,
2430                                 (next_phys_addr-cur_phys_addr));
2431                 }
2432         }
2433         return nregions;
2434 }
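/*
 * Note: new_device() below adds the value returned here to dev->mem->nregions
 * when computing vdev->nregions_hpa, i.e. it sizes the regions_hpa table that
 * fill_hpa_memory_regions() is designed to populate sub-region by sub-region.
 */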
2435
2436 /*
2437  * Divide each region whose vhost virtual address is contiguous into a few
2438  * sub-regions, making sure the physical addresses within each sub-region are
2439  * contiguous, and fill the offset (to GPA), size and other information of each
2440  * sub-region into regions_hpa.
2441  */
2442 static uint32_t
2443 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2444 {
2445         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2446         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2447
2448         if (mem_region_hpa == NULL)
2449                 return 0;
2450
2451         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2452                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2453                         virtio_memory->regions[regionidx].address_offset;
2454                 mem_region_hpa[regionidx_hpa].guest_phys_address
2455                         = virtio_memory->regions[regionidx].guest_phys_address;
2456                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2457                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2458                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2459                 LOG_DEBUG(VHOST_CONFIG,
2460                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2461                         regionidx_hpa,
2462                         (void *)(uintptr_t)
2463                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2464                 LOG_DEBUG(VHOST_CONFIG,
2465                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2466                         regionidx_hpa,
2467                         (void *)(uintptr_t)
2468                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2469                 for (i = 0, k = 0;
2470                         i < virtio_memory->regions[regionidx].memory_size -
2471                                 page_size;
2472                         i += page_size) {
2473                         cur_phys_addr = rte_mem_virt2phy(
2474                                         (void *)(uintptr_t)(vva_start + i));
2475                         next_phys_addr = rte_mem_virt2phy(
2476                                         (void *)(uintptr_t)(vva_start +
2477                                         i + page_size));
2478                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2479                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2480                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2481                                         k + page_size;
2482                                 mem_region_hpa[regionidx_hpa].memory_size
2483                                         = k + page_size;
2484                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2485                                         "phys addr end  [%d]:(%p)\n",
2486                                         regionidx_hpa,
2487                                         (void *)(uintptr_t)
2488                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2489                                 LOG_DEBUG(VHOST_CONFIG,
2490                                         "in fill_hpa_regions: guest phys addr "
2491                                         "size [%d]:(%p)\n",
2492                                         regionidx_hpa,
2493                                         (void *)(uintptr_t)
2494                                         (mem_region_hpa[regionidx_hpa].memory_size));
2495                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2496                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2497                                 ++regionidx_hpa;
2498                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2499                                         next_phys_addr -
2500                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2501                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2502                                         " phys addr start[%d]:(%p)\n",
2503                                         regionidx_hpa,
2504                                         (void *)(uintptr_t)
2505                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2506                                 LOG_DEBUG(VHOST_CONFIG,
2507                                         "in fill_hpa_regions: host  phys addr "
2508                                         "start[%d]:(%p)\n",
2509                                         regionidx_hpa,
2510                                         (void *)(uintptr_t)
2511                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2512                                 k = 0;
2513                         } else {
2514                                 k += page_size;
2515                         }
2516                 }
2517                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2518                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2519                         + k + page_size;
2520                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2521                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2522                         "[%d]:(%p)\n", regionidx_hpa,
2523                         (void *)(uintptr_t)
2524                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2525                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2526                         "[%d]:(%p)\n", regionidx_hpa,
2527                         (void *)(uintptr_t)
2528                         (mem_region_hpa[regionidx_hpa].memory_size));
2529                 ++regionidx_hpa;
2530         }
2531         return regionidx_hpa;
2532 }
2533
2534 /*
2535  * A new device is added to a data core. The device is first added to the main linked list
2536  * and then allocated to a specific data core.
2537  */
2538 static int
2539 new_device (struct virtio_net *dev)
2540 {
2541         struct virtio_net_data_ll *ll_dev;
2542         int lcore, core_add = 0;
2543         uint32_t device_num_min = num_devices;
2544         struct vhost_dev *vdev;
2545         uint32_t regionidx;
2546
2547         vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2548         if (vdev == NULL) {
2549                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2550                         dev->device_fh);
2551                 return -1;
2552         }
2553         vdev->dev = dev;
2554         dev->priv = vdev;
2555
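        /*
         * For zero copy, pre-compute the guest-physical to host-physical
         * address mapping: count how many host-contiguous regions are
         * needed, then allocate and fill them.
         */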
2556         if (zero_copy) {
2557                 vdev->nregions_hpa = dev->mem->nregions;
2558                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2559                         vdev->nregions_hpa
2560                                 += check_hpa_regions(
2561                                         dev->mem->regions[regionidx].guest_phys_address
2562                                         + dev->mem->regions[regionidx].address_offset,
2563                                         dev->mem->regions[regionidx].memory_size);
2564
2565                 }
2566
2567                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2568                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2569                         CACHE_LINE_SIZE);
2570                 if (vdev->regions_hpa == NULL) {
2571                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2572                         rte_free(vdev);
2573                         return -1;
2574                 }
2575
2576
2577                 if (fill_hpa_memory_regions(
2578                         vdev->regions_hpa, dev->mem
2579                         ) != vdev->nregions_hpa) {
2580
2581                         RTE_LOG(ERR, VHOST_CONFIG,
2582                                 "hpa memory regions number mismatch: "
2583                                 "[%d]\n", vdev->nregions_hpa);
2584                         rte_free(vdev->regions_hpa);
2585                         rte_free(vdev);
2586                         return -1;
2587                 }
2588         }
2589
2590
2591         /* Add device to main ll */
2592         ll_dev = get_data_ll_free_entry(&ll_root_free);
2593         if (ll_dev == NULL) {
2594                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2595                         "of %d devices per core has been reached\n",
2596                         dev->device_fh, num_devices);
2597                 if (vdev->regions_hpa)
2598                         rte_free(vdev->regions_hpa);
2599                 rte_free(vdev);
2600                 return -1;
2601         }
2602         ll_dev->vdev = vdev;
2603         add_data_ll_entry(&ll_root_used, ll_dev);
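        /* Each device gets its own VMDq RX queue, computed from its device fh. */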
2604         vdev->vmdq_rx_q
2605                 = dev->device_fh * (num_queues / num_devices);
2606
2607         if (zero_copy) {
2608                 uint32_t index = vdev->vmdq_rx_q;
2609                 uint32_t count_in_ring, i;
2610                 struct mbuf_table *tx_q;
2611
2612                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2613
2614                 LOG_DEBUG(VHOST_CONFIG,
2615                         "(%"PRIu64") in new_device: mbuf count in mempool "
2616                         "before attach is: %d\n",
2617                         dev->device_fh,
2618                         rte_mempool_count(vpool_array[index].pool));
2619                 LOG_DEBUG(VHOST_CONFIG,
2620                         "(%"PRIu64") in new_device: mbuf count in ring "
2621                         "before attach is: %d\n",
2622                         dev->device_fh, count_in_ring);
2623
2624                 /*
2625                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2626                  */
2627                 for (i = 0; i < count_in_ring; i++)
2628                         attach_rxmbuf_zcp(dev);
2629
2630                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2631                         "mempool after attach is: %d\n",
2632                         dev->device_fh,
2633                         rte_mempool_count(vpool_array[index].pool));
2634                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2635                         "ring after attach is: %d\n",
2636                         dev->device_fh,
2637                         rte_ring_count(vpool_array[index].ring));
2638
2639                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2640                 tx_q->txq_id = vdev->vmdq_rx_q;
2641
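                /*
                 * The queues were configured with deferred start (see
                 * rx/tx_deferred_start in MAIN()); start them now that the
                 * guest's packet buffers are available.
                 */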
2642                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2643                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2644
2645                         LOG_DEBUG(VHOST_CONFIG,
2646                                 "(%"PRIu64") In new_device: Failed to start "
2647                                 "tx queue:%d\n",
2648                                 dev->device_fh, vdev->vmdq_rx_q);
2649
2650                         mbuf_destroy_zcp(vpool);
2651                         rte_free(vdev->regions_hpa);
2652                         rte_free(vdev);
2653                         return -1;
2654                 }
2655
2656                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2657                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2658
2659                         LOG_DEBUG(VHOST_CONFIG,
2660                                 "(%"PRIu64") In new_device: Failed to start "
2661                                 "rx queue:%d\n",
2662                                 dev->device_fh, vdev->vmdq_rx_q);
2663
2664                         /* Stop the TX queue. */
2665                         if (rte_eth_dev_tx_queue_stop(ports[0],
2666                                 vdev->vmdq_rx_q) != 0) {
2667                                 LOG_DEBUG(VHOST_CONFIG,
2668                                         "(%"PRIu64") In new_device: Failed to "
2669                                         "stop tx queue:%d\n",
2670                                         dev->device_fh, vdev->vmdq_rx_q);
2671                         }
2672
2673                         mbuf_destroy_zcp(vpool);
2674                         rte_free(vdev->regions_hpa);
2675                         rte_free(vdev);
2676                         return -1;
2677                 }
2678
2679         }
2680
2681         /* Reset the ready flag. */
2682         vdev->ready = DEVICE_MAC_LEARNING;
2683         vdev->remove = 0;
2684
2685         /* Find a suitable lcore to add the device. */
2686         RTE_LCORE_FOREACH_SLAVE(lcore) {
2687                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2688                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2689                         core_add = lcore;
2690                 }
2691         }
2692         /* Add device to lcore ll */
2693         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2694         if (ll_dev == NULL) {
2695                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2696                 vdev->ready = DEVICE_SAFE_REMOVE;
2697                 destroy_device(dev);
2698                 if (vdev->regions_hpa)
2699                         rte_free(vdev->regions_hpa);
2700                 rte_free(vdev);
2701                 return -1;
2702         }
2703         ll_dev->vdev = vdev;
2704         vdev->coreid = core_add;
2705
2706         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2707
2708         /* Initialize device stats */
2709         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2710
2711         /* Disable notifications. */
2712         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2713         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2714         lcore_info[vdev->coreid].lcore_ll->device_num++;
2715         dev->flags |= VIRTIO_DEV_RUNNING;
2716
2717         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2718
2719         return 0;
2720 }
2721
2722 /*
2723  * These callbacks allow devices to be added to the data core when configuration
2724  * has been fully completed.
2725  */
2726 static const struct virtio_net_device_ops virtio_net_device_ops =
2727 {
2728         .new_device =  new_device,
2729         .destroy_device = destroy_device,
2730 };
2731
2732 /*
2733  * This is a thread that wakes up periodically to print stats if the user has
2734  * enabled them.
2735  */
2736 static void
2737 print_stats(void)
2738 {
2739         struct virtio_net_data_ll *dev_ll;
2740         uint64_t tx_dropped, rx_dropped;
2741         uint64_t tx, tx_total, rx, rx_total;
2742         uint32_t device_fh;
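        /* ANSI escape sequences: clear the screen and move the cursor to the top-left corner. */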
2743         const char clr[] = { 27, '[', '2', 'J', '\0' };
2744         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2745
2746         while(1) {
2747                 sleep(enable_stats);
2748
2749                 /* Clear screen and move to top left */
2750                 printf("%s%s", clr, top_left);
2751
2752                 printf("\nDevice statistics ====================================");
2753
2754                 dev_ll = ll_root_used;
2755                 while (dev_ll != NULL) {
2756                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2757                         tx_total = dev_statistics[device_fh].tx_total;
2758                         tx = dev_statistics[device_fh].tx;
2759                         tx_dropped = tx_total - tx;
2760                         if (zero_copy == 0) {
2761                                 rx_total = rte_atomic64_read(
2762                                         &dev_statistics[device_fh].rx_total_atomic);
2763                                 rx = rte_atomic64_read(
2764                                         &dev_statistics[device_fh].rx_atomic);
2765                         } else {
2766                                 rx_total = dev_statistics[device_fh].rx_total;
2767                                 rx = dev_statistics[device_fh].rx;
2768                         }
2769                         rx_dropped = rx_total - rx;
2770
2771                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2772                                         "\nTX total:            %"PRIu64""
2773                                         "\nTX dropped:          %"PRIu64""
2774                                         "\nTX successful:               %"PRIu64""
2775                                         "\nRX total:            %"PRIu64""
2776                                         "\nRX dropped:          %"PRIu64""
2777                                         "\nRX successful:               %"PRIu64"",
2778                                         device_fh,
2779                                         tx_total,
2780                                         tx_dropped,
2781                                         tx,
2782                                         rx_total,
2783                                         rx_dropped,
2784                                         rx);
2785
2786                         dev_ll = dev_ll->next;
2787                 }
2788                 printf("\n======================================================\n");
2789         }
2790 }
2791
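/*
 * Create a mempool and a companion ring for one zero-copy queue. The ring
 * holds the mbufs of the pool while they are not attached to guest buffers.
 */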
2792 static void
2793 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2794         char *ring_name, uint32_t nb_mbuf)
2795 {
2796         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2797         vpool_array[index].pool
2798                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2799                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2800                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2801                 rte_pktmbuf_init, NULL, socket, 0);
2802         if (vpool_array[index].pool != NULL) {
2803                 vpool_array[index].ring
2804                         = rte_ring_create(ring_name,
2805                                 rte_align32pow2(nb_mbuf + 1),
2806                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2807                 if (likely(vpool_array[index].ring != NULL)) {
2808                         LOG_DEBUG(VHOST_CONFIG,
2809                                 "in setup_mempool_tbl: mbuf count in "
2810                                 "mempool is: %d\n",
2811                                 rte_mempool_count(vpool_array[index].pool));
2812                         LOG_DEBUG(VHOST_CONFIG,
2813                                 "in setup_mempool_tbl: mbuf count in "
2814                                 "ring   is: %d\n",
2815                                 rte_ring_count(vpool_array[index].ring));
2816                 } else {
2817                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2818                                 ring_name);
2819                 }
2820
2821                 /* Need to take the mbuf headroom into account. */
2822                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2823         } else {
2824                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2825         }
2826 }
2827
2828
2829 /*
2830  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2831  * device is also registered here to handle the IOCTLs.
2832  */
2833 int
2834 MAIN(int argc, char *argv[])
2835 {
2836         struct rte_mempool *mbuf_pool = NULL;
2837         unsigned lcore_id, core_id = 0;
2838         unsigned nb_ports, valid_num_ports;
2839         int ret;
2840         uint8_t portid, queue_id = 0;
2841         static pthread_t tid;
2842
2843         /* init EAL */
2844         ret = rte_eal_init(argc, argv);
2845         if (ret < 0)
2846                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2847         argc -= ret;
2848         argv += ret;
2849
2850         /* parse app arguments */
2851         ret = us_vhost_parse_args(argc, argv);
2852         if (ret < 0)
2853                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2854
2855         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2856                 if (rte_lcore_is_enabled(lcore_id))
2857                         lcore_ids[core_id ++] = lcore_id;
2858
2859         if (rte_lcore_count() > RTE_MAX_LCORE)
2860                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
2861
2862         /* Set the number of switching cores available. */
2863         num_switching_cores = rte_lcore_count()-1;
2864
2865         /* Get the number of physical ports. */
2866         nb_ports = rte_eth_dev_count();
2867         if (nb_ports > RTE_MAX_ETHPORTS)
2868                 nb_ports = RTE_MAX_ETHPORTS;
2869
2870         /*
2871          * Update the global var NUM_PORTS and the global array PORTS,
2872          * and get the value of VALID_NUM_PORTS from the number of ports in the system.
2873          */
2874         valid_num_ports = check_ports_num(nb_ports);
2875
2876         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2877                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2878                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2879                 return -1;
2880         }
2881
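        /*
         * Non-zero-copy mode shares one mbuf pool across all queues; zero-copy
         * mode creates a dedicated mempool and ring per RX and TX queue.
         */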
2882         if (zero_copy == 0) {
2883                 /* Create the mbuf pool. */
2884                 mbuf_pool = rte_mempool_create(
2885                                 "MBUF_POOL",
2886                                 NUM_MBUFS_PER_PORT
2887                                 * valid_num_ports,
2888                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2889                                 sizeof(struct rte_pktmbuf_pool_private),
2890                                 rte_pktmbuf_pool_init, NULL,
2891                                 rte_pktmbuf_init, NULL,
2892                                 rte_socket_id(), 0);
2893                 if (mbuf_pool == NULL)
2894                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2895
2896                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2897                         vpool_array[queue_id].pool = mbuf_pool;
2898
2899                 if (vm2vm_mode == VM2VM_HARDWARE) {
2900                         /* Enable VT loop back to let L2 switch to do it. */
2901                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2902                         LOG_DEBUG(VHOST_CONFIG,
2903                                 "Enable loop back for L2 switch in vmdq.\n");
2904                 }
2905         } else {
2906                 uint32_t nb_mbuf;
2907                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2908                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2909
2910                 /*
2911                  * Zero copy defers queue RX/TX start until the guest finishes
2912                  * its startup and packet buffers from that guest are
2913                  * available.
2914                  */
2915                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2916                 rx_conf_default.rx_drop_en = 0;
2917                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
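                /* mbufs per RX queue: descriptors + per-core cache + per-core burst. */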
2918                 nb_mbuf = num_rx_descriptor
2919                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2920                         + num_switching_cores * MAX_PKT_BURST;
2921
2922                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2923                         snprintf(pool_name, sizeof(pool_name),
2924                                 "rxmbuf_pool_%u", queue_id);
2925                         snprintf(ring_name, sizeof(ring_name),
2926                                 "rxmbuf_ring_%u", queue_id);
2927                         setup_mempool_tbl(rte_socket_id(), queue_id,
2928                                 pool_name, ring_name, nb_mbuf);
2929                 }
2930
2931                 nb_mbuf = num_tx_descriptor
2932                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2933                                 + num_switching_cores * MAX_PKT_BURST;
2934
2935                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2936                         snprintf(pool_name, sizeof(pool_name),
2937                                 "txmbuf_pool_%u", queue_id);
2938                         snprintf(ring_name, sizeof(ring_name),
2939                                 "txmbuf_ring_%u", queue_id);
2940                         setup_mempool_tbl(rte_socket_id(),
2941                                 (queue_id + MAX_QUEUES),
2942                                 pool_name, ring_name, nb_mbuf);
2943                 }
2944
2945                 if (vm2vm_mode == VM2VM_HARDWARE) {
2946                         /* Enable VT loop back to let L2 switch to do it. */
2947                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2948                         LOG_DEBUG(VHOST_CONFIG,
2949                                 "Enable loop back for L2 switch in vmdq.\n");
2950                 }
2951         }
2952         /* Set log level. */
2953         rte_set_log_level(LOG_LEVEL);
2954
2955         /* initialize all ports */
2956         for (portid = 0; portid < nb_ports; portid++) {
2957                 /* skip ports that are not enabled */
2958                 if ((enabled_port_mask & (1 << portid)) == 0) {
2959                         RTE_LOG(INFO, VHOST_PORT,
2960                                 "Skipping disabled port %d\n", portid);
2961                         continue;
2962                 }
2963                 if (port_init(portid) != 0)
2964                         rte_exit(EXIT_FAILURE,
2965                                 "Cannot initialize network ports\n");
2966         }
2967
2968         /* Initialise all linked lists. */
2969         if (init_data_ll() == -1)
2970                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2971
2972         /* Initialize device stats */
2973         memset(&dev_statistics, 0, sizeof(dev_statistics));
2974
2975         /* Enable stats if the user option is set. */
2976         if (enable_stats)
2977                 pthread_create(&tid, NULL, (void*)print_stats, NULL );
2978
2979         /* Launch all data cores. */
2980         if (zero_copy == 0) {
2981                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2982                         rte_eal_remote_launch(switch_worker,
2983                                 mbuf_pool, lcore_id);
2984                 }
2985         } else {
2986                 uint32_t count_in_mempool, index, i;
2987                 for (index = 0; index < 2*MAX_QUEUES; index++) {
2988                         /* For all RX and TX queues. */
2989                         count_in_mempool
2990                                 = rte_mempool_count(vpool_array[index].pool);
2991
2992                         /*
2993                          * Transfer all un-attached mbufs from vpool.pool
2994                          * to vpool.ring.
2995                          */
2996                         for (i = 0; i < count_in_mempool; i++) {
2997                                 struct rte_mbuf *mbuf
2998                                         = __rte_mbuf_raw_alloc(
2999                                                 vpool_array[index].pool);
3000                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3001                                                 (void *)mbuf);
3002                         }
3003
3004                         LOG_DEBUG(VHOST_CONFIG,
3005                                 "in MAIN: mbuf count in mempool at initial "
3006                                 "is: %d\n", count_in_mempool);
3007                         LOG_DEBUG(VHOST_CONFIG,
3008                         "in MAIN: mbuf count in ring at initial is:"
3009                                 " %d\n",
3010                                 rte_ring_count(vpool_array[index].ring));
3011                 }
3012
3013                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3014                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3015                                 lcore_id);
3016         }
3017
3018         if (mergeable == 0)
3019                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3020
3021         /* Register CUSE device to handle IOCTLs. */
3022         ret = rte_vhost_driver_register((char *)&dev_basename);
3023         if (ret != 0)
3024                 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3025
3026         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3027
3028         /* Start CUSE session. */
3029         rte_vhost_driver_session_start();
3030         return 0;
3031
3032 }
3033