add prefix to cache line macros
examples/vhost/main.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
                        (num_switching_cores*MAX_PKT_BURST) + \
                        (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
                        (num_switching_cores*MBUF_CACHE_SIZE))
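/*
 * Rough sizing sketch (hypothetical core count): with MAX_QUEUES = 128 and
 * RTE_TEST_RX_DESC_DEFAULT = 1024 the first term alone reserves 131072 mbufs;
 * two switching cores would add 2 * (32 + 512 + 128) = 1344 more.
 */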

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation: the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
        + RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32        /* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600 /* 9728 bytes */

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK-based front ends:
 * take the max vring avail descriptors/entries from the guest, subtract
 * MAX_PKT_BURST, and then round to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for the mbuf.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
                + sizeof(struct rte_mbuf)))
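/*
 * In the zero copy path later in this file these four bytes are used to
 * stash per-mbuf bookkeeping (e.g. the vring descriptor index) without
 * touching the packet data itself.
 */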

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)
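/*
 * E.g. POWEROF2(64) is true and POWEROF2(48) is false; note that 0 also
 * satisfies the test, so callers must reject 0 separately if needed.
 */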

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
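/*
 * With a typical 64-byte cache line and the 16-byte struct vring_desc this
 * evaluates to 4. RTE_CACHE_LINE_SIZE is the RTE_-prefixed form of the old
 * CACHE_LINE_SIZE macro (the rename this patch applies).
 */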

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Number of descriptors to use */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
        struct rte_mempool *pool;
        struct rte_ring *ring;
        uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
        PHYS_ADDR_CONTINUOUS = 0,
        PHYS_ADDR_CROSS_SUBREG = 1,
        PHYS_ADDR_INVALID = 2,
        PHYS_ADDR_LAST
} hpa_type;
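/*
 * PHYS_ADDR_CONTINUOUS: the whole buffer fits inside one host region;
 * PHYS_ADDR_CROSS_SUBREG: the buffer starts in a region but runs past its
 * end; PHYS_ADDR_INVALID: no region contains the address (see gpa_to_hpa()).
 */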

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";


/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
        .rx_thresh = {
                .pthresh = RX_PTHRESH,
                .hthresh = RX_HTHRESH,
                .wthresh = RX_WTHRESH,
        },
        .rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
        .tx_thresh = {
                .pthresh = TX_PTHRESH,
                .hthresh = TX_HTHRESH,
                .wthresh = TX_WTHRESH,
        },
        .tx_free_thresh = 0, /* Use PMD default values */
        .tx_rs_thresh = 0, /* Use PMD default values */
};

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * Required for 1G NICs such as the I350: fixes a bug where
                 * IPv4 forwarding in the guest could not forward packets
                 * from one virtio device to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * should be overridden separately in code with
                 * appropriate values
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified on the command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* VLAN header struct used to insert VLAN tags on TX. */
struct vlan_ethhdr {
        unsigned char   h_dest[ETH_ALEN];
        unsigned char   h_source[ETH_ALEN];
        __be16          h_vlan_proto;
        __be16          h_vlan_TCI;
        __be16          h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
        uint8_t  version_ihl;           /**< version and header length */
        uint8_t  type_of_service;       /**< type of service */
        uint16_t total_length;          /**< length of packet */
        uint16_t packet_id;             /**< packet ID */
        uint16_t fragment_offset;       /**< fragmentation offset */
        uint8_t  time_to_live;          /**< time to live */
        uint8_t  next_proto_id;         /**< protocol ID */
        uint16_t hdr_checksum;          /**< header checksum */
        uint32_t src_addr;              /**< source address */
        uint32_t dst_addr;              /**< destination address */
} __attribute__((__packed__));
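/*
 * version_ihl packs the 4-bit IP version in the high nibble and the header
 * length, in 32-bit words, in the low nibble (0x45 for a plain 20-byte header).
 */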

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
        uint64_t tx_total;
        rte_atomic64_t rx_total_atomic;
        uint64_t rx_total;
        uint64_t tx;
        rte_atomic64_t rx_atomic;
        uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        struct rte_eth_vmdq_rx_conf *def_conf =
                &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back = def_conf->enable_loop_back;
        conf.rx_mode = def_conf->rx_mode;

        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}
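/*
 * Resulting mapping sketch: pool i receives frames tagged vlan_tags[i] and
 * is backed by pool bitmask (1UL << i), e.g. device 0 <-> VLAN 1000.
 */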

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, log an error and return -1.
 * Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        uint16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
        rte_eth_dev_info_get(port, &dev_info);

        /* Configure the number of supported virtio devices based on VMDQ limits */
        num_devices = dev_info.max_vmdq_pools;
        num_queues = dev_info.max_rx_queues;

        if (zero_copy) {
                rx_ring_size = num_rx_descriptor;
                tx_ring_size = num_tx_descriptor;
                tx_rings = dev_info.max_tx_queues;
        } else {
                rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
                tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
                tx_rings = (uint16_t)rte_lcore_count();
        }

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;

        if (port >= rte_eth_dev_count())
                return -1;

        rx_rings = (uint16_t)num_queues;
        /* Configure ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0)
                return retval;

        /* Setup the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                rte_eth_dev_socket_id(port), &rx_conf_default,
                                vpool_array[q].pool);
                if (retval < 0)
                        return retval;
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                rte_eth_dev_socket_id(port), &tx_conf_default);
                if (retval < 0)
                        return retval;
        }

        /* Start the device. */
        retval = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
                return retval;
        }

        if (promiscuous)
                rte_eth_promiscuous_enable(port);

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        (unsigned)port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
        /* Reject names that do not fit (including the terminating NUL). */
        if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
                return -1;
        else
                snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

        return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (pm == 0)
                return -1;

        return pm;
}
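/*
 * E.g. parse_portmask("f") returns 0xf (ports 0-3); "0" and a non-hex
 * string such as "zz" both return -1.
 */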

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --dev-basename <name>\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if rx retries are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if rx retries are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --dev-basename: The basename to be used for the character device.\n"
        "               --zero-copy [0|1]: disable(default)/enable rx/tx "
                        "zero copy\n"
        "               --rx-desc-num [0-N]: the number of descriptors on rx, "
                        "used only when zero copy is enabled.\n"
        "               --tx-desc-num [0-N]: the number of descriptors on tx, "
                        "used only when zero copy is enabled.\n",
               prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"dev-basename", required_argument, NULL, 0},
                {"zero-copy", required_argument, NULL, 0},
                {"rx-desc-num", required_argument, NULL, 0},
                {"tx-desc-num", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:P",
                        long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        enabled_port_mask = parse_portmask(optarg);
                        if (enabled_port_mask == 0) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        break;

                case 'P':
                        promiscuous = 1;
                        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
                                ETH_VMDQ_ACCEPT_BROADCAST |
                                ETH_VMDQ_ACCEPT_MULTICAST;
                        rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Specify the retry delay time (in microseconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        mergeable = !!ret;
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                        }
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set character device basename. */
                        if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_basename(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        /* Enable/disable rx/tx zero copy. */
                        if (!strncmp(long_option[option_index].name,
                                "zero-copy", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument"
                                                " for zero-copy [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        zero_copy = ret;

                                if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
                                        RTE_LOG(ERR, VHOST_CONFIG, "Before running "
                                        "the zero copy vhost app, please "
                                        "disable RTE_MBUF_REFCNT\n"
                                        "in the config file and then rebuild the "
                                        "DPDK core lib!\n"
                                        "Otherwise please disable the zero copy "
                                        "flag in the command line!\n");
                                        return -1;
#endif
                                }
                        }

                        /* Specify the descriptor number on RX. */
                        if (!strncmp(long_option[option_index].name,
                                "rx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for rx-desc-num [0-N], "
                                        "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_rx_descriptor = ret;
                                }
                        }

                        /* Specify the descriptor number on TX. */
                        if (!strncmp(long_option[option_index].name,
                                "tx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for tx-desc-num [0-N], "
                                        "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_tx_descriptor = ret;
                                }
                        }

                        break;

                        /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = (uint8_t)i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support software vm2vm, "
                        "please specify 'vm2vm 2' to use hardware vm2vm.\n");
                return -1;
        }

        if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support jumbo frame, "
                        "please specify '--mergeable 0' to disable the "
                        "mergeable feature.\n");
                return -1;
        }

        return 0;
}

/*
 * Update the global vars num_ports and ports according to the number of
 * system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
        char *pkt_addr = (char *)(addr); \
        unsigned int index; \
        char packet[MAX_PRINT_BUFF]; \
\
        if ((header)) \
                snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
        else \
                snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
        for (index = 0; index < (size); index++) { \
                snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
                        "%02hhx ", pkt_addr[index]); \
        } \
        snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
\
        LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
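/*
 * Typical use (sketch): PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0)
 * hex-dumps a descriptor's payload when compiled with -DDEBUG.
 */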

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
        uint32_t buf_len, hpa_type *addr_type)
{
        struct virtio_memory_regions_hpa *region;
        uint32_t regionidx;
        uint64_t vhost_pa = 0;

        *addr_type = PHYS_ADDR_INVALID;

        for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
                region = &vdev->regions_hpa[regionidx];
                if ((guest_pa >= region->guest_phys_address) &&
                        (guest_pa <= region->guest_phys_address_end)) {
                        vhost_pa = region->host_phys_addr_offset + guest_pa;
                        if (likely((guest_pa + buf_len - 1)
                                <= region->guest_phys_address_end))
                                *addr_type = PHYS_ADDR_CONTINUOUS;
                        else
                                *addr_type = PHYS_ADDR_CROSS_SUBREG;
                        break;
                }
        }

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
                vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
                (void *)(uintptr_t)vhost_pa);

        return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
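/*
 * Implementation note: each side is read with a 64-bit load and MAC_ADDR_CMP
 * masks the result down to the low 48 bits, which hold the 6 MAC bytes on
 * little-endian targets; the 2 bytes read past each ether_addr are ignored
 * (they must merely be readable).
 */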
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
        return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct virtio_net_data_ll *dev_ll;
        struct virtio_net *dev = vdev->dev;
        int i, ret;

        /* Learn MAC address of guest device from packet */
        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
                        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
                        return -1;
                }
                dev_ll = dev_ll->next;
        }

        for (i = 0; i < ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* The vlan_tag is currently indexed by the device_fh. */
        vdev->vlan_tag = vlan_tags[dev->device_fh];

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
                dev->device_fh,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
                                        dev->device_fh);

        /* Enable stripping of the vlan tag as we handle routing. */
        rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* Clear MAC and VLAN settings */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < 6; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers */
                rx_count = rte_eth_rx_burst(ports[0],
                                (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct virtio_net_data_ll *dev_ll;
        struct ether_hdr *pkt_hdr;
        uint64_t ret = 0;
        struct virtio_net *dev = vdev->dev;
        struct virtio_net *tdev; /* destination virtio device */

        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        /* Get the used devices list */
        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
                                          &dev_ll->vdev->mac_address)) {

                        /* Drop the packet if the TX packet is destined for the TX device. */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
                                                        dev->device_fh);
                                return 0;
                        }
                        tdev = dev_ll->vdev->dev;

                        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

                        if (unlikely(dev_ll->vdev->remove)) {
                                /* Drop the packet if the device is marked for removal */
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
                        } else {
                                /* Send the packet to the local virtio device */
                                ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
                                if (enable_stats) {
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_total_atomic,
                                        1);
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_atomic,
                                        ret);
                                        dev_statistics[tdev->device_fh].tx_total++;
                                        dev_statistics[tdev->device_fh].tx += ret;
                                }
                        }

                        return 0;
                }
                dev_ll = dev_ll->next;
        }

        return -1;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
        uint32_t *offset, uint16_t *vlan_tag)
{
        struct virtio_net_data_ll *dev_ll = ll_root_used;
        struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX)
                        && ether_addr_cmp(&(pkt_hdr->d_addr),
                                &dev_ll->vdev->mac_address)) {
                        /*
                         * Drop the packet if the TX packet is
                         * destined for the TX device.
                         */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                LOG_DEBUG(VHOST_DATA,
                                "(%"PRIu64") TX: Source and destination"
                                " MAC addresses are the same. Dropping "
                                "packet.\n",
                                dev_ll->vdev->dev->device_fh);
                                return -1;
                        }

                        /*
                         * HW vlan strip will reduce the packet length
                         * by the length of the vlan tag, so the packet
                         * length must be restored by adding it back.
                         */
                        *offset = VLAN_HLEN;
                        *vlan_tag =
                        (uint16_t)
                        vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

                        LOG_DEBUG(VHOST_DATA,
                        "(%"PRIu64") TX: pkt to local VM device id:"
                        "(%"PRIu64") vlan tag: %d.\n",
                        dev->device_fh, dev_ll->vdev->dev->device_fh,
                        *vlan_tag);

                        break;
                }
                dev_ll = dev_ll->next;
        }
        return 0;
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
        struct mbuf_table *tx_q;
        struct rte_mbuf **m_table;
        unsigned len, ret, offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct virtio_net *dev = vdev->dev;

        /* Check if destination is a local VM */
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
                rte_pktmbuf_free(m);
                return;
        }

        if (vm2vm_mode == VM2VM_HARDWARE) {
                if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
                        offset > rte_pktmbuf_tailroom(m)) {
                        rte_pktmbuf_free(m);
                        return;
                }
        }

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

        /* Add packet to the port tx queue */
        tx_q = &lcore_tx_queue[lcore_id];
        len = tx_q->len;

        m->ol_flags = PKT_TX_VLAN_PKT;

        m->data_len += offset;
        m->pkt_len += offset;

        m->vlan_tci = vlan_tag;

        tx_q->m_table[len] = m;
        len++;
        if (enable_stats) {
                dev_statistics[dev->device_fh].tx_total++;
                dev_statistics[dev->device_fh].tx++;
        }

        if (unlikely(len == MAX_PKT_BURST)) {
                m_table = (struct rte_mbuf **)tx_q->m_table;
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
                /* Free any buffers not handled by TX and update the port stats. */
                if (unlikely(ret < len)) {
                        do {
                                rte_pktmbuf_free(m_table[ret]);
                        } while (++ret < len);
                }

                len = 0;
        }

        tx_q->len = len;
        return;
}

/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(void *arg)
{
        struct rte_mempool *mbuf_pool = arg;
        struct virtio_net *dev = NULL;
        struct vhost_dev *vdev = NULL;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
        struct virtio_net_data_ll *dev_ll;
        struct mbuf_table *tx_q;
        volatile struct lcore_ll_info *lcore_ll;
        const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
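        /*
         * drain_tsc is BURST_TX_DRAIN_US converted to TSC ticks, rounded up:
         * e.g. with a (hypothetical) 2.4 GHz TSC this is 240000 cycles, so
         * the TX queue is flushed at least every ~100 us under light load.
         */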
        uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
        unsigned ret, i;
        const uint16_t lcore_id = rte_lcore_id();
        const uint16_t num_cores = (uint16_t)rte_lcore_count();
        uint16_t rx_count = 0;
        uint16_t tx_count;
        uint32_t retry = 0;

        RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
        lcore_ll = lcore_info[lcore_id].lcore_ll;
        prev_tsc = 0;

        tx_q = &lcore_tx_queue[lcore_id];
        for (i = 0; i < num_cores; i++) {
                if (lcore_ids[i] == lcore_id) {
                        tx_q->txq_id = i;
                        break;
                }
        }
        while (1) {
                cur_tsc = rte_rdtsc();
                /*
                 * TX burst queue drain
                 */
                diff_tsc = cur_tsc - prev_tsc;
                if (unlikely(diff_tsc > drain_tsc)) {

                        if (tx_q->len) {
                                LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

                                /* Tx any packets in the queue */
                                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                                                (struct rte_mbuf **)tx_q->m_table,
                                                (uint16_t)tx_q->len);
                                if (unlikely(ret < tx_q->len)) {
                                        do {
                                                rte_pktmbuf_free(tx_q->m_table[ret]);
                                        } while (++ret < tx_q->len);
                                }

                                tx_q->len = 0;
                        }

                        prev_tsc = cur_tsc;

                }

                rte_prefetch0(lcore_ll->ll_root_used);
                /*
                 * Inform the configuration core that we have exited the linked list and that no devices are
                 * in use if requested.
                 */
                if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
                        lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

                /*
                 * Process devices
                 */
                dev_ll = lcore_ll->ll_root_used;

                while (dev_ll != NULL) {
                        /* Get the vhost and virtio devices */
                        vdev = dev_ll->vdev;
                        dev = vdev->dev;

                        if (unlikely(vdev->remove)) {
                                dev_ll = dev_ll->next;
                                unlink_vmdq(vdev);
                                vdev->ready = DEVICE_SAFE_REMOVE;
                                continue;
                        }
1262                         if (likely(vdev->ready == DEVICE_RX)) {
1263                                 /*Handle guest RX*/
1264                                 rx_count = rte_eth_rx_burst(ports[0],
1265                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1266
1267                                 if (rx_count) {
1268                                         /*
1269                                         * If retry is enabled and the queue is full, wait and retry to avoid packet loss.
1270                                         * Note that MAX_PKT_BURST must be less than the virtio queue size.
1271                                         */
1272                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1273                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1274                                                         rte_delay_us(burst_rx_delay_time);
1275                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1276                                                                 break;
1277                                                 }
1278                                         }
1279                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1280                                         if (enable_stats) {
1281                                                 rte_atomic64_add(
1282                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1283                                                 rx_count);
1284                                                 rte_atomic64_add(
1285                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1286                                         }
1287                                         while (likely(rx_count)) {
1288                                                 rx_count--;
1289                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1290                                         }
1291
1292                                 }
1293                         }
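                        /*
                         * Note that this non-zero-copy RX path copies frames:
                         * rte_vhost_enqueue_burst() copies each mbuf into the
                         * guest's RX buffers, which is why the host mbufs can
                         * be freed immediately above.
                         */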
1294
1295                         if (likely(!vdev->remove)) {
1296                                 /* Handle guest TX*/
1297                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1298                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1299                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1300                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1301                                                 while (tx_count--)
1302                                                         rte_pktmbuf_free(pkts_burst[tx_count]);
1303                                         }
1304                                 }
1305                                 while (tx_count)
1306                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1307                         }
1308
1309                         /*move to the next device in the list*/
1310                         dev_ll = dev_ll->next;
1311                 }
1312         }
1313
1314         return 0;
1315 }
1316
1317 /*
1318  * This function gets the number of available ring entries for zero copy RX.
1319  * Only one thread will call this function for a particular virtio device,
1320  * so it is designed as a non-thread-safe function.
1321  */
1322 static inline uint32_t __attribute__((always_inline))
1323 get_available_ring_num_zcp(struct virtio_net *dev)
1324 {
1325         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1326         uint16_t avail_idx;
1327
1328         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1329         return (uint32_t)(uint16_t)(avail_idx - vq->last_used_idx_res);
1330 }
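/*
 * The avail index and the reserved index are free-running uint16_t counters,
 * so the subtraction above must be taken modulo 2^16: for example, with
 * avail_idx == 3 after a wrap and last_used_idx_res == 65533 there are
 * (uint16_t)(3 - 65533) == 6 available entries; the intermediate uint16_t
 * cast keeps exactly that wrapped result.
 */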
1331
1332 /*
1333  * This function reserves available ring entries for zero copy RX,
1334  * retrying up to 'burst_rx_retry_num' times until it gets enough of them.
1335  * Only one thread will call this function for a particular virtio device,
1336  * so it is designed as a non-thread-safe function.
1337  */
1338 static inline uint32_t __attribute__((always_inline))
1339 get_available_ring_index_zcp(struct virtio_net *dev,
1340         uint16_t *res_base_idx, uint32_t count)
1341 {
1342         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1343         uint16_t avail_idx;
1344         uint32_t retry = 0;
1345         uint16_t free_entries;
1346
1347         *res_base_idx = vq->last_used_idx_res;
1348         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1349         free_entries = (avail_idx - *res_base_idx);
1350
1351         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1352                         "avail idx: %d, "
1353                         "res base idx:%d, free entries:%d\n",
1354                         dev->device_fh, avail_idx, *res_base_idx,
1355                         free_entries);
1356
1357         /*
1358          * If retry is enabled and the queue is full then we wait
1359          * and retry to avoid packet loss.
1360          */
1361         if (enable_retry && unlikely(count > free_entries)) {
1362                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1363                         rte_delay_us(burst_rx_delay_time);
1364                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1365                         free_entries = (avail_idx - *res_base_idx);
1366                         if (count <= free_entries)
1367                                 break;
1368                 }
1369         }
1370
1371         /*check that we have enough buffers*/
1372         if (unlikely(count > free_entries))
1373                 count = free_entries;
1374
1375         if (unlikely(count == 0)) {
1376                 LOG_DEBUG(VHOST_DATA,
1377                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1378                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1379                         dev->device_fh, avail_idx,
1380                         *res_base_idx, free_entries);
1381                 return 0;
1382         }
1383
1384         vq->last_used_idx_res = *res_base_idx + count;
1385
1386         return count;
1387 }
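/*
 * Advancing last_used_idx_res reserves ring slots [*res_base_idx,
 * *res_base_idx + count) for the caller. Since only one thread services a
 * given device (see the comment above), plain stores are sufficient and no
 * atomic reservation is needed.
 */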
1388
1389 /*
1390  * This function puts a descriptor back on the used ring.
1391  */
1392 static inline void __attribute__((always_inline))
1393 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1394 {
1395         uint16_t res_cur_idx = vq->last_used_idx;
1396         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1397         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1398         rte_compiler_barrier();
1399         *(volatile uint16_t *)&vq->used->idx += 1;
1400         vq->last_used_idx += 1;
1401
1402         /* Kick the guest if necessary. */
1403         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1404                 eventfd_write((int)vq->kickfd, 1);
1405 }
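/*
 * Note the publish order above: the used-ring entry is written first, the
 * compiler barrier then prevents reordering, and only afterwards is
 * used->idx incremented, so the guest can never observe an index covering an
 * uninitialised entry.
 */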
1406
1407 /*
1408  * This function gets an available descriptor from the virtio vring and an
1409  * unattached mbuf from vpool->ring, and attaches them together. It must adjust
1410  * the offsets of buff_addr and phys_addr according to the PMD implementation,
1411  * otherwise the frame data may be placed at the wrong location in the mbuf.
1412  */
1413 static inline void __attribute__((always_inline))
1414 attach_rxmbuf_zcp(struct virtio_net *dev)
1415 {
1416         uint16_t res_base_idx, desc_idx;
1417         uint64_t buff_addr, phys_addr;
1418         struct vhost_virtqueue *vq;
1419         struct vring_desc *desc;
1420         struct rte_mbuf *mbuf = NULL;
1421         struct vpool *vpool;
1422         hpa_type addr_type;
1423         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1424
1425         vpool = &vpool_array[vdev->vmdq_rx_q];
1426         vq = dev->virtqueue[VIRTIO_RXQ];
1427
1428         do {
1429                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1430                                 1) != 1))
1431                         return;
1432                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1433
1434                 desc = &vq->desc[desc_idx];
1435                 if (desc->flags & VRING_DESC_F_NEXT) {
1436                         desc = &vq->desc[desc->next];
1437                         buff_addr = gpa_to_vva(dev, desc->addr);
1438                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1439                                         &addr_type);
1440                 } else {
1441                         buff_addr = gpa_to_vva(dev,
1442                                         desc->addr + vq->vhost_hlen);
1443                         phys_addr = gpa_to_hpa(vdev,
1444                                         desc->addr + vq->vhost_hlen,
1445                                         desc->len, &addr_type);
1446                 }
1447
1448                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1449                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1450                                 " address found when attaching RX frame buffer"
1451                                 " address!\n", dev->device_fh);
1452                         put_desc_to_used_list_zcp(vq, desc_idx);
1453                         continue;
1454                 }
1455
1456                 /*
1457                  * Check if the frame buffer address from guest crosses
1458                  * sub-region or not.
1459                  */
1460                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1461                         RTE_LOG(ERR, VHOST_DATA,
1462                                 "(%"PRIu64") Frame buffer address cross "
1463                                 "sub-region found when attaching RX frame "
1464                                 "buffer address!\n",
1465                                 dev->device_fh);
1466                         put_desc_to_used_list_zcp(vq, desc_idx);
1467                         continue;
1468                 }
1469         } while (unlikely(phys_addr == 0));
1470
1471         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1472         if (unlikely(mbuf == NULL)) {
1473                 LOG_DEBUG(VHOST_DATA,
1474                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1475                         "ring_sc_dequeue fail.\n",
1476                         dev->device_fh);
1477                 put_desc_to_used_list_zcp(vq, desc_idx);
1478                 return;
1479         }
1480
1481         if (unlikely(vpool->buf_size > desc->len)) {
1482                 LOG_DEBUG(VHOST_DATA,
1483                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1484                         "length(%d) of descriptor idx: %d less than room "
1485                         "size required: %d\n",
1486                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1487                 put_desc_to_used_list_zcp(vq, desc_idx);
1488                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1489                 return;
1490         }
1491
1492         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1493         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1494         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1495         mbuf->data_len = desc->len;
1496         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1497
1498         LOG_DEBUG(VHOST_DATA,
1499                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1500                 "descriptor idx:%d\n",
1501                 dev->device_fh, res_base_idx, desc_idx);
1502
1503         __rte_mbuf_raw_free(mbuf);
1504
1505         return;
1506 }
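/*
 * The raw free above is the heart of the zero copy RX setup: the mbuf, which
 * now aliases the guest descriptor's buffer, goes back into vpool->pool, and
 * since that mempool backs the PMD RX queue for this device (as the per-queue
 * vpool_array indexing suggests), the NIC will DMA the next frame directly
 * into guest memory.
 */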
1507
1508 /*
1509  * Detach an attached packet mbuf -
1510  *  - restore original mbuf address and length values.
1511  *  - reset pktmbuf data and data_len to their default values.
1512  *  All other fields of the given packet mbuf will be left intact.
1513  *
1514  * @param m
1515  *   The attached packet mbuf.
1516  */
1517 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1518 {
1519         const struct rte_mempool *mp = m->pool;
1520         void *buf = RTE_MBUF_TO_BADDR(m);
1521         uint32_t buf_ofs;
1522         uint32_t buf_len = mp->elt_size - sizeof(*m);
1523         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1524
1525         m->buf_addr = buf;
1526         m->buf_len = (uint16_t)buf_len;
1527
1528         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1529                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1530         m->data_off = buf_ofs;
1531
1532         m->data_len = 0;
1533 }
1534
1535 /*
1536  * This function is called after packets have been transmitted. It fetches mbufs
1537  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1538  * the used index and kicks the guest if necessary.
1539  */
1540 static inline uint32_t __attribute__((always_inline))
1541 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1542 {
1543         struct rte_mbuf *mbuf;
1544         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1545         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1546         uint32_t index = 0;
1547         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1548
1549         LOG_DEBUG(VHOST_DATA,
1550                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1551                 "clean is: %d\n",
1552                 dev->device_fh, mbuf_count);
1553         LOG_DEBUG(VHOST_DATA,
1554                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1555                 "clean is: %d\n",
1556                 dev->device_fh, rte_ring_count(vpool->ring));
1557
1558         for (index = 0; index < mbuf_count; index++) {
1559                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1560                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1561                         pktmbuf_detach_zcp(mbuf);
1562                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1563
1564                 /* Update used index buffer information. */
1565                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1566                 vq->used->ring[used_idx].len = 0;
1567
1568                 used_idx = (used_idx + 1) & (vq->size - 1);
1569         }
1570
1571         LOG_DEBUG(VHOST_DATA,
1572                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1573                 "clean is: %d\n",
1574                 dev->device_fh, rte_mempool_count(vpool->pool));
1575         LOG_DEBUG(VHOST_DATA,
1576                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1577                 "clean is: %d\n",
1578                 dev->device_fh, rte_ring_count(vpool->ring));
1579         LOG_DEBUG(VHOST_DATA,
1580                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1581                 "vq->last_used_idx:%d\n",
1582                 dev->device_fh, vq->last_used_idx);
1583
1584         vq->last_used_idx += mbuf_count;
1585
1586         LOG_DEBUG(VHOST_DATA,
1587                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1588                 "vq->last_used_idx:%d\n",
1589                 dev->device_fh, vq->last_used_idx);
1590
1591         rte_compiler_barrier();
1592
1593         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1594
1595         /* Kick guest if required. */
1596         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1597                 eventfd_write((int)vq->kickfd, 1);
1598
1599         return 0;
1600 }
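/*
 * Each mbuf drained above still carries its guest descriptor index in its
 * headroom (stored via MBUF_HEADROOM_UINT32() on the TX path), which is how
 * the used-ring entries hand the descriptors back to the guest while the
 * mbufs themselves are recycled into vpool->ring.
 */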
1601
1602 /*
1603  * This function is called when a virtio device is destroyed.
1604  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1605  */
1606 static void mbuf_destroy_zcp(struct vpool *vpool)
1607 {
1608         struct rte_mbuf *mbuf = NULL;
1609         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1610
1611         LOG_DEBUG(VHOST_CONFIG,
1612                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1613                 "mbuf_destroy_zcp is: %d\n",
1614                 mbuf_count);
1615         LOG_DEBUG(VHOST_CONFIG,
1616                 "in mbuf_destroy_zcp: mbuf count in ring before "
1617                 "mbuf_destroy_zcp is: %d\n",
1618                 rte_ring_count(vpool->ring));
1619
1620         for (index = 0; index < mbuf_count; index++) {
1621                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1622                 if (likely(mbuf != NULL)) {
1623                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1624                                 pktmbuf_detach_zcp(mbuf);
1625                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1626                 }
1627         }
1628
1629         LOG_DEBUG(VHOST_CONFIG,
1630                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1631                 "mbuf_destroy_zcp is: %d\n",
1632                 rte_mempool_count(vpool->pool));
1633         LOG_DEBUG(VHOST_CONFIG,
1634                 "in mbuf_destroy_zcp: mbuf count in ring after "
1635                 "mbuf_destroy_zcp is : %d\n",
1636                 rte_ring_count(vpool->ring));
1637 }
1638
1639 /*
1640  * This function writes the virtio headers and updates the used ring for zero copy RX.
1641  */
1642 static inline uint32_t __attribute__((always_inline))
1643 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1644         uint32_t count)
1645 {
1646         struct vhost_virtqueue *vq;
1647         struct vring_desc *desc;
1648         struct rte_mbuf *buff;
1649         /* The virtio_hdr is initialised to 0. */
1650         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1651                 = {{0, 0, 0, 0, 0, 0}, 0};
1652         uint64_t buff_hdr_addr = 0;
1653         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1654         uint32_t head_idx, packet_success = 0;
1655         uint16_t res_cur_idx;
1656
1657         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1658
1659         if (count == 0)
1660                 return 0;
1661
1662         vq = dev->virtqueue[VIRTIO_RXQ];
1663         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1664
1665         res_cur_idx = vq->last_used_idx;
1666         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1667                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1668
1669         /* Retrieve all of the head indexes first to avoid caching issues. */
1670         for (head_idx = 0; head_idx < count; head_idx++)
1671                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1672
1673         /*Prefetch descriptor index. */
1674         rte_prefetch0(&vq->desc[head[packet_success]]);
1675
1676         while (packet_success != count) {
1677                 /* Get descriptor from available ring */
1678                 desc = &vq->desc[head[packet_success]];
1679
1680                 buff = pkts[packet_success];
1681                 LOG_DEBUG(VHOST_DATA,
1682                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1683                         "pkt[%d] descriptor idx: %d\n",
1684                         dev->device_fh, packet_success,
1685                         MBUF_HEADROOM_UINT32(buff));
1686
1687                 PRINT_PACKET(dev,
1688                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1689                         + RTE_PKTMBUF_HEADROOM),
1690                         rte_pktmbuf_data_len(buff), 0);
1691
1692                 /* Buffer address translation for virtio header. */
1693                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1694                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1695
1696                 /*
1697                  * If the descriptors are chained the header and data are
1698                  * placed in separate buffers.
1699                  */
1700                 if (desc->flags & VRING_DESC_F_NEXT) {
1701                         desc->len = vq->vhost_hlen;
1702                         desc = &vq->desc[desc->next];
1703                         desc->len = rte_pktmbuf_data_len(buff);
1704                 } else {
1705                         desc->len = packet_len;
1706                 }
1707
1708                 /* Update used ring with desc information */
1709                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1710                         = head[packet_success];
1711                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1712                         = packet_len;
1713                 res_cur_idx++;
1714                 packet_success++;
1715
1716                 /* A header is required per buffer. */
1717                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1718                         (const void *)&virtio_hdr, vq->vhost_hlen);
1719
1720                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1721
1722                 if (likely(packet_success < count)) {
1723                         /* Prefetch descriptor index. */
1724                         rte_prefetch0(&vq->desc[head[packet_success]]);
1725                 }
1726         }
1727
1728         rte_compiler_barrier();
1729
1730         LOG_DEBUG(VHOST_DATA,
1731                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1732                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1733                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1734
1735         *(volatile uint16_t *)&vq->used->idx += count;
1736         vq->last_used_idx += count;
1737
1738         LOG_DEBUG(VHOST_DATA,
1739                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1740                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1741                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1742
1743         /* Kick the guest if necessary. */
1744         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1745                 eventfd_write((int)vq->kickfd, 1);
1746
1747         return count;
1748 }
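/*
 * Only the virtio-net header is copied to the guest here (vq->vhost_hlen
 * bytes per packet); the payload already sits in the guest buffer that
 * attach_rxmbuf_zcp() attached to the mbuf, so no frame data is copied on
 * this path.
 */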
1749
1750 /*
1751  * This function routes the TX packet to the correct interface.
1752  * This may be a local device or the physical port.
1753  */
1754 static inline void __attribute__((always_inline))
1755 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1756         uint32_t desc_idx, uint8_t need_copy)
1757 {
1758         struct mbuf_table *tx_q;
1759         struct rte_mbuf **m_table;
1760         struct rte_mbuf *mbuf = NULL;
1761         unsigned len, ret, offset = 0;
1762         struct vpool *vpool;
1763         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1764         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1765
1766         /*Add packet to the port tx queue*/
1767         tx_q = &tx_queue_zcp[vmdq_rx_q];
1768         len = tx_q->len;
1769
1770         /* Allocate an mbuf and populate the structure. */
1771         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1772         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1773         if (unlikely(mbuf == NULL)) {
1774                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1775                 RTE_LOG(ERR, VHOST_DATA,
1776                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1777                         dev->device_fh);
1778                 put_desc_to_used_list_zcp(vq, desc_idx);
1779                 return;
1780         }
1781
1782         if (vm2vm_mode == VM2VM_HARDWARE) {
1783                 /* Avoid using a VLAN tag from any VM for an external packet,
1784                  * such as vlan_tags[dev->device_fh]; otherwise it conflicts
1785                  * with pool selection: the MAC address marks it as an external
1786                  * packet that should go out to the network, while the VLAN tag
1787                  * marks it as a VM2VM packet to be forwarded to another VM. The
1788                  * hardware cannot resolve this ambiguity, so the packet is lost.
1789                  */
1790                 vlan_tag = external_pkt_default_vlan_tag;
1791                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1792                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1793                         __rte_mbuf_raw_free(mbuf);
1794                         return;
1795                 }
1796         }
1797
1798         mbuf->nb_segs = m->nb_segs;
1799         mbuf->next = m->next;
1800         mbuf->data_len = m->data_len + offset;
1801         mbuf->pkt_len = mbuf->data_len;
1802         if (unlikely(need_copy)) {
1803                 /* Copy the packet contents to the mbuf. */
1804                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1805                         rte_pktmbuf_mtod(m, void *),
1806                         m->data_len);
1807         } else {
1808                 mbuf->data_off = m->data_off;
1809                 mbuf->buf_physaddr = m->buf_physaddr;
1810                 mbuf->buf_addr = m->buf_addr;
1811         }
1812         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1813         mbuf->vlan_tci = vlan_tag;
1814         mbuf->l2_len = sizeof(struct ether_hdr);
1815         mbuf->l3_len = sizeof(struct ipv4_hdr);
1816         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1817
1818         tx_q->m_table[len] = mbuf;
1819         len++;
1820
1821         LOG_DEBUG(VHOST_DATA,
1822                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1823                 dev->device_fh,
1824                 mbuf->nb_segs,
1825                 (mbuf->next == NULL) ? "null" : "non-null");
1826
1827         if (enable_stats) {
1828                 dev_statistics[dev->device_fh].tx_total++;
1829                 dev_statistics[dev->device_fh].tx++;
1830         }
1831
1832         if (unlikely(len == MAX_PKT_BURST)) {
1833                 m_table = (struct rte_mbuf **)tx_q->m_table;
1834                 ret = rte_eth_tx_burst(ports[0],
1835                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1836
1837                 /*
1838                  * Free any buffers not handled by TX and update
1839                  * the port stats.
1840                  */
1841                 if (unlikely(ret < len)) {
1842                         do {
1843                                 rte_pktmbuf_free(m_table[ret]);
1844                         } while (++ret < len);
1845                 }
1846
1847                 len = 0;
1848                 txmbuf_clean_zcp(dev, vpool);
1849         }
1850
1851         tx_q->len = len;
1852
1853         return;
1854 }
1855
1856 /*
1857  * This function transmits all available packets in the virtio TX queue for one
1858  * virtio-net device. If this is the first packet, it learns the MAC address
1859  * and sets up VMDQ.
1860  */
1861 static inline void __attribute__((always_inline))
1862 virtio_dev_tx_zcp(struct virtio_net *dev)
1863 {
1864         struct rte_mbuf m;
1865         struct vhost_virtqueue *vq;
1866         struct vring_desc *desc;
1867         uint64_t buff_addr = 0, phys_addr;
1868         uint32_t head[MAX_PKT_BURST];
1869         uint32_t i;
1870         uint16_t free_entries, packet_success = 0;
1871         uint16_t avail_idx;
1872         uint8_t need_copy = 0;
1873         hpa_type addr_type;
1874         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1875
1876         vq = dev->virtqueue[VIRTIO_TXQ];
1877         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1878
1879         /* If there are no available buffers then return. */
1880         if (vq->last_used_idx_res == avail_idx)
1881                 return;
1882
1883         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1884
1885         /* Prefetch available ring to retrieve head indexes. */
1886         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1887
1888         /* Get the number of free entries in the ring */
1889         free_entries = (avail_idx - vq->last_used_idx_res);
1890
1891         /* Limit to MAX_PKT_BURST. */
1892         free_entries
1893                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1894
1895         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1896                 dev->device_fh, free_entries);
1897
1898         /* Retrieve all of the head indexes first to avoid caching issues. */
1899         for (i = 0; i < free_entries; i++)
1900                 head[i]
1901                         = vq->avail->ring[(vq->last_used_idx_res + i)
1902                         & (vq->size - 1)];
1903
1904         vq->last_used_idx_res += free_entries;
1905
1906         /* Prefetch descriptor index. */
1907         rte_prefetch0(&vq->desc[head[packet_success]]);
1908         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1909
1910         while (packet_success < free_entries) {
1911                 desc = &vq->desc[head[packet_success]];
1912
1913                 /* Discard first buffer as it is the virtio header */
1914                 desc = &vq->desc[desc->next];
1915
1916                 /* Buffer address translation. */
1917                 buff_addr = gpa_to_vva(dev, desc->addr);
1918                 /* Need check extra VLAN_HLEN size for inserting VLAN tag */
1919                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1920                         &addr_type);
1921
1922                 if (likely(packet_success < (free_entries - 1)))
1923                         /* Prefetch descriptor index. */
1924                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1925
1926                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1927                         RTE_LOG(ERR, VHOST_DATA,
1928                                 "(%"PRIu64") Invalid frame buffer address found "
1929                                 "when transmitting packets!\n",
1930                                 dev->device_fh);
1931                         packet_success++;
1932                         continue;
1933                 }
1934
1935                 /* Prefetch buffer address. */
1936                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1937
1938                 /*
1939                  * Setup dummy mbuf. This is copied to a real mbuf if
1940                  * transmitted out the physical port.
1941                  */
1942                 m.data_len = desc->len;
1943                 m.nb_segs = 1;
1944                 m.next = NULL;
1945                 m.data_off = 0;
1946                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1947                 m.buf_physaddr = phys_addr;
1948
1949                 /*
1950                  * Check if the frame buffer address from guest crosses
1951                  * sub-region or not.
1952                  */
1953                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1954                         RTE_LOG(ERR, VHOST_DATA,
1955                                 "(%"PRIu64") Frame buffer address cross "
1956                                 "sub-region found when attaching TX frame "
1957                                 "buffer address!\n",
1958                                 dev->device_fh);
1959                         need_copy = 1;
1960                 } else
1961                         need_copy = 0;
1962
1963                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1964
1965                 /*
1966                  * If this is the first received packet we need to learn
1967                  * the MAC and setup VMDQ
1968                  */
1969                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1970                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1971                                 /*
1972                                  * Discard frame if device is scheduled for
1973                                  * removal or a duplicate MAC address is found.
1974                                  */
1975                                 packet_success += free_entries;
1976                                 vq->last_used_idx += packet_success;
1977                                 break;
1978                         }
1979                 }
1980
1981                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1982                 packet_success++;
1983         }
1984 }
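/*
 * The stack-allocated dummy mbuf above merely describes the guest buffer and
 * is never freed; virtio_tx_route_zcp() either aliases its buffer pointers
 * into a real mbuf (the zero copy case) or, when the buffer crosses a
 * physical sub-region, copies the data out (need_copy == 1).
 */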
1985
1986 /*
1987  * This function is called by each data core. It handles all RX/TX registered
1988  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1989  * addresses are compared with all devices in the main linked list.
1990  */
1991 static int
1992 switch_worker_zcp(__attribute__((unused)) void *arg)
1993 {
1994         struct virtio_net *dev = NULL;
1995         struct vhost_dev  *vdev = NULL;
1996         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1997         struct virtio_net_data_ll *dev_ll;
1998         struct mbuf_table *tx_q;
1999         volatile struct lcore_ll_info *lcore_ll;
2000         const uint64_t drain_tsc
2001                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2002                 * BURST_TX_DRAIN_US;
2003         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2004         unsigned ret;
2005         const uint16_t lcore_id = rte_lcore_id();
2006         uint16_t count_in_ring, rx_count = 0;
2007
2008         RTE_LOG(INFO, VHOST_DATA, "Processing on core %u started\n", lcore_id);
2009
2010         lcore_ll = lcore_info[lcore_id].lcore_ll;
2011         prev_tsc = 0;
2012
2013         while (1) {
2014                 cur_tsc = rte_rdtsc();
2015
2016                 /* TX burst queue drain */
2017                 diff_tsc = cur_tsc - prev_tsc;
2018                 if (unlikely(diff_tsc > drain_tsc)) {
2019                         /*
2020                          * Get mbufs from vpool.pool, detach them and
2021                          * put them back into vpool.ring.
2022                          */
2023                         dev_ll = lcore_ll->ll_root_used;
2024                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2025                                 /* Get virtio device ID */
2026                                 vdev = dev_ll->vdev;
2027                                 dev = vdev->dev;
2028
2029                                 if (likely(!vdev->remove)) {
2030                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2031                                         if (tx_q->len) {
2032                                                 LOG_DEBUG(VHOST_DATA,
2033                                                 "TX queue drained after timeout"
2034                                                 " with burst size %u\n",
2035                                                 tx_q->len);
2036
2037                                                 /*
2038                                                  * Tx any packets in the queue
2039                                                  */
2040                                                 ret = rte_eth_tx_burst(
2041                                                         ports[0],
2042                                                         (uint16_t)tx_q->txq_id,
2043                                                         (struct rte_mbuf **)
2044                                                         tx_q->m_table,
2045                                                         (uint16_t)tx_q->len);
2046                                                 if (unlikely(ret < tx_q->len)) {
2047                                                         do {
2048                                                                 rte_pktmbuf_free(
2049                                                                         tx_q->m_table[ret]);
2050                                                         } while (++ret < tx_q->len);
2051                                                 }
2052                                                 tx_q->len = 0;
2053
2054                                                 txmbuf_clean_zcp(dev,
2055                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2056                                         }
2057                                 }
2058                                 dev_ll = dev_ll->next;
2059                         }
2060                         prev_tsc = cur_tsc;
2061                 }
2062
2063                 rte_prefetch0(lcore_ll->ll_root_used);
2064
2065                 /*
2066                  * Inform the configuration core that we have exited the linked
2067                  * list and that no devices are in use if requested.
2068                  */
2069                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2070                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2071
2072                 /* Process devices */
2073                 dev_ll = lcore_ll->ll_root_used;
2074
2075                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2076                         vdev = dev_ll->vdev;
2077                         dev  = vdev->dev;
2078                         if (unlikely(vdev->remove)) {
2079                                 dev_ll = dev_ll->next;
2080                                 unlink_vmdq(vdev);
2081                                 vdev->ready = DEVICE_SAFE_REMOVE;
2082                                 continue;
2083                         }
2084
2085                         if (likely(vdev->ready == DEVICE_RX)) {
2086                                 uint32_t index = vdev->vmdq_rx_q;
2087                                 uint16_t i;
2088                                 count_in_ring =
2089                                         rte_ring_count(vpool_array[index].ring);
2090                                 uint16_t free_entries =
2091                                         (uint16_t)get_available_ring_num_zcp(dev);
2092
2093                                 /*
2094                                  * Attach all mbufs in vpool.ring and put back
2095                                  * into vpool.pool.
2096                                  */
2097                                 for (i = 0;
2098                                         i < RTE_MIN(free_entries,
2099                                         RTE_MIN(count_in_ring, MAX_PKT_BURST));
2100                                         i++)
2101                                         attach_rxmbuf_zcp(dev);
2102
2103                                 /* Handle guest RX */
2104                                 rx_count = rte_eth_rx_burst(ports[0],
2105                                         vdev->vmdq_rx_q, pkts_burst,
2106                                         MAX_PKT_BURST);
2107
2108                                 if (rx_count) {
2109                                         ret_count = virtio_dev_rx_zcp(dev,
2110                                                         pkts_burst, rx_count);
2111                                         if (enable_stats) {
2112                                                 dev_statistics[dev->device_fh].rx_total
2113                                                         += rx_count;
2114                                                 dev_statistics[dev->device_fh].rx
2115                                                         += ret_count;
2116                                         }
2117                                         while (likely(rx_count)) {
2118                                                 rx_count--;
2119                                                 pktmbuf_detach_zcp(
2120                                                         pkts_burst[rx_count]);
2121                                                 rte_ring_sp_enqueue(
2122                                                         vpool_array[index].ring,
2123                                                         (void *)pkts_burst[rx_count]);
2124                                         }
2125                                 }
2126                         }
2127
2128                         if (likely(!vdev->remove))
2129                                 /* Handle guest TX */
2130                                 virtio_dev_tx_zcp(dev);
2131
2132                         /* Move to the next device in the list */
2133                         dev_ll = dev_ll->next;
2134                 }
2135         }
2136
2137         return 0;
2138 }
2139
2140
2141 /*
2142  * Add an entry to a used linked list. A free entry must first be found
2143  * in the free linked list using get_data_ll_free_entry();
2144  */
2145 static void
2146 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2147         struct virtio_net_data_ll *ll_dev)
2148 {
2149         struct virtio_net_data_ll *ll = *ll_root_addr;
2150
2151         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2152         ll_dev->next = NULL;
2153         rte_compiler_barrier();
2154
2155         /* If ll == NULL then this is the first device. */
2156         if (ll) {
2157                 /* Increment to the tail of the linked list. */
2158                 while (ll->next != NULL)
2159                         ll = ll->next;
2160
2161                 ll->next = ll_dev;
2162         } else {
2163                 *ll_root_addr = ll_dev;
2164         }
2165 }
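/*
 * The barrier above makes this append safe against concurrent readers: a
 * data core walking the list either stops at the old tail or sees the new
 * entry with next already set to NULL, never a partially initialised node.
 */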
2166
2167 /*
2168  * Remove an entry from a used linked list. The entry must then be added to
2169  * the free linked list using put_data_ll_free_entry().
2170  */
2171 static void
2172 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2173         struct virtio_net_data_ll *ll_dev,
2174         struct virtio_net_data_ll *ll_dev_last)
2175 {
2176         struct virtio_net_data_ll *ll = *ll_root_addr;
2177
2178         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2179                 return;
2180
2181         if (ll_dev == ll)
2182                 *ll_root_addr = ll_dev->next;
2183         else
2184                 if (likely(ll_dev_last != NULL))
2185                         ll_dev_last->next = ll_dev->next;
2186                 else
2187                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2188 }
2189
2190 /*
2191  * Find and return an entry from the free linked list.
2192  */
2193 static struct virtio_net_data_ll *
2194 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2195 {
2196         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2197         struct virtio_net_data_ll *ll_dev;
2198
2199         if (ll_free == NULL)
2200                 return NULL;
2201
2202         ll_dev = ll_free;
2203         *ll_root_addr = ll_free->next;
2204
2205         return ll_dev;
2206 }
2207
2208 /*
2209  * Place an entry back on to the free linked list.
2210  */
2211 static void
2212 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2213         struct virtio_net_data_ll *ll_dev)
2214 {
2215         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2216
2217         if (ll_dev == NULL)
2218                 return;
2219
2220         ll_dev->next = ll_free;
2221         *ll_root_addr = ll_dev;
2222 }
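/*
 * A minimal sketch of how these four list helpers fit together (new_device()
 * and destroy_device() below are the real callers; 'prev' here stands for
 * the predecessor found while walking the used list):
 *
 *      struct virtio_net_data_ll *entry;
 *
 *      entry = get_data_ll_free_entry(&ll_root_free);
 *      if (entry != NULL) {
 *              entry->vdev = vdev;
 *              add_data_ll_entry(&ll_root_used, entry);
 *              ...
 *              rm_data_ll_entry(&ll_root_used, entry, prev);
 *              put_data_ll_free_entry(&ll_root_free, entry);
 *      }
 */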
2223
2224 /*
2225  * Creates a linked list of a given size.
2226  */
2227 static struct virtio_net_data_ll *
2228 alloc_data_ll(uint32_t size)
2229 {
2230         struct virtio_net_data_ll *ll_new;
2231         uint32_t i;
2232
2233         /* Malloc and then chain the linked list. */
2234         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2235         if (ll_new == NULL) {
2236                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2237                 return NULL;
2238         }
2239
2240         for (i = 0; i < size - 1; i++) {
2241                 ll_new[i].vdev = NULL;
2242                 ll_new[i].next = &ll_new[i+1];
2243         }
2244         ll_new[i].next = NULL;
2245
2246         return ll_new;
2247 }
2248
2249 /*
2250  * Create the main linked list along with each individual core's linked list. A used and a free list
2251  * are created to manage entries.
2252  */
2253 static int
2254 init_data_ll (void)
2255 {
2256         int lcore;
2257
2258         RTE_LCORE_FOREACH_SLAVE(lcore) {
2259                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2260                 if (lcore_info[lcore].lcore_ll == NULL) {
2261                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2262                         return -1;
2263                 }
2264
2265                 lcore_info[lcore].lcore_ll->device_num = 0;
2266                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2267                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2268                 if (num_devices % num_switching_cores)
2269                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2270                 else
2271                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
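                /*
                 * Illustrative sizing example: 5 devices shared by 2
                 * switching cores gives each core a free list of
                 * (5 / 2) + 1 = 3 entries, enough for any distribution
                 * of devices across the cores.
                 */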
2272         }
2273
2274         /* Allocate devices up to a maximum of MAX_DEVICES. */
2275         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2276
2277         return 0;
2278 }
2279
2280 /*
2281  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2282  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2283  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
2284  */
2285 static void
2286 destroy_device (volatile struct virtio_net *dev)
2287 {
2288         struct virtio_net_data_ll *ll_lcore_dev_cur;
2289         struct virtio_net_data_ll *ll_main_dev_cur;
2290         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2291         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2292         struct vhost_dev *vdev;
2293         int lcore;
2294
2295         dev->flags &= ~VIRTIO_DEV_RUNNING;
2296
2297         vdev = (struct vhost_dev *)dev->priv;
2298         /* Set the remove flag. */
2299         vdev->remove = 1;
2300         while (vdev->ready != DEVICE_SAFE_REMOVE) {
2301                 rte_pause();
2302         }
2303
2304         /* Search for entry to be removed from lcore ll */
2305         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2306         while (ll_lcore_dev_cur != NULL) {
2307                 if (ll_lcore_dev_cur->vdev == vdev) {
2308                         break;
2309                 } else {
2310                         ll_lcore_dev_last = ll_lcore_dev_cur;
2311                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2312                 }
2313         }
2314
2315         if (ll_lcore_dev_cur == NULL) {
2316                 RTE_LOG(ERR, VHOST_CONFIG,
2317                         "(%"PRIu64") Failed to find the device to be destroyed.\n",
2318                         dev->device_fh);
2319                 return;
2320         }
2321
2322         /* Search for entry to be removed from main ll */
2323         ll_main_dev_cur = ll_root_used;
2324         ll_main_dev_last = NULL;
2325         while (ll_main_dev_cur != NULL) {
2326                 if (ll_main_dev_cur->vdev == vdev) {
2327                         break;
2328                 } else {
2329                         ll_main_dev_last = ll_main_dev_cur;
2330                         ll_main_dev_cur = ll_main_dev_cur->next;
2331                 }
2332         }
2333
2334         /* Remove entries from the lcore and main ll. */
2335         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2336         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2337
2338         /* Set the dev_removal_flag on each lcore. */
2339         RTE_LCORE_FOREACH_SLAVE(lcore) {
2340                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2341         }
2342
2343         /*
2344          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2345          * they can no longer access the device removed from the linked lists and that the devices
2346          * are no longer in use.
2347          */
2348         RTE_LCORE_FOREACH_SLAVE(lcore) {
2349                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2350                         rte_pause();
2351                 }
2352         }
2353
2354         /* Add the entries back to the lcore and main free ll.*/
2355         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2356         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2357
2358         /* Decrement number of device on the lcore. */
2359         lcore_info[vdev->coreid].lcore_ll->device_num--;
2360
2361         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2362
2363         if (zero_copy) {
2364                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2365
2366                 /* Stop the RX queue. */
2367                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2368                         LOG_DEBUG(VHOST_CONFIG,
2369                                 "(%"PRIu64") In destroy_device: Failed to stop "
2370                                 "rx queue:%d\n",
2371                                 dev->device_fh,
2372                                 vdev->vmdq_rx_q);
2373                 }
2374
2375                 LOG_DEBUG(VHOST_CONFIG,
2376                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2377                         "mempool back to ring for RX queue: %d\n",
2378                         dev->device_fh, vdev->vmdq_rx_q);
2379
2380                 mbuf_destroy_zcp(vpool);
2381
2382                 /* Stop the TX queue. */
2383                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2384                         LOG_DEBUG(VHOST_CONFIG,
2385                                 "(%"PRIu64") In destroy_device: Failed to "
2386                                 "stop tx queue:%d\n",
2387                                 dev->device_fh, vdev->vmdq_rx_q);
2388                 }
2389
2390                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2391
2392                 LOG_DEBUG(VHOST_CONFIG,
2393                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2394                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2395                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2396                         dev->device_fh);
2397
2398                 mbuf_destroy_zcp(vpool);
2399                 rte_free(vdev->regions_hpa);
2400         }
2401         rte_free(vdev);
2402
2403 }
2404
2405 /*
2406  * Calculate the number of physically contiguous regions within one particular
2407  * region whose vhost virtual address range is contiguous. The region starts
2408  * at vva_start and spans 'size' bytes.
2409  */
2410 static uint32_t
2411 check_hpa_regions(uint64_t vva_start, uint64_t size)
2412 {
2413         uint32_t i, nregions = 0, page_size = getpagesize();
2414         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2415         if (vva_start % page_size) {
2416                 LOG_DEBUG(VHOST_CONFIG,
2417                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2418                         "has remainder\n",
2419                         (void *)(uintptr_t)vva_start, page_size);
2420                 return 0;
2421         }
2422         if (size % page_size) {
2423                 LOG_DEBUG(VHOST_CONFIG,
2424                         "in check_hpa_regions: "
2425                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2426                         size, page_size);
2427                 return 0;
2428         }
2429         for (i = 0; i < size - page_size; i = i + page_size) {
2430                 cur_phys_addr
2431                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2432                 next_phys_addr = rte_mem_virt2phy(
2433                         (void *)(uintptr_t)(vva_start + i + page_size));
2434                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2435                         ++nregions;
2436                         LOG_DEBUG(VHOST_CONFIG,
2437                                 "in check_hpa_regions: hva addr:(%p) is not "
2438                                 "continuous with hva addr:(%p), diff:%d\n",
2439                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2440                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2441                                 + page_size), page_size);
2442                         LOG_DEBUG(VHOST_CONFIG,
2443                                 "in check_hpa_regions: hpa addr:(%p) is not "
2444                                 "continuous with hpa addr:(%p), "
2445                                 "diff:(%"PRIu64")\n",
2446                                 (void *)(uintptr_t)cur_phys_addr,
2447                                 (void *)(uintptr_t)next_phys_addr,
2448                                 (next_phys_addr-cur_phys_addr));
2449                 }
2450         }
2451         return nregions;
2452 }
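/*
 * Worked example with illustrative addresses: using 4 KiB pages, a 16 KiB
 * VVA range whose pages map to host-physical 0x1000, 0x2000, 0x5000 and
 * 0x6000 has one break in contiguity (0x2000 to 0x5000), so nregions is 1;
 * the function counts the page boundaries at which host-physical contiguity
 * breaks.
 */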
2453
2454 /*
2455  * Divide each region whose vhost virtual address range is contiguous into a
2456  * few sub-regions, making sure the physical addresses within each sub-region
2457  * are contiguous, and fill the offset (to GPA), size and other information of
2458  * each sub-region into regions_hpa.
2459  */
2460 static uint32_t
2461 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2462 {
2463         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2464         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2465
2466         if (mem_region_hpa == NULL)
2467                 return 0;
2468
2469         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2470                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2471                         virtio_memory->regions[regionidx].address_offset;
2472                 mem_region_hpa[regionidx_hpa].guest_phys_address
2473                         = virtio_memory->regions[regionidx].guest_phys_address;
2474                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2475                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2476                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2477                 LOG_DEBUG(VHOST_CONFIG,
2478                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2479                         regionidx_hpa,
2480                         (void *)(uintptr_t)
2481                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2482                 LOG_DEBUG(VHOST_CONFIG,
2483                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2484                         regionidx_hpa,
2485                         (void *)(uintptr_t)
2486                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2487                 for (i = 0, k = 0;
2488                         i < virtio_memory->regions[regionidx].memory_size -
2489                                 page_size;
2490                         i += page_size) {
2491                         cur_phys_addr = rte_mem_virt2phy(
2492                                         (void *)(uintptr_t)(vva_start + i));
2493                         next_phys_addr = rte_mem_virt2phy(
2494                                         (void *)(uintptr_t)(vva_start +
2495                                         i + page_size));
2496                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2497                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2498                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2499                                         k + page_size;
2500                                 mem_region_hpa[regionidx_hpa].memory_size
2501                                         = k + page_size;
2502                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2503                                         "phys addr end  [%d]:(%p)\n",
2504                                         regionidx_hpa,
2505                                         (void *)(uintptr_t)
2506                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2507                                 LOG_DEBUG(VHOST_CONFIG,
2508                                         "in fill_hpa_regions: guest phys addr "
2509                                         "size [%d]:(%p)\n",
2510                                         regionidx_hpa,
2511                                         (void *)(uintptr_t)
2512                                         (mem_region_hpa[regionidx_hpa].memory_size));
2513                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2514                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2515                                 ++regionidx_hpa;
2516                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2517                                         next_phys_addr -
2518                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2519                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2520                                         " phys addr start[%d]:(%p)\n",
2521                                         regionidx_hpa,
2522                                         (void *)(uintptr_t)
2523                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2524                                 LOG_DEBUG(VHOST_CONFIG,
2525                                         "in fill_hpa_regions: host  phys addr "
2526                                         "start[%d]:(%p)\n",
2527                                         regionidx_hpa,
2528                                         (void *)(uintptr_t)
2529                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2530                                 k = 0;
2531                         } else {
2532                                 k += page_size;
2533                         }
2534                 }
2535                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2536                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2537                         + k + page_size;
2538                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2539                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2540                         "[%d]:(%p)\n", regionidx_hpa,
2541                         (void *)(uintptr_t)
2542                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2543                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2544                         "[%d]:(%p)\n", regionidx_hpa,
2545                         (void *)(uintptr_t)
2546                         (mem_region_hpa[regionidx_hpa].memory_size));
2547                 ++regionidx_hpa;
2548         }
2549         return regionidx_hpa;
2550 }
2551
2552 /*
2553  * A new device is added to a data core. First the device is added to the
2554  * main linked list and then allocated to a specific data core.
2555  */
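/*
 * Invoked by the vhost library (via the virtio_net_device_ops table below)
 * once a guest has completed its virtio configuration; the exact trigger is
 * internal to the library.
 */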
2556 static int
2557 new_device (struct virtio_net *dev)
2558 {
2559         struct virtio_net_data_ll *ll_dev;
2560         int lcore, core_add = 0;
2561         uint32_t device_num_min = num_devices;
2562         struct vhost_dev *vdev;
2563         uint32_t regionidx;
2564
2565         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2566         if (vdev == NULL) {
2567                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2568                         dev->device_fh);
2569                 return -1;
2570         }
2571         vdev->dev = dev;
2572         dev->priv = vdev;
2573
2574         if (zero_copy) {
2575                 vdev->nregions_hpa = dev->mem->nregions;
2576                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2577                         vdev->nregions_hpa
2578                                 += check_hpa_regions(
2579                                         dev->mem->regions[regionidx].guest_phys_address
2580                                         + dev->mem->regions[regionidx].address_offset,
2581                                         dev->mem->regions[regionidx].memory_size);
2582
2583                 }
2584
2585                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2586                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2587                         RTE_CACHE_LINE_SIZE);
2588                 if (vdev->regions_hpa == NULL) {
2589                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2590                         rte_free(vdev);
2591                         return -1;
2592                 }
2593
2595                 if (fill_hpa_memory_regions(vdev->regions_hpa,
2596                         dev->mem) != vdev->nregions_hpa) {
2599                         RTE_LOG(ERR, VHOST_CONFIG,
2600                                 "hpa memory regions number mismatch: "
2601                                 "[%d]\n", vdev->nregions_hpa);
2602                         rte_free(vdev->regions_hpa);
2603                         rte_free(vdev);
2604                         return -1;
2605                 }
2606         }
2607
2609         /* Add device to main ll */
2610         ll_dev = get_data_ll_free_entry(&ll_root_free);
2611         if (ll_dev == NULL) {
2612                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2613                         "of %d devices per core has been reached\n",
2614                         dev->device_fh, num_devices);
2615                 if (vdev->regions_hpa)
2616                         rte_free(vdev->regions_hpa);
2617                 rte_free(vdev);
2618                 return -1;
2619         }
2620         ll_dev->vdev = vdev;
2621         add_data_ll_entry(&ll_root_used, ll_dev);
2622         vdev->vmdq_rx_q = dev->device_fh * (num_queues / num_devices);
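        /*
         * Each device is assigned its own VMDq RX queue: with num_queues equal
         * to num_devices this maps device_fh one-to-one onto a queue index.
         */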
2624
2625         if (zero_copy) {
2626                 uint32_t index = vdev->vmdq_rx_q;
2627                 uint32_t count_in_ring, i;
2628                 struct mbuf_table *tx_q;
2629
2630                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2631
2632                 LOG_DEBUG(VHOST_CONFIG,
2633                         "(%"PRIu64") in new_device: mbuf count in mempool "
2634                         "before attach is: %d\n",
2635                         dev->device_fh,
2636                         rte_mempool_count(vpool_array[index].pool));
2637                 LOG_DEBUG(VHOST_CONFIG,
2638                         "(%"PRIu64") in new_device: mbuf count in ring "
2639                         "before attach is: %d\n",
2640                         dev->device_fh, count_in_ring);
2641
2642                 /*
2643                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2644                  */
2645                 for (i = 0; i < count_in_ring; i++)
2646                         attach_rxmbuf_zcp(dev);
2647
2648                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2649                         "mempool after attach is: %d\n",
2650                         dev->device_fh,
2651                         rte_mempool_count(vpool_array[index].pool));
2652                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2653                         "ring after attach is: %d\n",
2654                         dev->device_fh,
2655                         rte_ring_count(vpool_array[index].ring));
2656
2657                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2658                 tx_q->txq_id = vdev->vmdq_rx_q;
2659
2660                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2661                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2662
2663                         LOG_DEBUG(VHOST_CONFIG,
2664                                 "(%"PRIu64") In new_device: Failed to start "
2665                                 "tx queue:%d\n",
2666                                 dev->device_fh, vdev->vmdq_rx_q);
2667
2668                         mbuf_destroy_zcp(vpool);
2669                         rte_free(vdev->regions_hpa);
2670                         rte_free(vdev);
2671                         return -1;
2672                 }
2673
2674                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2675                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2676
2677                         LOG_DEBUG(VHOST_CONFIG,
2678                                 "(%"PRIu64") In new_device: Failed to start "
2679                                 "rx queue:%d\n",
2680                                 dev->device_fh, vdev->vmdq_rx_q);
2681
2682                         /* Stop the TX queue. */
2683                         if (rte_eth_dev_tx_queue_stop(ports[0],
2684                                 vdev->vmdq_rx_q) != 0) {
2685                                 LOG_DEBUG(VHOST_CONFIG,
2686                                         "(%"PRIu64") In new_device: Failed to "
2687                                         "stop tx queue:%d\n",
2688                                         dev->device_fh, vdev->vmdq_rx_q);
2689                         }
2690
2691                         mbuf_destroy_zcp(vpool);
2692                         rte_free(vdev->regions_hpa);
2693                         rte_free(vdev);
2694                         return -1;
2695                 }
2696
2697         }
2698
2699         /* Reset the ready flag. */
2700         vdev->ready = DEVICE_MAC_LEARNING;
2701         vdev->remove = 0;
2702
2703         /* Find the least-loaded data core to host the device. */
2704         RTE_LCORE_FOREACH_SLAVE(lcore) {
2705                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2706                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2707                         core_add = lcore;
2708                 }
2709         }
2710         /* Add device to lcore ll */
2711         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2712         if (ll_dev == NULL) {
2713                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2714                 vdev->ready = DEVICE_SAFE_REMOVE;
2715                 destroy_device(dev);
2716                 if (vdev->regions_hpa)
2717                         rte_free(vdev->regions_hpa);
2718                 rte_free(vdev);
2719                 return -1;
2720         }
2721         ll_dev->vdev = vdev;
2722         vdev->coreid = core_add;
2723
2724         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2725
2726         /* Initialize device stats */
2727         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2728
2729         /* Disable guest notifications; the data cores poll the virtqueues instead. */
2730         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2731         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2732         lcore_info[vdev->coreid].lcore_ll->device_num++;
2733         dev->flags |= VIRTIO_DEV_RUNNING;
2734
2735         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2736
2737         return 0;
2738 }
2739
2740 /*
2741  * These callbacks allow devices to be added to the data cores once their
2742  * configuration is fully complete.
2743  */
2744 static const struct virtio_net_device_ops virtio_net_device_ops =
2745 {
2746         .new_device =  new_device,
2747         .destroy_device = destroy_device,
2748 };
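/*
 * These ops are handed to the vhost library in main() via
 * rte_vhost_driver_callback_register(&virtio_net_device_ops), after which the
 * library invokes new_device()/destroy_device() as guests come and go.
 */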
2749
2750 /*
2751  * This thread wakes up periodically to print statistics if the user has
2752  * enabled them.
2753  */
2754 static void
2755 print_stats(void)
2756 {
2757         struct virtio_net_data_ll *dev_ll;
2758         uint64_t tx_dropped, rx_dropped;
2759         uint64_t tx, tx_total, rx, rx_total;
2760         uint32_t device_fh;
2761         const char clr[] = { 27, '[', '2', 'J', '\0' };
2762         const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };
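        /* ANSI escape sequences: ESC[2J clears the screen and ESC[1;1H moves
         * the cursor to the top-left corner. */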
2763
2764         while (1) {
2765                 sleep(enable_stats);
2766
2767                 /* Clear screen and move to top left */
2768                 printf("%s%s", clr, top_left);
2769
2770                 printf("\nDevice statistics ====================================");
2771
2772                 dev_ll = ll_root_used;
2773                 while (dev_ll != NULL) {
2774                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2775                         tx_total = dev_statistics[device_fh].tx_total;
2776                         tx = dev_statistics[device_fh].tx;
2777                         tx_dropped = tx_total - tx;
2778                         if (zero_copy == 0) {
2779                                 rx_total = rte_atomic64_read(
2780                                         &dev_statistics[device_fh].rx_total_atomic);
2781                                 rx = rte_atomic64_read(
2782                                         &dev_statistics[device_fh].rx_atomic);
2783                         } else {
2784                                 rx_total = dev_statistics[device_fh].rx_total;
2785                                 rx = dev_statistics[device_fh].rx;
2786                         }
2787                         rx_dropped = rx_total - rx;
2788
2789                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2790                                         "\nTX total:            %"PRIu64""
2791                                         "\nTX dropped:          %"PRIu64""
2792                                         "\nTX successful:               %"PRIu64""
2793                                         "\nRX total:            %"PRIu64""
2794                                         "\nRX dropped:          %"PRIu64""
2795                                         "\nRX successful:               %"PRIu64"",
2796                                         device_fh,
2797                                         tx_total,
2798                                         tx_dropped,
2799                                         tx,
2800                                         rx_total,
2801                                         rx_dropped,
2802                                         rx);
2803
2804                         dev_ll = dev_ll->next;
2805                 }
2806                 printf("\n======================================================\n");
2807         }
2808 }
2809
2810 static void
2811 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2812         char *ring_name, uint32_t nb_mbuf)
2813 {
2814         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2815         vpool_array[index].pool
2816                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2817                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2818                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2819                 rte_pktmbuf_init, NULL, socket, 0);
2820         if (vpool_array[index].pool != NULL) {
2821                 vpool_array[index].ring
2822                         = rte_ring_create(ring_name,
2823                                 rte_align32pow2(nb_mbuf + 1),
2824                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2825                 if (likely(vpool_array[index].ring != NULL)) {
2826                         LOG_DEBUG(VHOST_CONFIG,
2827                                 "in setup_mempool_tbl: mbuf count in "
2828                                 "mempool is: %d\n",
2829                                 rte_mempool_count(vpool_array[index].pool));
2830                         LOG_DEBUG(VHOST_CONFIG,
2831                                 "in setup_mempool_tbl: mbuf count in "
2832                                 "ring   is: %d\n",
2833                                 rte_ring_count(vpool_array[index].ring));
2834                 } else {
2835                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2836                                 ring_name);
2837                 }
2838
2839                 /* Subtract the headroom so buf_size reflects the usable frame space. */
2840                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2841         } else {
2842                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2843         }
2844 }
2845
2846
2847 /*
2848  * Main function: performs initialisation and launches the per-lcore
2849  * functions. The CUSE device is also registered here to handle the IOCTLs.
2850  */
2851 int
2852 main(int argc, char *argv[])
2853 {
2854         struct rte_mempool *mbuf_pool = NULL;
2855         unsigned lcore_id, core_id = 0;
2856         unsigned nb_ports, valid_num_ports;
2857         int ret;
2858         uint8_t portid, queue_id = 0;
2859         static pthread_t tid;
2860
2861         /* init EAL */
2862         ret = rte_eal_init(argc, argv);
2863         if (ret < 0)
2864                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2865         argc -= ret;
2866         argv += ret;
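        /* rte_eal_init() returns the number of arguments it consumed, so
         * skipping past them leaves only the application's own options. */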
2867
2868         /* parse app arguments */
2869         ret = us_vhost_parse_args(argc, argv);
2870         if (ret < 0)
2871                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2872
2873         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2874                 if (rte_lcore_is_enabled(lcore_id))
2875                         lcore_ids[core_id++] = lcore_id;
2876
2877         if (rte_lcore_count() > RTE_MAX_LCORE)
2878                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2879
2880         /* Set the number of switching cores available. */
2881         num_switching_cores = rte_lcore_count()-1;
2882
2883         /* Get the number of physical ports. */
2884         nb_ports = rte_eth_dev_count();
2885         if (nb_ports > RTE_MAX_ETHPORTS)
2886                 nb_ports = RTE_MAX_ETHPORTS;
2887
2888         /*
2889          * Update the global variable num_ports and the global array ports,
2890          * and derive valid_num_ports from the number of ports in the system.
2891          */
2892         valid_num_ports = check_ports_num(nb_ports);
2893
2894         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2895                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2896                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2897                 return -1;
2898         }
2899
2900         if (zero_copy == 0) {
2901                 /* Create the mbuf pool. */
2902                 mbuf_pool = rte_mempool_create(
2903                                 "MBUF_POOL",
2904                                 NUM_MBUFS_PER_PORT
2905                                 * valid_num_ports,
2906                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2907                                 sizeof(struct rte_pktmbuf_pool_private),
2908                                 rte_pktmbuf_pool_init, NULL,
2909                                 rte_pktmbuf_init, NULL,
2910                                 rte_socket_id(), 0);
2911                 if (mbuf_pool == NULL)
2912                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2913
2914                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2915                         vpool_array[queue_id].pool = mbuf_pool;
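                /* In non-zero-copy mode every queue shares this single mbuf
                 * pool; the per-queue rings are only used by the zero-copy
                 * path. */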
2916
2917                 if (vm2vm_mode == VM2VM_HARDWARE) {
2918                         /* Enable VT loop back to let L2 switch to do it. */
2919                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2920                         LOG_DEBUG(VHOST_CONFIG,
2921                                 "Enable loop back for L2 switch in vmdq.\n");
2922                 }
2923         } else {
2924                 uint32_t nb_mbuf;
2925                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2926                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2927
2928                 /*
2929                  * Zero copy defers queue RX/TX start to the time when guest
2930                  * finishes its startup and packet buffers from that guest are
2931                  * available.
2932                  */
2933                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2934                 rx_conf_default.rx_drop_en = 0;
2935                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2936                 nb_mbuf = num_rx_descriptor
2937                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2938                         + num_switching_cores * MAX_PKT_BURST;
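                /*
                 * Sizing sketch with illustrative numbers: if num_rx_descriptor
                 * were 128 with three switching cores, each RX pool would hold
                 * 128 + 3 * MBUF_CACHE_SIZE_ZCP + 3 * MAX_PKT_BURST mbufs.
                 */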
2939
2940                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2941                         snprintf(pool_name, sizeof(pool_name),
2942                                 "rxmbuf_pool_%u", queue_id);
2943                         snprintf(ring_name, sizeof(ring_name),
2944                                 "rxmbuf_ring_%u", queue_id);
2945                         setup_mempool_tbl(rte_socket_id(), queue_id,
2946                                 pool_name, ring_name, nb_mbuf);
2947                 }
2948
2949                 nb_mbuf = num_tx_descriptor
2950                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2951                                 + num_switching_cores * MAX_PKT_BURST;
2952
2953                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2954                         snprintf(pool_name, sizeof(pool_name),
2955                                 "txmbuf_pool_%u", queue_id);
2956                         snprintf(ring_name, sizeof(ring_name),
2957                                 "txmbuf_ring_%u", queue_id);
2958                         setup_mempool_tbl(rte_socket_id(),
2959                                 (queue_id + MAX_QUEUES),
2960                                 pool_name, ring_name, nb_mbuf);
2961                 }
2962
2963                 if (vm2vm_mode == VM2VM_HARDWARE) {
2964                         /* Enable VT loop back to let L2 switch to do it. */
2965                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2966                         LOG_DEBUG(VHOST_CONFIG,
2967                                 "Enable loop back for L2 switch in vmdq.\n");
2968                 }
2969         }
2970         /* Set log level. */
2971         rte_set_log_level(LOG_LEVEL);
2972
2973         /* initialize all ports */
2974         for (portid = 0; portid < nb_ports; portid++) {
2975                 /* skip ports that are not enabled */
2976                 if ((enabled_port_mask & (1 << portid)) == 0) {
2977                         RTE_LOG(INFO, VHOST_PORT,
2978                                 "Skipping disabled port %d\n", portid);
2979                         continue;
2980                 }
2981                 if (port_init(portid) != 0)
2982                         rte_exit(EXIT_FAILURE,
2983                                 "Cannot initialize network ports\n");
2984         }
2985
2986         /* Initialise all linked lists. */
2987         if (init_data_ll() == -1)
2988                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2989
2990         /* Initialize device stats */
2991         memset(&dev_statistics, 0, sizeof(dev_statistics));
2992
2993         /* Enable stats if the user option is set. */
2994         if (enable_stats)
2995                 pthread_create(&tid, NULL, (void *(*)(void *))print_stats, NULL);
2996
2997         /* Launch all data cores. */
2998         if (zero_copy == 0) {
2999                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3000                         rte_eal_remote_launch(switch_worker,
3001                                 mbuf_pool, lcore_id);
3002                 }
3003         } else {
3004                 uint32_t count_in_mempool, index, i;
3005                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3006                         /* For all RX and TX queues. */
3007                         count_in_mempool
3008                                 = rte_mempool_count(vpool_array[index].pool);
3009
3010                         /*
3011                          * Transfer all unattached mbufs from vpool.pool
3012                          * to vpool.ring.
3013                          */
3014                         for (i = 0; i < count_in_mempool; i++) {
3015                                 struct rte_mbuf *mbuf
3016                                         = __rte_mbuf_raw_alloc(
3017                                                 vpool_array[index].pool);
3018                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3019                                                 (void *)mbuf);
3020                         }
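                        /* Single-producer enqueue is safe here: this setup
                         * loop runs on the main lcore before the workers are
                         * launched, matching the ring's RING_F_SP_ENQ flag. */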
3021
3022                         LOG_DEBUG(VHOST_CONFIG,
3023                                 "in main: mbuf count in mempool at initial "
3024                                 "is: %d\n", count_in_mempool);
3025                         LOG_DEBUG(VHOST_CONFIG,
3026                                 "in main: mbuf count in ring at initial "
3027                                 "is: %d\n",
3028                                 rte_ring_count(vpool_array[index].ring));
3029                 }
3030
3031                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3032                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3033                                 lcore_id);
3034         }
3035
3036         if (mergeable == 0)
3037                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
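        /* Clearing the feature bit keeps mergeable RX buffers out of the
         * feature set offered to guests during virtio negotiation. */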
3038
3039         /* Register CUSE device to handle IOCTLs. */
3040         ret = rte_vhost_driver_register((char *)&dev_basename);
3041         if (ret != 0)
3042                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3043
3044         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3045
3046         /* Start CUSE session. */
3047         rte_vhost_driver_session_start();
3048         return 0;
3049
3050 }
3051