1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #ifndef MAX_QUEUES
57 #define MAX_QUEUES 128
58 #endif
59
60 /* the maximum number of external ports supported */
61 #define MAX_SUP_PORTS 1
62
63 /*
64  * Calculate the number of buffers needed per port
65  */
66 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +    \
67                             (num_switching_cores*MAX_PKT_BURST) +       \
68                             (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
69                             (num_switching_cores*MBUF_CACHE_SIZE))
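/*
 * A worked sizing example (illustrative values assumed): with
 * MAX_QUEUES = 128, RTE_TEST_RX_DESC_DEFAULT = 1024 and
 * num_switching_cores = 2:
 *
 *     128 * 1024 (RX descriptors)    = 131072
 *   +   2 *   32 (burst buffers)     =     64
 *   +   2 *  512 (TX descriptors)    =   1024
 *   +   2 *  128 (mempool caches)    =    256
 *   --------------------------------------------
 *     NUM_MBUFS_PER_PORT             = 132416 mbufs
 */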
70
71 #define MBUF_CACHE_SIZE 128
72 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
73
74 /*
75  * No frame data buffers allocated from the host are required for the zero
76  * copy implementation; the guest allocates the frame data buffers, and
77  * vhost uses them directly.
78  */
79 #define VIRTIO_DESCRIPTOR_LEN_ZCP       RTE_MBUF_DEFAULT_DATAROOM
80 #define MBUF_DATA_SIZE_ZCP              RTE_MBUF_DEFAULT_BUF_SIZE
81 #define MBUF_CACHE_SIZE_ZCP 0
82
83 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
84 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
85
86 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
87 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
88
89 #define JUMBO_FRAME_MAX_SIZE    0x2600
90
91 /* State of virtio device. */
92 #define DEVICE_MAC_LEARNING 0
93 #define DEVICE_RX                       1
94 #define DEVICE_SAFE_REMOVE      2
95
96 /* Config_core_flag status definitions. */
97 #define REQUEST_DEV_REMOVAL 1
98 #define ACK_DEV_REMOVAL 0
99
100 /* Configurable number of RX/TX ring descriptors */
101 #define RTE_TEST_RX_DESC_DEFAULT 1024
102 #define RTE_TEST_TX_DESC_DEFAULT 512
103
104 /*
105  * These two macros need refining for the legacy and DPDK-based front ends:
106  * take the max vring avail descriptors/entries from the guest, subtract
107  * MAX_PKT_BURST, and then round to a power of 2.
108  */
109 /*
110  * For the legacy front end: 128 descriptors,
111  * half for virtio headers, the other half for mbufs.
112  */
113 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
114 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
115
116 /* Get first 4 bytes in mbuf headroom. */
117 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
118                 + sizeof(struct rte_mbuf)))
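/*
 * A minimal usage sketch (hypothetical, for illustration only): the macro
 * aliases the first four bytes directly after struct rte_mbuf as scratch
 * space, so a 32-bit value can be stashed per mbuf:
 *
 *     MBUF_HEADROOM_UINT32(mbuf) = some_index;    // store (some_index assumed)
 *     uint32_t idx = MBUF_HEADROOM_UINT32(mbuf);  // load it back
 */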
119
120 /* true if x is a power of 2 */
121 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
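/*
 * For x > 0, x is a power of 2 iff x-1 shares no set bits with x, e.g.:
 *
 *     POWEROF2(64): (63 & 64) == 0  -> true
 *     POWEROF2(48): (47 & 48) == 32 -> false
 *
 * Note that POWEROF2(0) also evaluates to true.
 */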
122
123 #define INVALID_PORT_ID 0xFF
124
125 /* Max number of devices. Limited by vmdq. */
126 #define MAX_DEVICES 64
127
128 /* Size of buffers used for snprintfs. */
129 #define MAX_PRINT_BUFF 6072
130
131 /* Maximum character device basename size. */
132 #define MAX_BASENAME_SZ 10
133
134 /* Maximum long option length for option parsing. */
135 #define MAX_LONG_OPT_SZ 64
136
137 /* Used to compare MAC addresses. */
138 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
139
140 /* Number of descriptors per cacheline. */
141 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
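/*
 * For example, with a 64-byte cache line and the 16-byte struct vring_desc
 * (8-byte addr + 4-byte len + 2-byte flags + 2-byte next),
 * DESC_PER_CACHELINE evaluates to 64 / 16 = 4.
 */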
142
143 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
144
145 /* mask of enabled ports */
146 static uint32_t enabled_port_mask = 0;
147
148 /* Promiscuous mode */
149 static uint32_t promiscuous;
150
151 /* Number of switching cores enabled */
152 static uint32_t num_switching_cores = 0;
153
154 /* Number of devices/queues to support */
155 static uint32_t num_queues = 0;
156 static uint32_t num_devices;
157
158 /*
159  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
160  * descriptors. Disabled by default.
161  */
162 static uint32_t zero_copy;
163 static int mergeable;
164
166 /* Do VLAN strip on the host; enabled by default */
166 static uint32_t vlan_strip = 1;
167
168 /* Number of descriptors to use */
169 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
170 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
171
172 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
173 #define MAX_RING_DESC 4096
174
175 struct vpool {
176         struct rte_mempool *pool;
177         struct rte_ring *ring;
178         uint32_t buf_size;
179 } vpool_array[MAX_QUEUES+MAX_QUEUES];
180
181 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
182 typedef enum {
183         VM2VM_DISABLED = 0,
184         VM2VM_SOFTWARE = 1,
185         VM2VM_HARDWARE = 2,
186         VM2VM_LAST
187 } vm2vm_type;
188 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
189
190 /* The type of host physical address translated from guest physical address. */
191 typedef enum {
192         PHYS_ADDR_CONTINUOUS = 0,
193         PHYS_ADDR_CROSS_SUBREG = 1,
194         PHYS_ADDR_INVALID = 2,
195         PHYS_ADDR_LAST
196 } hpa_type;
197
198 /* Enable stats. */
199 static uint32_t enable_stats = 0;
200 /* Enable retries on RX. */
201 static uint32_t enable_retry = 1;
202 /* Specify the timeout (in microseconds) between retries on RX. */
203 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
204 /* Specify the number of retries on RX. */
205 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
206
207 /* Character device basename. Can be set by user. */
208 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
209
210 /* Empty VMDQ configuration structure. Filled in programmatically. */
211 static struct rte_eth_conf vmdq_conf_default = {
212         .rxmode = {
213                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
214                 .split_hdr_size = 0,
215                 .header_split   = 0, /**< Header Split disabled */
216                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
217                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
218                 /*
219                  * This is necessary for 1G NICs such as the I350;
220                  * it fixes a bug where IPv4 forwarding in the guest could
221                  * not forward packets from one virtio dev to another.
222                  */
223                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
224                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
225                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
226         },
227
228         .txmode = {
229                 .mq_mode = ETH_MQ_TX_NONE,
230         },
231         .rx_adv_conf = {
232                 /*
233                  * should be overridden separately in code with
234                  * appropriate values
235                  */
236                 .vmdq_rx_conf = {
237                         .nb_queue_pools = ETH_8_POOLS,
238                         .enable_default_pool = 0,
239                         .default_pool = 0,
240                         .nb_pool_maps = 0,
241                         .pool_map = {{0, 0},},
242                 },
243         },
244 };
245
246 static unsigned lcore_ids[RTE_MAX_LCORE];
247 static uint8_t ports[RTE_MAX_ETHPORTS];
248 static unsigned num_ports = 0; /**< The number of ports specified in command line */
249 static uint16_t num_pf_queues, num_vmdq_queues;
250 static uint16_t vmdq_pool_base, vmdq_queue_base;
251 static uint16_t queues_per_pool;
252
253 static const uint16_t external_pkt_default_vlan_tag = 2000;
254 const uint16_t vlan_tags[] = {
255         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
256         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
257         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
258         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
259         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
260         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
261         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
262         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
263 };
264
265 /* ethernet addresses of ports */
266 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
267
268 /* heads for the main used and free linked lists for the data path. */
269 static struct virtio_net_data_ll *ll_root_used = NULL;
270 static struct virtio_net_data_ll *ll_root_free = NULL;
271
272 /* Array of data core structures containing information on individual core linked lists. */
273 static struct lcore_info lcore_info[RTE_MAX_LCORE];
274
275 /* Used for queueing bursts of TX packets. */
276 struct mbuf_table {
277         unsigned len;
278         unsigned txq_id;
279         struct rte_mbuf *m_table[MAX_PKT_BURST];
280 };
281
282 /* TX queue for each data core. */
283 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
284
285 /* TX queue for each virtio device for zero copy. */
286 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
287
288 /* Vlan header struct used to insert vlan tags on TX. */
289 struct vlan_ethhdr {
290         unsigned char   h_dest[ETH_ALEN];
291         unsigned char   h_source[ETH_ALEN];
292         __be16          h_vlan_proto;
293         __be16          h_vlan_TCI;
294         __be16          h_vlan_encapsulated_proto;
295 };
296
297 /* IPv4 Header */
298 struct ipv4_hdr {
299         uint8_t  version_ihl;           /**< version and header length */
300         uint8_t  type_of_service;       /**< type of service */
301         uint16_t total_length;          /**< length of packet */
302         uint16_t packet_id;             /**< packet ID */
303         uint16_t fragment_offset;       /**< fragmentation offset */
304         uint8_t  time_to_live;          /**< time to live */
305         uint8_t  next_proto_id;         /**< protocol ID */
306         uint16_t hdr_checksum;          /**< header checksum */
307         uint32_t src_addr;              /**< source address */
308         uint32_t dst_addr;              /**< destination address */
309 } __attribute__((__packed__));
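/*
 * A minimal sketch (illustrative only): version_ihl packs two 4-bit fields
 * that can be unpacked as:
 *
 *     uint8_t version   = hdr->version_ihl >> 4;          // 4 for IPv4
 *     uint8_t ihl_bytes = (hdr->version_ihl & 0x0f) * 4;  // header length in bytes
 */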
310
311 /* Header lengths. */
312 #define VLAN_HLEN       4
313 #define VLAN_ETH_HLEN   18
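/*
 * VLAN_ETH_HLEN matches struct vlan_ethhdr above: 6 (dest) + 6 (source) +
 * 2 (proto) + 4 (802.1Q tag) = 18 bytes, i.e. the standard 14-byte Ethernet
 * header plus VLAN_HLEN.
 */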
314
315 /* Per-device statistics struct */
316 struct device_statistics {
317         uint64_t tx_total;
318         rte_atomic64_t rx_total_atomic;
319         uint64_t rx_total;
320         uint64_t tx;
321         rte_atomic64_t rx_atomic;
322         uint64_t rx;
323 } __rte_cache_aligned;
324 struct device_statistics dev_statistics[MAX_DEVICES];
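/*
 * Note (inferred from how the counters are updated below): the rx counters
 * are rte_atomic64_t because any data core may enqueue into a device's RX
 * queue, while the tx counters are only touched by the core that owns the
 * device, so plain uint64_t suffices there.
 */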
325
326 /*
327  * Builds up the correct configuration for VMDQ VLAN pool map
328  * according to the pool & queue limits.
329  */
330 static inline int
331 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
332 {
333         struct rte_eth_vmdq_rx_conf conf;
334         struct rte_eth_vmdq_rx_conf *def_conf =
335                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
336         unsigned i;
337
338         memset(&conf, 0, sizeof(conf));
339         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
340         conf.nb_pool_maps = num_devices;
341         conf.enable_loop_back = def_conf->enable_loop_back;
342         conf.rx_mode = def_conf->rx_mode;
343
344         for (i = 0; i < conf.nb_pool_maps; i++) {
345                 conf.pool_map[i].vlan_id = vlan_tags[i];
346                 conf.pool_map[i].pools = (1UL << i);
347         }
348
349         (void)rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf));
350         (void)rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
351                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf));
352         return 0;
353 }
354
355 /*
356  * Validate the device number against the max pool number obtained from
357  * dev_info. If the device number is invalid, print an error message and
358  * return -1. Each device must have its own pool.
359  */
360 static inline int
361 validate_num_devices(uint32_t max_nb_devices)
362 {
363         if (num_devices > max_nb_devices) {
364                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
365                 return -1;
366         }
367         return 0;
368 }
369
370 /*
371  * Initialises a given port using global settings and with the rx buffers
372  * coming from the mbuf_pool passed as a parameter.
373  */
374 static inline int
375 port_init(uint8_t port)
376 {
377         struct rte_eth_dev_info dev_info;
378         struct rte_eth_conf port_conf;
379         struct rte_eth_rxconf *rxconf;
380         struct rte_eth_txconf *txconf;
381         int16_t rx_rings, tx_rings;
382         uint16_t rx_ring_size, tx_ring_size;
383         int retval;
384         uint16_t q;
385
386         /* The max pool number from dev_info is used to validate the pool number given on the command line. */
387         rte_eth_dev_info_get(port, &dev_info);
388
389         if (dev_info.max_rx_queues > MAX_QUEUES) {
390                 rte_exit(EXIT_FAILURE,
391                         "please define MAX_QUEUES no less than %u in %s\n",
392                         dev_info.max_rx_queues, __FILE__);
393         }
394
395         rxconf = &dev_info.default_rxconf;
396         txconf = &dev_info.default_txconf;
397         rxconf->rx_drop_en = 1;
398
399         /* Enable vlan offload */
400         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
401
402         /*
403          * Zero copy defers queue RX/TX start to the time when guest
404          * finishes its startup and packet buffers from that guest are
405          * available.
406          */
407         if (zero_copy) {
408                 rxconf->rx_deferred_start = 1;
409                 rxconf->rx_drop_en = 0;
410                 txconf->tx_deferred_start = 1;
411         }
412
413         /* Configure the number of supported virtio devices based on VMDQ limits. */
414         num_devices = dev_info.max_vmdq_pools;
415
416         if (zero_copy) {
417                 rx_ring_size = num_rx_descriptor;
418                 tx_ring_size = num_tx_descriptor;
419                 tx_rings = dev_info.max_tx_queues;
420         } else {
421                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
422                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
423                 tx_rings = (uint16_t)rte_lcore_count();
424         }
425
426         retval = validate_num_devices(MAX_DEVICES);
427         if (retval < 0)
428                 return retval;
429
430         /* Get port configuration. */
431         retval = get_eth_conf(&port_conf, num_devices);
432         if (retval < 0)
433                 return retval;
434         /* NIC queues are divided into pf queues and vmdq queues.  */
435         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
436         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
437         num_vmdq_queues = num_devices * queues_per_pool;
438         num_queues = num_pf_queues + num_vmdq_queues;
439         vmdq_queue_base = dev_info.vmdq_queue_base;
440         vmdq_pool_base  = dev_info.vmdq_pool_base;
441         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
442                 num_pf_queues, num_devices, queues_per_pool);
443
444         if (port >= rte_eth_dev_count()) return -1;
445
446         rx_rings = (uint16_t)dev_info.max_rx_queues;
447         /* Configure ethernet device. */
448         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
449         if (retval != 0)
450                 return retval;
451
452         /* Setup the queues. */
453         for (q = 0; q < rx_rings; q ++) {
454                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
455                                                 rte_eth_dev_socket_id(port),
456                                                 rxconf,
457                                                 vpool_array[q].pool);
458                 if (retval < 0)
459                         return retval;
460         }
461         for (q = 0; q < tx_rings; q ++) {
462                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
463                                                 rte_eth_dev_socket_id(port),
464                                                 txconf);
465                 if (retval < 0)
466                         return retval;
467         }
468
469         /* Start the device. */
470         retval  = rte_eth_dev_start(port);
471         if (retval < 0) {
472                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
473                 return retval;
474         }
475
476         if (promiscuous)
477                 rte_eth_promiscuous_enable(port);
478
479         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
480         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
481         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
482                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
483                         (unsigned)port,
484                         vmdq_ports_eth_addr[port].addr_bytes[0],
485                         vmdq_ports_eth_addr[port].addr_bytes[1],
486                         vmdq_ports_eth_addr[port].addr_bytes[2],
487                         vmdq_ports_eth_addr[port].addr_bytes[3],
488                         vmdq_ports_eth_addr[port].addr_bytes[4],
489                         vmdq_ports_eth_addr[port].addr_bytes[5]);
490
491         return 0;
492 }
493
494 /*
495  * Set character device basename.
496  */
497 static int
498 us_vhost_parse_basename(const char *q_arg)
499 {
500         /* Reject a basename that would not fit (including the NUL). */
501
502         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
503                 return -1;
504         else
505                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
506
507         return 0;
508 }
509
510 /*
511  * Parse the portmask provided at run time.
512  */
513 static int
514 parse_portmask(const char *portmask)
515 {
516         char *end = NULL;
517         unsigned long pm;
518
519         errno = 0;
520
521         /* parse hexadecimal string */
522         pm = strtoul(portmask, &end, 16);
523         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
524                 return -1;
525
526         if (pm == 0)
527                 return -1;
528
529         return pm;
530
531 }
532
533 /*
534  * Parse num options at run time.
535  */
536 static int
537 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
538 {
539         char *end = NULL;
540         unsigned long num;
541
542         errno = 0;
543
544         /* parse unsigned int string */
545         num = strtoul(q_arg, &end, 10);
546         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
547                 return -1;
548
549         if (num > max_valid_value)
550                 return -1;
551
552         return num;
553
554 }
555
556 /*
557  * Display usage
558  */
559 static void
560 us_vhost_usage(const char *prgname)
561 {
562         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
563         "               --vm2vm [0|1|2]\n"
564         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
565         "               --dev-basename <name>\n"
566         "               --nb-devices ND\n"
567         "               -p PORTMASK: Set mask for ports to be used by application\n"
568         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
569         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
570         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Effective only if rx retries are enabled\n"
571         "               --rx-retry-num [0-N]: the number of retries on rx. Effective only if rx retries are enabled\n"
572         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
573         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
574         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
575         "               --dev-basename: The basename to be used for the character device.\n"
576         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
577                         "zero copy\n"
578         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
579                         "used only when zero copy is enabled.\n"
580         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
581                         "used only when zero copy is enabled.\n",
582                prgname);
583 }
584
585 /*
586  * Parse the arguments given in the command line of the application.
587  */
588 static int
589 us_vhost_parse_args(int argc, char **argv)
590 {
591         int opt, ret;
592         int option_index;
593         unsigned i;
594         const char *prgname = argv[0];
595         static struct option long_option[] = {
596                 {"vm2vm", required_argument, NULL, 0},
597                 {"rx-retry", required_argument, NULL, 0},
598                 {"rx-retry-delay", required_argument, NULL, 0},
599                 {"rx-retry-num", required_argument, NULL, 0},
600                 {"mergeable", required_argument, NULL, 0},
601                 {"vlan-strip", required_argument, NULL, 0},
602                 {"stats", required_argument, NULL, 0},
603                 {"dev-basename", required_argument, NULL, 0},
604                 {"zero-copy", required_argument, NULL, 0},
605                 {"rx-desc-num", required_argument, NULL, 0},
606                 {"tx-desc-num", required_argument, NULL, 0},
607                 {NULL, 0, 0, 0},
608         };
609
610         /* Parse command line */
611         while ((opt = getopt_long(argc, argv, "p:P",
612                         long_option, &option_index)) != EOF) {
613                 switch (opt) {
614                 /* Portmask */
615                 case 'p':
616                         enabled_port_mask = parse_portmask(optarg);
617                         if (enabled_port_mask == (uint32_t)-1) {
618                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
619                                 us_vhost_usage(prgname);
620                                 return -1;
621                         }
622                         break;
623
624                 case 'P':
625                         promiscuous = 1;
626                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
627                                 ETH_VMDQ_ACCEPT_BROADCAST |
628                                 ETH_VMDQ_ACCEPT_MULTICAST;
629                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
630
631                         break;
632
633                 case 0:
634                         /* Enable/disable vm2vm comms. */
635                         if (!strncmp(long_option[option_index].name, "vm2vm",
636                                 MAX_LONG_OPT_SZ)) {
637                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
638                                 if (ret == -1) {
639                                         RTE_LOG(INFO, VHOST_CONFIG,
640                                                 "Invalid argument for "
641                                                 "vm2vm [0|1|2]\n");
642                                         us_vhost_usage(prgname);
643                                         return -1;
644                                 } else {
645                                         vm2vm_mode = (vm2vm_type)ret;
646                                 }
647                         }
648
649                         /* Enable/disable retries on RX. */
650                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
651                                 ret = parse_num_opt(optarg, 1);
652                                 if (ret == -1) {
653                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
654                                         us_vhost_usage(prgname);
655                                         return -1;
656                                 } else {
657                                         enable_retry = ret;
658                                 }
659                         }
660
661                         /* Specify the retry delay time (in microseconds) on RX. */
662                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
663                                 ret = parse_num_opt(optarg, INT32_MAX);
664                                 if (ret == -1) {
665                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
666                                         us_vhost_usage(prgname);
667                                         return -1;
668                                 } else {
669                                         burst_rx_delay_time = ret;
670                                 }
671                         }
672
673                         /* Specify the number of retries on RX. */
674                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
675                                 ret = parse_num_opt(optarg, INT32_MAX);
676                                 if (ret == -1) {
677                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
678                                         us_vhost_usage(prgname);
679                                         return -1;
680                                 } else {
681                                         burst_rx_retry_num = ret;
682                                 }
683                         }
684
685                         /* Enable/disable RX mergeable buffers. */
686                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
687                                 ret = parse_num_opt(optarg, 1);
688                                 if (ret == -1) {
689                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
690                                         us_vhost_usage(prgname);
691                                         return -1;
692                                 } else {
693                                         mergeable = !!ret;
694                                         if (ret) {
695                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
696                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
697                                                         = JUMBO_FRAME_MAX_SIZE;
698                                         }
699                                 }
700                         }
701
702                         /* Enable/disable RX VLAN strip on host. */
703                         if (!strncmp(long_option[option_index].name,
704                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
705                                 ret = parse_num_opt(optarg, 1);
706                                 if (ret == -1) {
707                                         RTE_LOG(INFO, VHOST_CONFIG,
708                                                 "Invalid argument for VLAN strip [0|1]\n");
709                                         us_vhost_usage(prgname);
710                                         return -1;
711                                 } else {
712                                         vlan_strip = !!ret;
713                                         vmdq_conf_default.rxmode.hw_vlan_strip =
714                                                 vlan_strip;
715                                 }
716                         }
717
718                         /* Enable/disable stats. */
719                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
720                                 ret = parse_num_opt(optarg, INT32_MAX);
721                                 if (ret == -1) {
722                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
723                                         us_vhost_usage(prgname);
724                                         return -1;
725                                 } else {
726                                         enable_stats = ret;
727                                 }
728                         }
729
730                         /* Set character device basename. */
731                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
732                                 if (us_vhost_parse_basename(optarg) == -1) {
733                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
734                                         us_vhost_usage(prgname);
735                                         return -1;
736                                 }
737                         }
738
739                         /* Enable/disable rx/tx zero copy. */
740                         if (!strncmp(long_option[option_index].name,
741                                 "zero-copy", MAX_LONG_OPT_SZ)) {
742                                 ret = parse_num_opt(optarg, 1);
743                                 if (ret == -1) {
744                                         RTE_LOG(INFO, VHOST_CONFIG,
745                                                 "Invalid argument"
746                                                 " for zero-copy [0|1]\n");
747                                         us_vhost_usage(prgname);
748                                         return -1;
749                                 } else
750                                         zero_copy = ret;
751                         }
752
753                         /* Specify the descriptor number on RX. */
754                         if (!strncmp(long_option[option_index].name,
755                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
756                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
757                                 if ((ret == -1) || (!POWEROF2(ret))) {
758                                         RTE_LOG(INFO, VHOST_CONFIG,
759                                         "Invalid argument for rx-desc-num[0-N],"
760                                         "power of 2 required.\n");
761                                         us_vhost_usage(prgname);
762                                         return -1;
763                                 } else {
764                                         num_rx_descriptor = ret;
765                                 }
766                         }
767
768                         /* Specify the descriptor number on TX. */
769                         if (!strncmp(long_option[option_index].name,
770                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
771                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
772                                 if ((ret == -1) || (!POWEROF2(ret))) {
773                                         RTE_LOG(INFO, VHOST_CONFIG,
774                                         "Invalid argument for tx-desc-num [0-N],"
775                                         "power of 2 required.\n");
776                                         us_vhost_usage(prgname);
777                                         return -1;
778                                 } else {
779                                         num_tx_descriptor = ret;
780                                 }
781                         }
782
783                         break;
784
785                         /* Invalid option - print options. */
786                 default:
787                         us_vhost_usage(prgname);
788                         return -1;
789                 }
790         }
791
792         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
793                 if (enabled_port_mask & (1 << i))
794                         ports[num_ports++] = (uint8_t)i;
795         }
796
797         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
798                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
799                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
800                 return -1;
801         }
802
803         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
804                 RTE_LOG(INFO, VHOST_PORT,
805                         "Vhost zero copy doesn't support software vm2vm, "
806                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
807                 return -1;
808         }
809
810         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
811                 RTE_LOG(INFO, VHOST_PORT,
812                         "Vhost zero copy doesn't support jumbo frames, "
813                         "please specify '--mergeable 0' to disable the "
814                         "mergeable feature.\n");
815                 return -1;
816         }
817
818         return 0;
819 }
820
821 /*
822  * Update the global variable num_ports and the ports array according to the
823  * number of system ports, and return the number of valid ports.
824  */
825 static unsigned check_ports_num(unsigned nb_ports)
826 {
827         unsigned valid_num_ports = num_ports;
828         unsigned portid;
829
830         if (num_ports > nb_ports) {
831                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
832                         num_ports, nb_ports);
833                 num_ports = nb_ports;
834         }
835
836         for (portid = 0; portid < num_ports; portid ++) {
837                 if (ports[portid] >= nb_ports) {
838                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
839                                 ports[portid], (nb_ports - 1));
840                         ports[portid] = INVALID_PORT_ID;
841                         valid_num_ports--;
842                 }
843         }
844         return valid_num_ports;
845 }
846
847 /*
848  * Macro to print out packet contents. Wrapped in debug define so that the
849  * data path is not affected when debug is disabled.
850  */
851 #ifdef DEBUG
852 #define PRINT_PACKET(device, addr, size, header) do {                  \
853         char *pkt_addr = (char *)(addr);                                \
854         unsigned int index;                                             \
855         char packet[MAX_PRINT_BUFF];                                    \
856                                                                         \
857         if ((header))                                                   \
858                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
859         else                                                            \
860                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
861         for (index = 0; index < (size); index++) {                      \
862                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
863                         "%02hhx ", pkt_addr[index]);                    \
864         }                                                               \
865         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
866                                                                         \
867         LOG_DEBUG(VHOST_DATA, "%s", packet);                            \
868 } while (0)
869 #else
870 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
871 #endif
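/*
 * A usage sketch (variable names are assumed, for illustration only): with
 * -DDEBUG defined, a buffer can be dumped while tracing the data path:
 *
 *     PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
 */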
872
873 /*
874  * Function to convert guest physical addresses to vhost physical addresses.
875  * This is used to convert virtio buffer addresses.
876  */
877 static inline uint64_t __attribute__((always_inline))
878 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
879         uint32_t buf_len, hpa_type *addr_type)
880 {
881         struct virtio_memory_regions_hpa *region;
882         uint32_t regionidx;
883         uint64_t vhost_pa = 0;
884
885         *addr_type = PHYS_ADDR_INVALID;
886
887         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
888                 region = &vdev->regions_hpa[regionidx];
889                 if ((guest_pa >= region->guest_phys_address) &&
890                         (guest_pa <= region->guest_phys_address_end)) {
891                         vhost_pa = region->host_phys_addr_offset + guest_pa;
892                         if (likely((guest_pa + buf_len - 1)
893                                 <= region->guest_phys_address_end))
894                                 *addr_type = PHYS_ADDR_CONTINUOUS;
895                         else
896                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
897                         break;
898                 }
899         }
900
901         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
902                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
903                 (void *)(uintptr_t)vhost_pa);
904
905         return vhost_pa;
906 }
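/*
 * A typical call might look as follows (variable names assumed, sketch
 * only): translate a guest buffer and fall back to copying when it is not
 * physically contiguous:
 *
 *     hpa_type addr_type;
 *     uint64_t hpa = gpa_to_hpa(vdev, guest_pa, buf_len, &addr_type);
 *     if (addr_type != PHYS_ADDR_CONTINUOUS) {
 *             // buffer crosses a sub-region boundary or is invalid;
 *             // it cannot be handed to the NIC as a single DMA target
 *     }
 */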
907
908 /*
909  * Compares a packet destination MAC address to a device MAC address.
910  */
911 static inline int __attribute__((always_inline))
912 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
913 {
914         return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
915 }
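/*
 * Both addresses are read as 64-bit words; the XOR leaves set bits wherever
 * the operands differ, and MAC_ADDR_CMP (0x0000FFFFFFFFFFFF) masks off the
 * two bytes read past the 6-byte address (assuming a little-endian host),
 * so only the MAC itself is compared.
 */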
916
917 /*
918  * This function learns the MAC address of the device and registers this along with a
919  * vlan tag to a VMDQ.
920  */
921 static int
922 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
923 {
924         struct ether_hdr *pkt_hdr;
925         struct virtio_net_data_ll *dev_ll;
926         struct virtio_net *dev = vdev->dev;
927         int i, ret;
928
929         /* Learn MAC address of guest device from packet */
930         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
931
932         dev_ll = ll_root_used;
933
934         while (dev_ll != NULL) {
935                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
936                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
937                         return -1;
938                 }
939                 dev_ll = dev_ll->next;
940         }
941
942         for (i = 0; i < ETHER_ADDR_LEN; i++)
943                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
944
945         /* vlan_tag currently uses the device_id. */
946         vdev->vlan_tag = vlan_tags[dev->device_fh];
947
948         /* Print out VMDQ registration info. */
949         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
950                 dev->device_fh,
951                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
952                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
953                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
954                 vdev->vlan_tag);
955
956         /* Register the MAC address. */
957         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
958                                 (uint32_t)dev->device_fh + vmdq_pool_base);
959         if (ret)
960                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
961                                         dev->device_fh);
962
963         /* Enable stripping of the vlan tag as we handle routing. */
964         if (vlan_strip)
965                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
966                         (uint16_t)vdev->vmdq_rx_q, 1);
967
968         /* Set device as ready for RX. */
969         vdev->ready = DEVICE_RX;
970
971         return 0;
972 }
973
974 /*
975  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
976  * queue before disabling RX on the device.
977  */
978 static inline void
979 unlink_vmdq(struct vhost_dev *vdev)
980 {
981         unsigned i = 0;
982         unsigned rx_count;
983         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
984
985         if (vdev->ready == DEVICE_RX) {
986                 /*clear MAC and VLAN settings*/
987                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
988                 for (i = 0; i < 6; i++)
989                         vdev->mac_address.addr_bytes[i] = 0;
990
991                 vdev->vlan_tag = 0;
992
993                 /*Clear out the receive buffers*/
994                 rx_count = rte_eth_rx_burst(ports[0],
995                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
996
997                 while (rx_count) {
998                         for (i = 0; i < rx_count; i++)
999                                 rte_pktmbuf_free(pkts_burst[i]);
1000
1001                         rx_count = rte_eth_rx_burst(ports[0],
1002                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1003                 }
1004
1005                 vdev->ready = DEVICE_MAC_LEARNING;
1006         }
1007 }
1008
1009 /*
1010  * Check if the packet destination MAC address is for a local device. If so then put
1011  * the packet on that devices RX queue. If not then return.
1012  */
1013 static inline int __attribute__((always_inline))
1014 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1015 {
1016         struct virtio_net_data_ll *dev_ll;
1017         struct ether_hdr *pkt_hdr;
1018         uint64_t ret = 0;
1019         struct virtio_net *dev = vdev->dev;
1020         struct virtio_net *tdev; /* destination virtio device */
1021
1022         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1023
1024         /*get the used devices list*/
1025         dev_ll = ll_root_used;
1026
1027         while (dev_ll != NULL) {
1028                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1029                                           &dev_ll->vdev->mac_address)) {
1030
1031                         /* Drop the packet if the TX packet is destined for the TX device. */
1032                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1033                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1034                                                         dev->device_fh);
1035                                 return 0;
1036                         }
1037                         tdev = dev_ll->vdev->dev;
1038
1039
1040                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1041
1042                         if (unlikely(dev_ll->vdev->remove)) {
1043                                 /*drop the packet if the device is marked for removal*/
1044                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1045                         } else {
1046                                 /*send the packet to the local virtio device*/
1047                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1048                                 if (enable_stats) {
1049                                         rte_atomic64_add(
1050                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1051                                         1);
1052                                         rte_atomic64_add(
1053                                         &dev_statistics[tdev->device_fh].rx_atomic,
1054                                         ret);
1055                                         dev_statistics[dev->device_fh].tx_total++;
1056                                         dev_statistics[dev->device_fh].tx += ret;
1057                                 }
1058                         }
1059
1060                         return 0;
1061                 }
1062                 dev_ll = dev_ll->next;
1063         }
1064
1065         return -1;
1066 }
1067
1068 /*
1069  * Check if the destination MAC of a packet is one local VM,
1070  * and get its vlan tag, and offset if it is.
1071  */
1072 static inline int __attribute__((always_inline))
1073 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1074         uint32_t *offset, uint16_t *vlan_tag)
1075 {
1076         struct virtio_net_data_ll *dev_ll = ll_root_used;
1077         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1078
1079         while (dev_ll != NULL) {
1080                 if ((dev_ll->vdev->ready == DEVICE_RX)
1081                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1082                 &dev_ll->vdev->mac_address)) {
1083                         /*
1084                          * Drop the packet if the TX packet is
1085                          * destined for the TX device.
1086                          */
1087                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1088                                 LOG_DEBUG(VHOST_DATA,
1089                                 "(%"PRIu64") TX: Source and destination"
1090                                 " MAC addresses are the same. Dropping "
1091                                 "packet.\n",
1092                                 dev_ll->vdev->dev->device_fh);
1093                                 return -1;
1094                         }
1095
1096                         /*
1097                          * HW vlan strip will reduce the packet length
1098                          * by the length of the vlan tag, so the packet
1099                          * length needs to be restored by adding it back.
1100                          */
1101                         *offset = VLAN_HLEN;
1102                         *vlan_tag =
1103                         (uint16_t)
1104                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1105
1106                         LOG_DEBUG(VHOST_DATA,
1107                         "(%"PRIu64") TX: pkt to local VM device id:"
1108                         "(%"PRIu64") vlan tag: %d.\n",
1109                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1110                         (int)*vlan_tag);
1111
1112                         break;
1113                 }
1114                 dev_ll = dev_ll->next;
1115         }
1116         return 0;
1117 }
1118
1119 /*
1120  * This function routes the TX packet to the correct interface. This may be a local device
1121  * or the physical port.
1122  */
1123 static inline void __attribute__((always_inline))
1124 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1125 {
1126         struct mbuf_table *tx_q;
1127         struct rte_mbuf **m_table;
1128         unsigned len, ret, offset = 0;
1129         const uint16_t lcore_id = rte_lcore_id();
1130         struct virtio_net *dev = vdev->dev;
1131         struct ether_hdr *nh;
1132
1133         /*check if destination is local VM*/
1134         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1135                 rte_pktmbuf_free(m);
1136                 return;
1137         }
1138
1139         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1140                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1141                         rte_pktmbuf_free(m);
1142                         return;
1143                 }
1144         }
1145
1146         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1147
1148         /*Add packet to the port tx queue*/
1149         tx_q = &lcore_tx_queue[lcore_id];
1150         len = tx_q->len;
1151
1152         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1153         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1154                 /* Guest has inserted the vlan tag. */
1155                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1156                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1157                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1158                         (vh->vlan_tci != vlan_tag_be))
1159                         vh->vlan_tci = vlan_tag_be;
1160         } else {
1161                 m->ol_flags = PKT_TX_VLAN_PKT;
1162
1163                 /*
1164                  * Find the right seg to adjust the data len when offset is
1165                  * bigger than tail room size.
1166                  */
1167                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1168                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1169                                 m->data_len += offset;
1170                         else {
1171                                 struct rte_mbuf *seg = m;
1172
1173                                 while ((seg->next != NULL) &&
1174                                         (offset > rte_pktmbuf_tailroom(seg)))
1175                                         seg = seg->next;
1176
1177                                 seg->data_len += offset;
1178                         }
1179                         m->pkt_len += offset;
1180                 }
1181
1182                 m->vlan_tci = vlan_tag;
1183         }
1184
1185         tx_q->m_table[len] = m;
1186         len++;
1187         if (enable_stats) {
1188                 dev_statistics[dev->device_fh].tx_total++;
1189                 dev_statistics[dev->device_fh].tx++;
1190         }
1191
1192         if (unlikely(len == MAX_PKT_BURST)) {
1193                 m_table = (struct rte_mbuf **)tx_q->m_table;
1194                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1195                 /* Free any buffers not handled by TX and update the port stats. */
1196                 if (unlikely(ret < len)) {
1197                         do {
1198                                 rte_pktmbuf_free(m_table[ret]);
1199                         } while (++ret < len);
1200                 }
1201
1202                 len = 0;
1203         }
1204
1205         tx_q->len = len;
1206         return;
1207 }
1208 /*
1209  * This function is called by each data core. It handles all RX/TX registered with the
1210  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1211  * with all devices in the main linked list.
1212  */
1213 static int
1214 switch_worker(void *arg)
1215 {
1216         struct rte_mempool *mbuf_pool = arg;
1217         struct virtio_net *dev = NULL;
1218         struct vhost_dev *vdev = NULL;
1219         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1220         struct virtio_net_data_ll *dev_ll;
1221         struct mbuf_table *tx_q;
1222         volatile struct lcore_ll_info *lcore_ll;
1223         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1224         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1225         unsigned ret, i;
1226         const uint16_t lcore_id = rte_lcore_id();
1227         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1228         uint16_t rx_count = 0;
1229         uint16_t tx_count;
1230         uint32_t retry = 0;
1231
1232         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1233         lcore_ll = lcore_info[lcore_id].lcore_ll;
1234         prev_tsc = 0;
1235
1236         tx_q = &lcore_tx_queue[lcore_id];
1237         for (i = 0; i < num_cores; i ++) {
1238                 if (lcore_ids[i] == lcore_id) {
1239                         tx_q->txq_id = i;
1240                         break;
1241                 }
1242         }
1243
1244         while(1) {
1245                 cur_tsc = rte_rdtsc();
1246                 /*
1247                  * TX burst queue drain
1248                  */
1249                 diff_tsc = cur_tsc - prev_tsc;
1250                 if (unlikely(diff_tsc > drain_tsc)) {
1251
1252                         if (tx_q->len) {
1253                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1254
1255                                 /*Tx any packets in the queue*/
1256                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1257                                                                            (struct rte_mbuf **)tx_q->m_table,
1258                                                                            (uint16_t)tx_q->len);
1259                                 if (unlikely(ret < tx_q->len)) {
1260                                         do {
1261                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1262                                         } while (++ret < tx_q->len);
1263                                 }
1264
1265                                 tx_q->len = 0;
1266                         }
1267
1268                         prev_tsc = cur_tsc;
1269
1270                 }
1271
1272                 rte_prefetch0(lcore_ll->ll_root_used);
1273                 /*
1274                  * Inform the configuration core that we have exited the linked list and that no devices are
1275                  * in use if requested.
1276                  */
1277                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1278                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1279
1280                 /*
1281                  * Process devices
1282                  */
1283                 dev_ll = lcore_ll->ll_root_used;
1284
1285                 while (dev_ll != NULL) {
1286                         /*get virtio device ID*/
1287                         vdev = dev_ll->vdev;
1288                         dev = vdev->dev;
1289
1290                         if (unlikely(vdev->remove)) {
1291                                 dev_ll = dev_ll->next;
1292                                 unlink_vmdq(vdev);
1293                                 vdev->ready = DEVICE_SAFE_REMOVE;
1294                                 continue;
1295                         }
1296                         if (likely(vdev->ready == DEVICE_RX)) {
1297                                 /*Handle guest RX*/
1298                                 rx_count = rte_eth_rx_burst(ports[0],
1299                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1300
1301                                 if (rx_count) {
1302                                         /*
1303                                          * If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1304                                          * Note that MAX_PKT_BURST must be less than the virtio queue size.
1305                                         */
1306                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1307                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1308                                                         rte_delay_us(burst_rx_delay_time);
1309                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1310                                                                 break;
1311                                                 }
1312                                         }
1313                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1314                                         if (enable_stats) {
1315                                                 rte_atomic64_add(
1316                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1317                                                 rx_count);
1318                                                 rte_atomic64_add(
1319                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1320                                         }
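                                        /*
                                         * rte_vhost_enqueue_burst() copied the
                                         * packets into the guest's buffers, so
                                         * the host mbufs can be freed now.
                                         */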
1321                                         while (likely(rx_count)) {
1322                                                 rx_count--;
1323                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1324                                         }
1325
1326                                 }
1327                         }
1328
1329                         if (likely(!vdev->remove)) {
1330                                 /* Handle guest TX*/
1331                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1332                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1333                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1334                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1335                                                 while (tx_count)
1336                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1337                                         }
1338                                 }
1339                                 while (tx_count)
1340                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1341                         }
1342
1343                         /*move to the next device in the list*/
1344                         dev_ll = dev_ll->next;
1345                 }
1346         }
1347
1348         return 0;
1349 }
1350
1351 /*
1352  * This function gets the number of available ring entries for zero copy RX.
1353  * Only one thread will call this function for a particular virtio device,
1354  * so it is designed as a non-thread-safe function.
1355  */
1356 static inline uint32_t __attribute__((always_inline))
1357 get_available_ring_num_zcp(struct virtio_net *dev)
1358 {
1359         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1360         uint16_t avail_idx;
1361
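        /*
         * avail->idx is a free-running 16-bit counter advanced by the guest;
         * entries between last_used_idx_res and avail_idx are available.
         */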
1362         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1363         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1364 }
1365
1366 /*
1367  * This function gets available ring indexes for zero copy RX;
1368  * it will retry 'burst_rx_retry_num' times until it gets enough of them.
1369  * Only one thread will call this function for a particular virtio device,
1370  * so it is designed as a non-thread-safe function.
1371  */
1372 static inline uint32_t __attribute__((always_inline))
1373 get_available_ring_index_zcp(struct virtio_net *dev,
1374         uint16_t *res_base_idx, uint32_t count)
1375 {
1376         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1377         uint16_t avail_idx;
1378         uint32_t retry = 0;
1379         uint16_t free_entries;
1380
1381         *res_base_idx = vq->last_used_idx_res;
1382         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1383         free_entries = (avail_idx - *res_base_idx);
1384
1385         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1386                         "avail idx: %d, "
1387                         "res base idx:%d, free entries:%d\n",
1388                         dev->device_fh, avail_idx, *res_base_idx,
1389                         free_entries);
1390
1391         /*
1392          * If retry is enabled and the queue is full then we wait
1393          * and retry to avoid packet loss.
1394          */
1395         if (enable_retry && unlikely(count > free_entries)) {
1396                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1397                         rte_delay_us(burst_rx_delay_time);
1398                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1399                         free_entries = (avail_idx - *res_base_idx);
1400                         if (count <= free_entries)
1401                                 break;
1402                 }
1403         }
1404
1405         /*check that we have enough buffers*/
1406         if (unlikely(count > free_entries))
1407                 count = free_entries;
1408
1409         if (unlikely(count == 0)) {
1410                 LOG_DEBUG(VHOST_DATA,
1411                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1412                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1413                         dev->device_fh, avail_idx,
1414                         *res_base_idx, free_entries);
1415                 return 0;
1416         }
1417
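        /*
         * Reserve the entries: the caller now owns ring slots
         * [*res_base_idx, *res_base_idx + count).
         */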
1418         vq->last_used_idx_res = *res_base_idx + count;
1419
1420         return count;
1421 }
1422
1423 /*
1424  * This function puts a descriptor back on the used list.
1425  */
1426 static inline void __attribute__((always_inline))
1427 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1428 {
1429         uint16_t res_cur_idx = vq->last_used_idx;
1430         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1431         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
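        /*
         * Ensure the used ring entry is fully written before the used index
         * update below becomes visible to the guest.
         */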
1432         rte_compiler_barrier();
1433         *(volatile uint16_t *)&vq->used->idx += 1;
1434         vq->last_used_idx += 1;
1435
1436         /* Kick the guest if necessary. */
1437         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1438                 eventfd_write(vq->callfd, (eventfd_t)1);
1439 }
1440
1441 /*
1442  * This function gets an available descriptor from the virtio vring and an
1443  * unattached mbuf from vpool->ring, then attaches them together. It must
1444  * adjust the offset of buff_addr and phys_addr according to the PMD
1445  * implementation, otherwise the frame data may land in the wrong place in the mbuf.
1446  */
1447 static inline void __attribute__((always_inline))
1448 attach_rxmbuf_zcp(struct virtio_net *dev)
1449 {
1450         uint16_t res_base_idx, desc_idx;
1451         uint64_t buff_addr, phys_addr;
1452         struct vhost_virtqueue *vq;
1453         struct vring_desc *desc;
1454         void *obj = NULL;
1455         struct rte_mbuf *mbuf;
1456         struct vpool *vpool;
1457         hpa_type addr_type;
1458         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1459
1460         vpool = &vpool_array[vdev->vmdq_rx_q];
1461         vq = dev->virtqueue[VIRTIO_RXQ];
1462
1463         do {
1464                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1465                                 1) != 1))
1466                         return;
1467                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1468
1469                 desc = &vq->desc[desc_idx];
1470                 if (desc->flags & VRING_DESC_F_NEXT) {
1471                         desc = &vq->desc[desc->next];
1472                         buff_addr = gpa_to_vva(dev, desc->addr);
1473                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1474                                         &addr_type);
1475                 } else {
1476                         buff_addr = gpa_to_vva(dev,
1477                                         desc->addr + vq->vhost_hlen);
1478                         phys_addr = gpa_to_hpa(vdev,
1479                                         desc->addr + vq->vhost_hlen,
1480                                         desc->len, &addr_type);
1481                 }
1482
1483                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1484                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1485                                 " address found when attaching RX frame buffer"
1486                                 " address!\n", dev->device_fh);
1487                         put_desc_to_used_list_zcp(vq, desc_idx);
1488                         continue;
1489                 }
1490
1491                 /*
1492                  * Check if the frame buffer address from guest crosses
1493                  * sub-region or not.
1494                  */
1495                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1496                         RTE_LOG(ERR, VHOST_DATA,
1497                                 "(%"PRIu64") Frame buffer address cross "
1498                                 "sub-region found when attaching RX frame "
1499                                 "buffer address!\n",
1500                                 dev->device_fh);
1501                         put_desc_to_used_list_zcp(vq, desc_idx);
1502                         continue;
1503                 }
1504         } while (unlikely(phys_addr == 0));
1505
1506         rte_ring_sc_dequeue(vpool->ring, &obj);
1507         mbuf = obj;
1508         if (unlikely(mbuf == NULL)) {
1509                 LOG_DEBUG(VHOST_DATA,
1510                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1511                         "ring_sc_dequeue fail.\n",
1512                         dev->device_fh);
1513                 put_desc_to_used_list_zcp(vq, desc_idx);
1514                 return;
1515         }
1516
1517         if (unlikely(vpool->buf_size > desc->len)) {
1518                 LOG_DEBUG(VHOST_DATA,
1519                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1520                         "length(%d) of descriptor idx: %d less than room "
1521                         "size required: %d\n",
1522                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1523                 put_desc_to_used_list_zcp(vq, desc_idx);
1524                 rte_ring_sp_enqueue(vpool->ring, obj);
1525                 return;
1526         }
1527
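        /*
         * Attach the guest buffer to the mbuf: step back by the headroom so
         * that data_off lands exactly on the guest frame data, preserving the
         * usual PMD headroom layout.
         */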
1528         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1529         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1530         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1531         mbuf->data_len = desc->len;
1532         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1533
1534         LOG_DEBUG(VHOST_DATA,
1535                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1536                 "descriptor idx:%d\n",
1537                 dev->device_fh, res_base_idx, desc_idx);
1538
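        /*
         * Return the attached mbuf to its mempool; in the zero copy setup the
         * PMD RX queue allocates from vpool->pool, so received frames are
         * placed straight into the guest buffer attached above.
         */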
1539         __rte_mbuf_raw_free(mbuf);
1540
1541         return;
1542 }
1543
1544 /*
1545  * Detach an attached packet mbuf -
1546  *  - restore original mbuf address and length values.
1547  *  - reset pktmbuf data and data_len to their default values.
1548  *  All other fields of the given packet mbuf will be left intact.
1549  *
1550  * @param m
1551  *   The attached packet mbuf.
1552  */
1553 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1554 {
1555         const struct rte_mempool *mp = m->pool;
1556         void *buf = rte_mbuf_to_baddr(m);
1557         uint32_t buf_ofs;
1558         uint32_t buf_len = mp->elt_size - sizeof(*m);
1559         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1560
1561         m->buf_addr = buf;
1562         m->buf_len = (uint16_t)buf_len;
1563
1564         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1565                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1566         m->data_off = buf_ofs;
1567
1568         m->data_len = 0;
1569 }
1570
1571 /*
1572  * This function is called after packets have been transmitted. It fetches mbufs
1573  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1574  * the used index and kicks the guest if necessary.
1575  */
1576 static inline uint32_t __attribute__((always_inline))
1577 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1578 {
1579         struct rte_mbuf *mbuf;
1580         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1581         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1582         uint32_t index = 0;
1583         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1584
1585         LOG_DEBUG(VHOST_DATA,
1586                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1587                 "clean is: %d\n",
1588                 dev->device_fh, mbuf_count);
1589         LOG_DEBUG(VHOST_DATA,
1590                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1591                 "clean  is : %d\n",
1592                 dev->device_fh, rte_ring_count(vpool->ring));
1593
1594         for (index = 0; index < mbuf_count; index++) {
1595                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1596                 if (likely(MBUF_EXT_MEM(mbuf)))
1597                         pktmbuf_detach_zcp(mbuf);
1598                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1599
1600                 /* Update used index buffer information. */
1601                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1602                 vq->used->ring[used_idx].len = 0;
1603
1604                 used_idx = (used_idx + 1) & (vq->size - 1);
1605         }
1606
1607         LOG_DEBUG(VHOST_DATA,
1608                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1609                 "clean is: %d\n",
1610                 dev->device_fh, rte_mempool_count(vpool->pool));
1611         LOG_DEBUG(VHOST_DATA,
1612                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1613                 "clean  is : %d\n",
1614                 dev->device_fh, rte_ring_count(vpool->ring));
1615         LOG_DEBUG(VHOST_DATA,
1616                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1617                 "vq->last_used_idx:%d\n",
1618                 dev->device_fh, vq->last_used_idx);
1619
1620         vq->last_used_idx += mbuf_count;
1621
1622         LOG_DEBUG(VHOST_DATA,
1623                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1624                 "vq->last_used_idx:%d\n",
1625                 dev->device_fh, vq->last_used_idx);
1626
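        /* Order the used ring writes before the used index update below. */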
1627         rte_compiler_barrier();
1628
1629         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1630
1631         /* Kick guest if required. */
1632         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1633                 eventfd_write(vq->callfd, (eventfd_t)1);
1634
1635         return 0;
1636 }
1637
1638 /*
1639  * This function is called when a virtio device is destroyed.
1640  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1641  */
1642 static void mbuf_destroy_zcp(struct vpool *vpool)
1643 {
1644         struct rte_mbuf *mbuf = NULL;
1645         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1646
1647         LOG_DEBUG(VHOST_CONFIG,
1648                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1649                 "mbuf_destroy_zcp is: %d\n",
1650                 mbuf_count);
1651         LOG_DEBUG(VHOST_CONFIG,
1652                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1653                 "mbuf_destroy_zcp  is : %d\n",
1654                 rte_ring_count(vpool->ring));
1655
1656         for (index = 0; index < mbuf_count; index++) {
1657                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1658                 if (likely(mbuf != NULL)) {
1659                         if (likely(MBUF_EXT_MEM(mbuf)))
1660                                 pktmbuf_detach_zcp(mbuf);
1661                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1662                 }
1663         }
1664
1665         LOG_DEBUG(VHOST_CONFIG,
1666                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1667                 "mbuf_destroy_zcp is: %d\n",
1668                 rte_mempool_count(vpool->pool));
1669         LOG_DEBUG(VHOST_CONFIG,
1670                 "in mbuf_destroy_zcp: mbuf count in ring after "
1671                 "mbuf_destroy_zcp is : %d\n",
1672                 rte_ring_count(vpool->ring));
1673 }
1674
1675 /*
1676  * This function updates the used ring and its index for zero copy RX.
1677  */
1678 static inline uint32_t __attribute__((always_inline))
1679 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1680         uint32_t count)
1681 {
1682         struct vhost_virtqueue *vq;
1683         struct vring_desc *desc;
1684         struct rte_mbuf *buff;
1685         /* The virtio_hdr is initialised to 0. */
1686         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1687                 = {{0, 0, 0, 0, 0, 0}, 0};
1688         uint64_t buff_hdr_addr = 0;
1689         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1690         uint32_t head_idx, packet_success = 0;
1691         uint16_t res_cur_idx;
1692
1693         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1694
1695         if (count == 0)
1696                 return 0;
1697
1698         vq = dev->virtqueue[VIRTIO_RXQ];
1699         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1700
1701         res_cur_idx = vq->last_used_idx;
1702         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1703                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1704
1705         /* Retrieve all of the head indexes first to avoid caching issues. */
1706         for (head_idx = 0; head_idx < count; head_idx++)
1707                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1708
1709         /*Prefetch descriptor index. */
1710         rte_prefetch0(&vq->desc[head[packet_success]]);
1711
1712         while (packet_success != count) {
1713                 /* Get descriptor from available ring */
1714                 desc = &vq->desc[head[packet_success]];
1715
1716                 buff = pkts[packet_success];
1717                 LOG_DEBUG(VHOST_DATA,
1718                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1719                         "pkt[%d] descriptor idx: %d\n",
1720                         dev->device_fh, packet_success,
1721                         MBUF_HEADROOM_UINT32(buff));
1722
1723                 PRINT_PACKET(dev,
1724                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1725                         + RTE_PKTMBUF_HEADROOM),
1726                         rte_pktmbuf_data_len(buff), 0);
1727
1728                 /* Buffer address translation for virtio header. */
1729                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1730                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1731
1732                 /*
1733                  * If the descriptors are chained the header and data are
1734                  * placed in separate buffers.
1735                  */
1736                 if (desc->flags & VRING_DESC_F_NEXT) {
1737                         desc->len = vq->vhost_hlen;
1738                         desc = &vq->desc[desc->next];
1739                         desc->len = rte_pktmbuf_data_len(buff);
1740                 } else {
1741                         desc->len = packet_len;
1742                 }
1743
1744                 /* Update used ring with desc information */
1745                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1746                         = head[packet_success];
1747                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1748                         = packet_len;
1749                 res_cur_idx++;
1750                 packet_success++;
1751
1752                 /* A header is required per buffer. */
1753                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1754                         (const void *)&virtio_hdr, vq->vhost_hlen);
1755
1756                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1757
1758                 if (likely(packet_success < count)) {
1759                         /* Prefetch descriptor index. */
1760                         rte_prefetch0(&vq->desc[head[packet_success]]);
1761                 }
1762         }
1763
1764         rte_compiler_barrier();
1765
1766         LOG_DEBUG(VHOST_DATA,
1767                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1768                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1769                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1770
1771         *(volatile uint16_t *)&vq->used->idx += count;
1772         vq->last_used_idx += count;
1773
1774         LOG_DEBUG(VHOST_DATA,
1775                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1776                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1777                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1778
1779         /* Kick the guest if necessary. */
1780         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1781                 eventfd_write(vq->callfd, (eventfd_t)1);
1782
1783         return count;
1784 }
1785
1786 /*
1787  * This function routes the TX packet to the correct interface.
1788  * This may be a local device or the physical port.
1789  */
1790 static inline void __attribute__((always_inline))
1791 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1792         uint32_t desc_idx, uint8_t need_copy)
1793 {
1794         struct mbuf_table *tx_q;
1795         struct rte_mbuf **m_table;
1796         void *obj = NULL;
1797         struct rte_mbuf *mbuf;
1798         unsigned len, ret, offset = 0;
1799         struct vpool *vpool;
1800         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1801         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1802
1803         /*Add packet to the port tx queue*/
1804         tx_q = &tx_queue_zcp[vmdq_rx_q];
1805         len = tx_q->len;
1806
1807         /* Allocate an mbuf and populate the structure. */
1808         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1809         rte_ring_sc_dequeue(vpool->ring, &obj);
1810         mbuf = obj;
1811         if (unlikely(mbuf == NULL)) {
1812                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1813                 RTE_LOG(ERR, VHOST_DATA,
1814                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1815                         dev->device_fh);
1816                 put_desc_to_used_list_zcp(vq, desc_idx);
1817                 return;
1818         }
1819
1820         if (vm2vm_mode == VM2VM_HARDWARE) {
1821                 /* Avoid using a vlan tag from any vm for an external pkt, such as
1822                  * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1823                  * selection: the MAC address marks it as an external pkt that
1824                  * should go to the network, while the vlan tag marks it as a
1825                  * vm2vm pkt that should be forwarded to another vm. The hardware
1826                  * cannot resolve such an ambiguity, so the pkt would be lost.
1827                  */
1828                 vlan_tag = external_pkt_default_vlan_tag;
1829                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1830                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1831                         __rte_mbuf_raw_free(mbuf);
1832                         return;
1833                 }
1834         }
1835
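        /*
         * Populate the TX mbuf from the dummy guest mbuf; unless a copy is
         * forced, only the buffer pointers are shared and no frame data is
         * copied.
         */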
1836         mbuf->nb_segs = m->nb_segs;
1837         mbuf->next = m->next;
1838         mbuf->data_len = m->data_len + offset;
1839         mbuf->pkt_len = mbuf->data_len;
1840         if (unlikely(need_copy)) {
1841                 /* Copy the packet contents to the mbuf. */
1842                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1843                         rte_pktmbuf_mtod(m, void *),
1844                         m->data_len);
1845         } else {
1846                 mbuf->data_off = m->data_off;
1847                 mbuf->buf_physaddr = m->buf_physaddr;
1848                 mbuf->buf_addr = m->buf_addr;
1849         }
1850         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1851         mbuf->vlan_tci = vlan_tag;
1852         mbuf->l2_len = sizeof(struct ether_hdr);
1853         mbuf->l3_len = sizeof(struct ipv4_hdr);
1854         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1855
1856         tx_q->m_table[len] = mbuf;
1857         len++;
1858
1859         LOG_DEBUG(VHOST_DATA,
1860                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1861                 dev->device_fh,
1862                 mbuf->nb_segs,
1863                 (mbuf->next == NULL) ? "null" : "non-null");
1864
1865         if (enable_stats) {
1866                 dev_statistics[dev->device_fh].tx_total++;
1867                 dev_statistics[dev->device_fh].tx++;
1868         }
1869
1870         if (unlikely(len == MAX_PKT_BURST)) {
1871                 m_table = (struct rte_mbuf **)tx_q->m_table;
1872                 ret = rte_eth_tx_burst(ports[0],
1873                         (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
1874
1875                 /*
1876                  * Free any buffers not handled by TX and update
1877                  * the port stats.
1878                  */
1879                 if (unlikely(ret < len)) {
1880                         do {
1881                                 rte_pktmbuf_free(m_table[ret]);
1882                         } while (++ret < len);
1883                 }
1884
1885                 len = 0;
1886                 txmbuf_clean_zcp(dev, vpool);
1887         }
1888
1889         tx_q->len = len;
1890
1891         return;
1892 }
1893
1894 /*
1895  * This function transmits all available packets in the virtio TX queue for one
1896  * virtio-net device. If it is the first packet, it learns the MAC address and
1897  * sets up VMDQ.
1898  */
1899 static inline void __attribute__((always_inline))
1900 virtio_dev_tx_zcp(struct virtio_net *dev)
1901 {
1902         struct rte_mbuf m;
1903         struct vhost_virtqueue *vq;
1904         struct vring_desc *desc;
1905         uint64_t buff_addr = 0, phys_addr;
1906         uint32_t head[MAX_PKT_BURST];
1907         uint32_t i;
1908         uint16_t free_entries, packet_success = 0;
1909         uint16_t avail_idx;
1910         uint8_t need_copy = 0;
1911         hpa_type addr_type;
1912         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1913
1914         vq = dev->virtqueue[VIRTIO_TXQ];
1915         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1916
1917         /* If there are no available buffers then return. */
1918         if (vq->last_used_idx_res == avail_idx)
1919                 return;
1920
1921         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1922
1923         /* Prefetch available ring to retrieve head indexes. */
1924         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1925
1926         /* Get the number of free entries in the ring */
1927         free_entries = (avail_idx - vq->last_used_idx_res);
1928
1929         /* Limit to MAX_PKT_BURST. */
1930         free_entries
1931                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1932
1933         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1934                 dev->device_fh, free_entries);
1935
1936         /* Retrieve all of the head indexes first to avoid caching issues. */
1937         for (i = 0; i < free_entries; i++)
1938                 head[i]
1939                         = vq->avail->ring[(vq->last_used_idx_res + i)
1940                         & (vq->size - 1)];
1941
1942         vq->last_used_idx_res += free_entries;
1943
1944         /* Prefetch descriptor index. */
1945         rte_prefetch0(&vq->desc[head[packet_success]]);
1946         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1947
1948         while (packet_success < free_entries) {
1949                 desc = &vq->desc[head[packet_success]];
1950
1951                 /* Discard first buffer as it is the virtio header */
1952                 desc = &vq->desc[desc->next];
1953
1954                 /* Buffer address translation. */
1955                 buff_addr = gpa_to_vva(dev, desc->addr);
1956                 /* Need check extra VLAN_HLEN size for inserting VLAN tag */
1957                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1958                         &addr_type);
1959
1960                 if (likely(packet_success < (free_entries - 1)))
1961                         /* Prefetch descriptor index. */
1962                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1963
1964                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1965                         RTE_LOG(ERR, VHOST_DATA,
1966                                 "(%"PRIu64") Invalid frame buffer address found "
1967                                 "when transmitting packets!\n",
1968                                 dev->device_fh);
1969                         packet_success++;
1970                         continue;
1971                 }
1972
1973                 /* Prefetch buffer address. */
1974                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1975
1976                 /*
1977                  * Setup dummy mbuf. This is copied to a real mbuf if
1978                  * transmitted out the physical port.
1979                  */
1980                 m.data_len = desc->len;
1981                 m.nb_segs = 1;
1982                 m.next = NULL;
1983                 m.data_off = 0;
1984                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1985                 m.buf_physaddr = phys_addr;
1986
1987                 /*
1988                  * Check if the frame buffer address from guest crosses
1989                  * sub-region or not.
1990                  */
1991                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1992                         RTE_LOG(ERR, VHOST_DATA,
1993                                 "(%"PRIu64") Frame buffer address cross "
1994                                 "sub-regioin found when attaching TX frame "
1995                                 "sub-region found when attaching TX frame "
1996                                 dev->device_fh);
1997                         need_copy = 1;
1998                 } else
1999                         need_copy = 0;
2000
2001                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2002
2003                 /*
2004                  * If this is the first received packet we need to learn
2005                  * the MAC and setup VMDQ
2006                  */
2007                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2008                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2009                                 /*
2010                                  * Discard frame if device is scheduled for
2011                                  * removal or a duplicate MAC address is found.
2012                                  */
2013                                 packet_success += free_entries;
2014                                 vq->last_used_idx += packet_success;
2015                                 break;
2016                         }
2017                 }
2018
2019                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2020                 packet_success++;
2021         }
2022 }
2023
2024 /*
2025  * This function is called by each data core. It handles all RX/TX registered
2026  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2027  * addresses are compared with all devices in the main linked list.
2028  */
2029 static int
2030 switch_worker_zcp(__attribute__((unused)) void *arg)
2031 {
2032         struct virtio_net *dev = NULL;
2033         struct vhost_dev  *vdev = NULL;
2034         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2035         struct virtio_net_data_ll *dev_ll;
2036         struct mbuf_table *tx_q;
2037         volatile struct lcore_ll_info *lcore_ll;
2038         const uint64_t drain_tsc
2039                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2040                 * BURST_TX_DRAIN_US;
2041         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2042         unsigned ret;
2043         const uint16_t lcore_id = rte_lcore_id();
2044         uint16_t count_in_ring, rx_count = 0;
2045
2046         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2047
2048         lcore_ll = lcore_info[lcore_id].lcore_ll;
2049         prev_tsc = 0;
2050
2051         while (1) {
2052                 cur_tsc = rte_rdtsc();
2053
2054                 /* TX burst queue drain */
2055                 diff_tsc = cur_tsc - prev_tsc;
2056                 if (unlikely(diff_tsc > drain_tsc)) {
2057                         /*
2058                          * Get mbuf from vpool.pool and detach mbuf and
2059                          * put back into vpool.ring.
2060                          */
2061                         dev_ll = lcore_ll->ll_root_used;
2062                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2063                                 /* Get virtio device ID */
2064                                 vdev = dev_ll->vdev;
2065                                 dev = vdev->dev;
2066
2067                                 if (likely(!vdev->remove)) {
2068                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2069                                         if (tx_q->len) {
2070                                                 LOG_DEBUG(VHOST_DATA,
2071                                                 "TX queue drained after timeout"
2072                                                 " with burst size %u\n",
2073                                                 tx_q->len);
2074
2075                                                 /*
2076                                                  * Tx any packets in the queue
2077                                                  */
2078                                                 ret = rte_eth_tx_burst(
2079                                                         ports[0],
2080                                                         (uint16_t)tx_q->txq_id,
2081                                                         (struct rte_mbuf **)
2082                                                         tx_q->m_table,
2083                                                         (uint16_t)tx_q->len);
2084                                                 if (unlikely(ret < tx_q->len)) {
2085                                                         do {
2086                                                                 rte_pktmbuf_free(
2087                                                                         tx_q->m_table[ret]);
2088                                                         } while (++ret < tx_q->len);
2089                                                 }
2090                                                 tx_q->len = 0;
2091
2092                                                 txmbuf_clean_zcp(dev,
2093                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2094                                         }
2095                                 }
2096                                 dev_ll = dev_ll->next;
2097                         }
2098                         prev_tsc = cur_tsc;
2099                 }
2100
2101                 rte_prefetch0(lcore_ll->ll_root_used);
2102
2103                 /*
2104                  * Inform the configuration core that we have exited the linked
2105                  * list and that no devices are in use if requested.
2106                  */
2107                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2108                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2109
2110                 /* Process devices */
2111                 dev_ll = lcore_ll->ll_root_used;
2112
2113                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2114                         vdev = dev_ll->vdev;
2115                         dev  = vdev->dev;
2116                         if (unlikely(vdev->remove)) {
2117                                 dev_ll = dev_ll->next;
2118                                 unlink_vmdq(vdev);
2119                                 vdev->ready = DEVICE_SAFE_REMOVE;
2120                                 continue;
2121                         }
2122
2123                         if (likely(vdev->ready == DEVICE_RX)) {
2124                                 uint32_t index = vdev->vmdq_rx_q;
2125                                 uint16_t i;
2126                                 count_in_ring =
2127                                         rte_ring_count(vpool_array[index].ring);
2128                                 uint16_t free_entries =
2129                                         (uint16_t)get_available_ring_num_zcp(dev);
2130
2131                                 /*
2132                                  * Attach all mbufs in vpool.ring and put back
2133                                  * into vpool.pool.
2134                                  */
2135                                 for (i = 0;
2136                                         i < RTE_MIN(free_entries,
2137                                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2138                                         i++)
2139                                         attach_rxmbuf_zcp(dev);
2140
2141                                 /* Handle guest RX */
2142                                 rx_count = rte_eth_rx_burst(ports[0],
2143                                         vdev->vmdq_rx_q, pkts_burst,
2144                                         MAX_PKT_BURST);
2145
2146                                 if (rx_count) {
2147                                         ret_count = virtio_dev_rx_zcp(dev,
2148                                                         pkts_burst, rx_count);
2149                                         if (enable_stats) {
2150                                                 dev_statistics[dev->device_fh].rx_total
2151                                                         += rx_count;
2152                                                 dev_statistics[dev->device_fh].rx
2153                                                         += ret_count;
2154                                         }
2155                                         while (likely(rx_count)) {
2156                                                 rx_count--;
2157                                                 pktmbuf_detach_zcp(
2158                                                         pkts_burst[rx_count]);
2159                                                 rte_ring_sp_enqueue(
2160                                                         vpool_array[index].ring,
2161                                                         (void *)pkts_burst[rx_count]);
2162                                         }
2163                                 }
2164                         }
2165
2166                         if (likely(!vdev->remove))
2167                                 /* Handle guest TX */
2168                                 virtio_dev_tx_zcp(dev);
2169
2170                         /* Move to the next device in the list */
2171                         dev_ll = dev_ll->next;
2172                 }
2173         }
2174
2175         return 0;
2176 }
2177
2178
2179 /*
2180  * Add an entry to a used linked list. A free entry must first be found
2181  * in the free linked list using get_data_ll_free_entry();
2182  * in the free linked list using get_data_ll_free_entry().
2183 static void
2184 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2185         struct virtio_net_data_ll *ll_dev)
2186 {
2187         struct virtio_net_data_ll *ll = *ll_root_addr;
2188
2189         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2190         ll_dev->next = NULL;
2191         rte_compiler_barrier();
2192
2193         /* If ll == NULL then this is the first device. */
2194         if (ll) {
2195                 /* Increment to the tail of the linked list. */
2196                 while (ll->next != NULL)
2197                         ll = ll->next;
2198
2199                 ll->next = ll_dev;
2200         } else {
2201                 *ll_root_addr = ll_dev;
2202         }
2203 }
2204
2205 /*
2206  * Remove an entry from a used linked list. The entry must then be added to
2207  * the free linked list using put_data_ll_free_entry().
2208  */
2209 static void
2210 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2211         struct virtio_net_data_ll *ll_dev,
2212         struct virtio_net_data_ll *ll_dev_last)
2213 {
2214         struct virtio_net_data_ll *ll = *ll_root_addr;
2215
2216         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2217                 return;
2218
2219         if (ll_dev == ll)
2220                 *ll_root_addr = ll_dev->next;
2221         else
2222                 if (likely(ll_dev_last != NULL))
2223                         ll_dev_last->next = ll_dev->next;
2224                 else
2225                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2226 }
2227
2228 /*
2229  * Find and return an entry from the free linked list.
2230  */
2231 static struct virtio_net_data_ll *
2232 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2233 {
2234         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2235         struct virtio_net_data_ll *ll_dev;
2236
2237         if (ll_free == NULL)
2238                 return NULL;
2239
2240         ll_dev = ll_free;
2241         *ll_root_addr = ll_free->next;
2242
2243         return ll_dev;
2244 }
2245
2246 /*
2247  * Place an entry back on to the free linked list.
2248  */
2249 static void
2250 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2251         struct virtio_net_data_ll *ll_dev)
2252 {
2253         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2254
2255         if (ll_dev == NULL)
2256                 return;
2257
2258         ll_dev->next = ll_free;
2259         *ll_root_addr = ll_dev;
2260 }
2261
2262 /*
2263  * Creates a linked list of a given size.
2264  */
2265 static struct virtio_net_data_ll *
2266 alloc_data_ll(uint32_t size)
2267 {
2268         struct virtio_net_data_ll *ll_new;
2269         uint32_t i;
2270
2271         /* Malloc and then chain the linked list. */
2272         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2273         if (ll_new == NULL) {
2274                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2275                 return NULL;
2276         }
2277
2278         for (i = 0; i < size; i++) {
2279                 ll_new[i].vdev = NULL;
2280                 ll_new[i].next = (i < size - 1) ? &ll_new[i + 1] : NULL;
2281         }
2283
2284         return ll_new;
2285 }
2286
2287 /*
2288  * Create the main linked list along with each individual core's linked list. A used and a free list
2289  * are created to manage entries.
2290  */
2291 static int
2292 init_data_ll(void)
2293 {
2294         int lcore;
2295
2296         RTE_LCORE_FOREACH_SLAVE(lcore) {
2297                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2298                 if (lcore_info[lcore].lcore_ll == NULL) {
2299                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2300                         return -1;
2301                 }
2302
2303                 lcore_info[lcore].lcore_ll->device_num = 0;
2304                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2305                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
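                /*
                 * Size each core's free list so that all devices can be spread
                 * across the switching cores, rounding up when the division is
                 * not exact.
                 */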
2306                 if (num_devices % num_switching_cores)
2307                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2308                 else
2309                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2310         }
2311
2312         /* Allocate devices up to a maximum of MAX_DEVICES. */
2313         ll_root_free = alloc_data_ll(MIN(num_devices, MAX_DEVICES));
2314
2315         return 0;
2316 }
2317
2318 /*
2319  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2320  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2321  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2322  */
2323 static void
2324 destroy_device(volatile struct virtio_net *dev)
2325 {
2326         struct virtio_net_data_ll *ll_lcore_dev_cur;
2327         struct virtio_net_data_ll *ll_main_dev_cur;
2328         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2329         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2330         struct vhost_dev *vdev;
2331         int lcore;
2332
2333         dev->flags &= ~VIRTIO_DEV_RUNNING;
2334
2335         vdev = (struct vhost_dev *)dev->priv;
2336         /* Set the remove flag. */
2337         vdev->remove = 1;
2338         while (vdev->ready != DEVICE_SAFE_REMOVE) {
2339                 rte_pause();
2340         }
2341
2342         /* Search for entry to be removed from lcore ll */
2343         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2344         while (ll_lcore_dev_cur != NULL) {
2345                 if (ll_lcore_dev_cur->vdev == vdev) {
2346                         break;
2347                 } else {
2348                         ll_lcore_dev_last = ll_lcore_dev_cur;
2349                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2350                 }
2351         }
2352
2353         if (ll_lcore_dev_cur == NULL) {
2354                 RTE_LOG(ERR, VHOST_CONFIG,
2355                         "(%"PRIu64") Failed to find the dev to be destroy.\n",
2356                         dev->device_fh);
2357                 return;
2358         }
2359
2360         /* Search for entry to be removed from main ll */
2361         ll_main_dev_cur = ll_root_used;
2362         ll_main_dev_last = NULL;
2363         while (ll_main_dev_cur != NULL) {
2364                 if (ll_main_dev_cur->vdev == vdev) {
2365                         break;
2366                 } else {
2367                         ll_main_dev_last = ll_main_dev_cur;
2368                         ll_main_dev_cur = ll_main_dev_cur->next;
2369                 }
2370         }
2371
2372         /* Remove entries from the lcore and main ll. */
2373         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2374         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2375
2376         /* Set the dev_removal_flag on each lcore. */
2377         RTE_LCORE_FOREACH_SLAVE(lcore) {
2378                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2379         }
2380
2381         /*
2382          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2383          * they can no longer access the device removed from the linked lists and that the devices
2384          * are no longer in use.
2385          */
2386         RTE_LCORE_FOREACH_SLAVE(lcore) {
2387                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2388                         rte_pause();
2389                 }
2390         }
2391
2392         /* Add the entries back to the lcore and main free ll.*/
2393         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2394         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2395
2396         /* Decrement number of device on the lcore. */
2397         lcore_info[vdev->coreid].lcore_ll->device_num--;
2398
2399         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2400
2401         if (zero_copy) {
2402                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2403
2404                 /* Stop the RX queue. */
2405                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2406                         LOG_DEBUG(VHOST_CONFIG,
2407                                 "(%"PRIu64") In destroy_device: Failed to stop "
2408                                 "rx queue:%d\n",
2409                                 dev->device_fh,
2410                                 vdev->vmdq_rx_q);
2411                 }
2412
2413                 LOG_DEBUG(VHOST_CONFIG,
2414                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2415                         "mempool back to ring for RX queue: %d\n",
2416                         dev->device_fh, vdev->vmdq_rx_q);
2417
2418                 mbuf_destroy_zcp(vpool);
2419
2420                 /* Stop the TX queue. */
2421                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2422                         LOG_DEBUG(VHOST_CONFIG,
2423                                 "(%"PRIu64") In destroy_device: Failed to "
2424                                 "stop tx queue:%d\n",
2425                                 dev->device_fh, vdev->vmdq_rx_q);
2426                 }
2427
2428                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2429
2430                 LOG_DEBUG(VHOST_CONFIG,
2431                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2432                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2433                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2434                         dev->device_fh);
2435
2436                 mbuf_destroy_zcp(vpool);
2437                 rte_free(vdev->regions_hpa);
2438         }
2439         rte_free(vdev);
2440
2441 }
2442
2443 /*
2444  * Calculate the number of physically contiguous regions within one particular
2445  * region whose vhost virtual address is contiguous. The particular region
2446  * starts from vva_start, with a size of 'size' bytes.
2447  */
2448 static uint32_t
2449 check_hpa_regions(uint64_t vva_start, uint64_t size)
2450 {
2451         uint32_t i, nregions = 0, page_size = getpagesize();
2452         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2453         if (vva_start % page_size) {
2454                 LOG_DEBUG(VHOST_CONFIG,
2455                         "in check_continuous: vva start(%p) mod page_size(%d) "
2456                         "has remainder\n",
2457                         (void *)(uintptr_t)vva_start, page_size);
2458                 return 0;
2459         }
2460         if (size % page_size) {
2461                 LOG_DEBUG(VHOST_CONFIG,
2462                         "in check_continuous: "
2463                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2464                         size, page_size);
2465                 return 0;
2466         }
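        /*
         * Walk the region page by page; every discontinuity between the
         * physical addresses of neighbouring pages starts a new sub-region.
         */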
2467         for (i = 0; i < size - page_size; i = i + page_size) {
2468                 cur_phys_addr
2469                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2470                 next_phys_addr = rte_mem_virt2phy(
2471                         (void *)(uintptr_t)(vva_start + i + page_size));
2472                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2473                         ++nregions;
2474                         LOG_DEBUG(VHOST_CONFIG,
2475                                 "in check_continuous: hva addr:(%p) is not "
2476                                 "continuous with hva addr:(%p), diff:%d\n",
2477                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2478                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2479                                 + page_size), page_size);
2480                         LOG_DEBUG(VHOST_CONFIG,
2481                                 "in check_continuous: hpa addr:(%p) is not "
2482                                 "continuous with hpa addr:(%p), "
2483                                 "diff:(%"PRIu64")\n",
2484                                 (void *)(uintptr_t)cur_phys_addr,
2485                                 (void *)(uintptr_t)next_phys_addr,
2486                                 (next_phys_addr-cur_phys_addr));
2487                 }
2488         }
2489         return nregions;
2490 }
2491
2492 /*
2493  * Divide each region whose vhost virtual address is contiguous into a few
2494  * sub-regions, making sure the physical addresses within each sub-region are
2495  * contiguous, and fill the offset (to GPA), size and other information of each
2496  * sub-region into regions_hpa.
2497  */
2498 static uint32_t
2499 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2500 {
2501         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2502         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2503
2504         if (mem_region_hpa == NULL)
2505                 return 0;
2506
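        /*
         * For each guest region, record where its host physical mapping starts
         * and split it wherever the host physical pages stop being contiguous.
         */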
2507         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2508                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2509                         virtio_memory->regions[regionidx].address_offset;
2510                 mem_region_hpa[regionidx_hpa].guest_phys_address
2511                         = virtio_memory->regions[regionidx].guest_phys_address;
2512                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2513                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2514                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2515                 LOG_DEBUG(VHOST_CONFIG,
2516                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2517                         regionidx_hpa,
2518                         (void *)(uintptr_t)
2519                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2520                 LOG_DEBUG(VHOST_CONFIG,
2521                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2522                         regionidx_hpa,
2523                         (void *)(uintptr_t)
2524                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
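		/*
		 * Walk this guest region page by page; at every physical-
		 * address break, close the current sub-region at the break
		 * and start a new one at the following page.
		 */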
2525                 for (i = 0, k = 0;
2526                         i < virtio_memory->regions[regionidx].memory_size -
2527                                 page_size;
2528                         i += page_size) {
2529                         cur_phys_addr = rte_mem_virt2phy(
2530                                         (void *)(uintptr_t)(vva_start + i));
2531                         next_phys_addr = rte_mem_virt2phy(
2532                                         (void *)(uintptr_t)(vva_start +
2533                                         i + page_size));
2534                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2535                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2536                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2537                                         k + page_size;
2538                                 mem_region_hpa[regionidx_hpa].memory_size
2539                                         = k + page_size;
2540                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2541                                         "phys addr end  [%d]:(%p)\n",
2542                                         regionidx_hpa,
2543                                         (void *)(uintptr_t)
2544                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2545                                 LOG_DEBUG(VHOST_CONFIG,
2546                                         "in fill_hpa_regions: guest phys addr "
2547                                         "size [%d]:(%p)\n",
2548                                         regionidx_hpa,
2549                                         (void *)(uintptr_t)
2550                                         (mem_region_hpa[regionidx_hpa].memory_size));
2551                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2552                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2553                                 ++regionidx_hpa;
2554                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2555                                         next_phys_addr -
2556                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2557                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2558                                         " phys addr start[%d]:(%p)\n",
2559                                         regionidx_hpa,
2560                                         (void *)(uintptr_t)
2561                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2562                                 LOG_DEBUG(VHOST_CONFIG,
2563                                         "in fill_hpa_regions: host  phys addr "
2564                                         "start[%d]:(%p)\n",
2565                                         regionidx_hpa,
2566                                         (void *)(uintptr_t)
2567                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2568                                 k = 0;
2569                         } else {
2570                                 k += page_size;
2571                         }
2572                 }
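		/* Close the last sub-region of this guest region. */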
2573                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2574                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2575                         + k + page_size;
2576                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2577                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2578                         "[%d]:(%p)\n", regionidx_hpa,
2579                         (void *)(uintptr_t)
2580                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2581                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2582                         "[%d]:(%p)\n", regionidx_hpa,
2583                         (void *)(uintptr_t)
2584                         (mem_region_hpa[regionidx_hpa].memory_size));
2585                 ++regionidx_hpa;
2586         }
2587         return regionidx_hpa;
2588 }
2589
2590 /*
2591  * A new device is added to a data core. First the device is added to the main
2592  * linked list and then allocated to a specific data core.
2593  */
2594 static int
2595 new_device(struct virtio_net *dev)
2596 {
2597         struct virtio_net_data_ll *ll_dev;
2598         int lcore, core_add = 0;
2599         uint32_t device_num_min = num_devices;
2600         struct vhost_dev *vdev;
2601         uint32_t regionidx;
2602
2603         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2604         if (vdev == NULL) {
2605                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2606                         dev->device_fh);
2607                 return -1;
2608         }
2609         vdev->dev = dev;
2610         dev->priv = vdev;
2611
2612         if (zero_copy) {
2613                 vdev->nregions_hpa = dev->mem->nregions;
2614                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2615                         vdev->nregions_hpa
2616                                 += check_hpa_regions(
2617                                         dev->mem->regions[regionidx].guest_phys_address
2618                                         + dev->mem->regions[regionidx].address_offset,
2619                                         dev->mem->regions[regionidx].memory_size);
2620
2621                 }
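		/*
		 * nregions_hpa now counts one entry per guest memory region
		 * plus one extra entry per physical-address break found above.
		 */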
2622
2623                 vdev->regions_hpa = rte_calloc("vhost hpa region",
2624                                                vdev->nregions_hpa,
2625                                                sizeof(struct virtio_memory_regions_hpa),
2626                                                RTE_CACHE_LINE_SIZE);
2627                 if (vdev->regions_hpa == NULL) {
2628                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2629                         rte_free(vdev);
2630                         return -1;
2631                 }
2632
2633
2634                 if (fill_hpa_memory_regions(
2635                         vdev->regions_hpa, dev->mem
2636                         ) != vdev->nregions_hpa) {
2637
2638                         RTE_LOG(ERR, VHOST_CONFIG,
2639                                 "hpa memory regions number mismatch: "
2640                                 "[%d]\n", vdev->nregions_hpa);
2641                         rte_free(vdev->regions_hpa);
2642                         rte_free(vdev);
2643                         return -1;
2644                 }
2645         }
2646
2647
2648         /* Add device to main ll */
2649         ll_dev = get_data_ll_free_entry(&ll_root_free);
2650         if (ll_dev == NULL) {
2651                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2652                         "of %d devices per core has been reached\n",
2653                         dev->device_fh, num_devices);
2654                 if (vdev->regions_hpa)
2655                         rte_free(vdev->regions_hpa);
2656                 rte_free(vdev);
2657                 return -1;
2658         }
2659         ll_dev->vdev = vdev;
2660         add_data_ll_entry(&ll_root_used, ll_dev);
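	/* Map the device to the first RX queue of its own VMDq pool. */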
2661         vdev->vmdq_rx_q
2662                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2663
2664         if (zero_copy) {
2665                 uint32_t index = vdev->vmdq_rx_q;
2666                 uint32_t count_in_ring, i;
2667                 struct mbuf_table *tx_q;
2668
2669                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2670
2671                 LOG_DEBUG(VHOST_CONFIG,
2672                         "(%"PRIu64") in new_device: mbuf count in mempool "
2673                         "before attach is: %d\n",
2674                         dev->device_fh,
2675                         rte_mempool_count(vpool_array[index].pool));
2676                 LOG_DEBUG(VHOST_CONFIG,
2677                         "(%"PRIu64") in new_device: mbuf count in  ring "
2678                         "before attach  is : %d\n",
2679                         dev->device_fh, count_in_ring);
2680
2681                 /*
2682                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2683                  */
2684                 for (i = 0; i < count_in_ring; i++)
2685                         attach_rxmbuf_zcp(dev);
2686
2687                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2688                         "mempool after attach is: %d\n",
2689                         dev->device_fh,
2690                         rte_mempool_count(vpool_array[index].pool));
2691                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2692                         "ring after attach  is : %d\n",
2693                         dev->device_fh,
2694                         rte_ring_count(vpool_array[index].ring));
2695
2696                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2697                 tx_q->txq_id = vdev->vmdq_rx_q;
2698
2699                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2700                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2701
2702                         LOG_DEBUG(VHOST_CONFIG,
2703                                 "(%"PRIu64") In new_device: Failed to start "
2704                                 "tx queue:%d\n",
2705                                 dev->device_fh, vdev->vmdq_rx_q);
2706
2707                         mbuf_destroy_zcp(vpool);
2708                         rte_free(vdev->regions_hpa);
2709                         rte_free(vdev);
2710                         return -1;
2711                 }
2712
2713                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2714                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2715
2716                         LOG_DEBUG(VHOST_CONFIG,
2717                                 "(%"PRIu64") In new_device: Failed to start "
2718                                 "rx queue:%d\n",
2719                                 dev->device_fh, vdev->vmdq_rx_q);
2720
2721                         /* Stop the TX queue. */
2722                         if (rte_eth_dev_tx_queue_stop(ports[0],
2723                                 vdev->vmdq_rx_q) != 0) {
2724                                 LOG_DEBUG(VHOST_CONFIG,
2725                                         "(%"PRIu64") In new_device: Failed to "
2726                                         "stop tx queue:%d\n",
2727                                         dev->device_fh, vdev->vmdq_rx_q);
2728                         }
2729
2730                         mbuf_destroy_zcp(vpool);
2731                         rte_free(vdev->regions_hpa);
2732                         rte_free(vdev);
2733                         return -1;
2734                 }
2735
2736         }
2737
2738         /*reset ready flag*/
2739         vdev->ready = DEVICE_MAC_LEARNING;
2740         vdev->remove = 0;
2741
2742         /* Find a suitable lcore to add the device. */
2743         RTE_LCORE_FOREACH_SLAVE(lcore) {
2744                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2745                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2746                         core_add = lcore;
2747                 }
2748         }
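	/* core_add now indexes the data core with the fewest devices. */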
2749         /* Add device to lcore ll */
2750         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2751         if (ll_dev == NULL) {
2752                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2753                 vdev->ready = DEVICE_SAFE_REMOVE;
2754                 destroy_device(dev);
2755                 rte_free(vdev->regions_hpa);
2756                 rte_free(vdev);
2757                 return -1;
2758         }
2759         ll_dev->vdev = vdev;
2760         vdev->coreid = core_add;
2761
2762         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2763
2764         /* Initialize device stats */
2765         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2766
2767         /* Disable notifications. */
2768         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2769         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2770         lcore_info[vdev->coreid].lcore_ll->device_num++;
2771         dev->flags |= VIRTIO_DEV_RUNNING;
2772
2773         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2774
2775         return 0;
2776 }
2777
2778 /*
2779  * These callbacks allow devices to be added to the data core when configuration
2780  * has been fully completed.
2781  */
2782 static const struct virtio_net_device_ops virtio_net_device_ops = {
2784         .new_device =  new_device,
2785         .destroy_device = destroy_device,
2786 };
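/*
 * The ops above are handed to the vhost library through
 * rte_vhost_driver_callback_register() in main().
 */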
2787
2788 /*
2789  * This is a thread that wakes up periodically to print statistics if the user
2790  * has enabled them.
2791  */
2792 static void
2793 print_stats(void)
2794 {
2795         struct virtio_net_data_ll *dev_ll;
2796         uint64_t tx_dropped, rx_dropped;
2797         uint64_t tx, tx_total, rx, rx_total;
2798         uint32_t device_fh;
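	/*
	 * ANSI escape sequences: ESC "[2J" clears the screen and
	 * ESC "[1;1H" moves the cursor to the top-left corner.
	 */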
2799         const char clr[] = { 27, '[', '2', 'J', '\0' };
2800         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2801
2802         while (1) {
2803                 sleep(enable_stats);
2804
2805                 /* Clear screen and move to top left */
2806                 printf("%s%s", clr, top_left);
2807
2808                 printf("\nDevice statistics ====================================");
2809
2810                 dev_ll = ll_root_used;
2811                 while (dev_ll != NULL) {
2812                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2813                         tx_total = dev_statistics[device_fh].tx_total;
2814                         tx = dev_statistics[device_fh].tx;
2815                         tx_dropped = tx_total - tx;
2816                         if (zero_copy == 0) {
2817                                 rx_total = rte_atomic64_read(
2818                                         &dev_statistics[device_fh].rx_total_atomic);
2819                                 rx = rte_atomic64_read(
2820                                         &dev_statistics[device_fh].rx_atomic);
2821                         } else {
2822                                 rx_total = dev_statistics[device_fh].rx_total;
2823                                 rx = dev_statistics[device_fh].rx;
2824                         }
2825                         rx_dropped = rx_total - rx;
2826
2827                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2828                                         "\nTX total:            %"PRIu64""
2829                                         "\nTX dropped:          %"PRIu64""
2830                                         "\nTX successful:               %"PRIu64""
2831                                         "\nRX total:            %"PRIu64""
2832                                         "\nRX dropped:          %"PRIu64""
2833                                         "\nRX successful:               %"PRIu64"",
2834                                         device_fh,
2835                                         tx_total,
2836                                         tx_dropped,
2837                                         tx,
2838                                         rx_total,
2839                                         rx_dropped,
2840                                         rx);
2841
2842                         dev_ll = dev_ll->next;
2843                 }
2844                 printf("\n======================================================\n");
2845         }
2846 }
2847
2848 static void
2849 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2850         char *ring_name, uint32_t nb_mbuf)
2851 {
2852         vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2853                 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2854         if (vpool_array[index].pool != NULL) {
2855                 vpool_array[index].ring
2856                         = rte_ring_create(ring_name,
2857                                 rte_align32pow2(nb_mbuf + 1),
2858                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2859                 if (likely(vpool_array[index].ring != NULL)) {
2860                         LOG_DEBUG(VHOST_CONFIG,
2861                                 "in setup_mempool_tbl: mbuf count in "
2862                                 "mempool is: %d\n",
2863                                 rte_mempool_count(vpool_array[index].pool));
2864                         LOG_DEBUG(VHOST_CONFIG,
2865                                 "in setup_mempool_tbl: mbuf count in "
2866                                 "ring   is: %d\n",
2867                                 rte_ring_count(vpool_array[index].ring));
2868                 } else {
2869                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2870                                 ring_name);
2871                 }
2872
2873                 /* Need to take the mbuf headroom into account. */
2874                 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2875         } else {
2876                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2877         }
2878 }
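/*
 * setup_mempool_tbl() is invoked from main() once per RX queue and once per
 * TX queue in zero-copy mode, so each queue gets a private mempool and an
 * accompanying ring large enough to hold the whole pool.
 */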
2879
2880 /* When we receive an INT signal, unregister the vhost driver. */
2881 static void
2882 sigint_handler(__rte_unused int signum)
2883 {
2884         /* Unregister vhost driver. */
2885         int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2886         if (ret != 0)
2887                 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2888         exit(0);
2889 }
2890
2891 /*
2892  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2893  * device is also registered here to handle the IOCTLs.
2894  */
2895 int
2896 main(int argc, char *argv[])
2897 {
2898         struct rte_mempool *mbuf_pool = NULL;
2899         unsigned lcore_id, core_id = 0;
2900         unsigned nb_ports, valid_num_ports;
2901         int ret;
2902         uint8_t portid;
2903         uint16_t queue_id;
2904         static pthread_t tid;
2905         char thread_name[RTE_MAX_THREAD_NAME_LEN];
2906
2907         signal(SIGINT, sigint_handler);
2908
2909         /* init EAL */
2910         ret = rte_eal_init(argc, argv);
2911         if (ret < 0)
2912                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2913         argc -= ret;
2914         argv += ret;
2915
2916         /* parse app arguments */
2917         ret = us_vhost_parse_args(argc, argv);
2918         if (ret < 0)
2919                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2920
2921         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2922                 if (rte_lcore_is_enabled(lcore_id))
2923                         lcore_ids[core_id++] = lcore_id;
2924
2925         if (rte_lcore_count() > RTE_MAX_LCORE)
2926                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2927
2928         /* Set the number of switching cores available. */
2929         num_switching_cores = rte_lcore_count() - 1;
2930
2931         /* Get the number of physical ports. */
2932         nb_ports = rte_eth_dev_count();
2933         if (nb_ports > RTE_MAX_ETHPORTS)
2934                 nb_ports = RTE_MAX_ETHPORTS;
2935
2936         /*
2937          * Update the global variable num_ports and the global array ports,
2938          * and derive valid_num_ports from the number of system ports.
2939          */
2940         valid_num_ports = check_ports_num(nb_ports);
2941
2942         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2943                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2944                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2945                 return -1;
2946         }
2947
2948         if (zero_copy == 0) {
2949                 /* Create the mbuf pool. */
2950                 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
2951                         NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
2952                         0, MBUF_DATA_SIZE, rte_socket_id());
2953                 if (mbuf_pool == NULL)
2954                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2955
2956                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2957                         vpool_array[queue_id].pool = mbuf_pool;
2958
2959                 if (vm2vm_mode == VM2VM_HARDWARE) {
2960                         /* Enable VT loop back to let L2 switch to do it. */
2961                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2962                         LOG_DEBUG(VHOST_CONFIG,
2963                                 "Enable loop back for L2 switch in vmdq.\n");
2964                 }
2965         } else {
2966                 uint32_t nb_mbuf;
2967                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2968                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2969
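		/*
		 * Size each zero-copy mempool (RX here, TX below) to cover
		 * the descriptor ring plus per-core caches and in-flight
		 * bursts.
		 */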
2970                 nb_mbuf = num_rx_descriptor
2971                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2972                         + num_switching_cores * MAX_PKT_BURST;
2973
2974                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2975                         snprintf(pool_name, sizeof(pool_name),
2976                                 "rxmbuf_pool_%u", queue_id);
2977                         snprintf(ring_name, sizeof(ring_name),
2978                                 "rxmbuf_ring_%u", queue_id);
2979                         setup_mempool_tbl(rte_socket_id(), queue_id,
2980                                 pool_name, ring_name, nb_mbuf);
2981                 }
2982
2983                 nb_mbuf = num_tx_descriptor
2984                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2985                                 + num_switching_cores * MAX_PKT_BURST;
2986
2987                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2988                         snprintf(pool_name, sizeof(pool_name),
2989                                 "txmbuf_pool_%u", queue_id);
2990                         snprintf(ring_name, sizeof(ring_name),
2991                                 "txmbuf_ring_%u", queue_id);
2992                         setup_mempool_tbl(rte_socket_id(),
2993                                 (queue_id + MAX_QUEUES),
2994                                 pool_name, ring_name, nb_mbuf);
2995                 }
2996
2997                 if (vm2vm_mode == VM2VM_HARDWARE) {
2998                         /* Enable VT loop back to let L2 switch to do it. */
2999                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3000                         LOG_DEBUG(VHOST_CONFIG,
3001                                 "Enable loop back for L2 switch in vmdq.\n");
3002                 }
3003         }
3004         /* Set log level. */
3005         rte_set_log_level(LOG_LEVEL);
3006
3007         /* initialize all ports */
3008         for (portid = 0; portid < nb_ports; portid++) {
3009                 /* skip ports that are not enabled */
3010                 if ((enabled_port_mask & (1 << portid)) == 0) {
3011                         RTE_LOG(INFO, VHOST_PORT,
3012                                 "Skipping disabled port %d\n", portid);
3013                         continue;
3014                 }
3015                 if (port_init(portid) != 0)
3016                         rte_exit(EXIT_FAILURE,
3017                                 "Cannot initialize network ports\n");
3018         }
3019
3020         /* Initialise all linked lists. */
3021         if (init_data_ll() == -1)
3022                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3023
3024         /* Initialize device stats */
3025         memset(&dev_statistics, 0, sizeof(dev_statistics));
3026
3027         /* Enable stats if the user option is set. */
3028         if (enable_stats) {
3029                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3030                 if (ret != 0)
3031                         rte_exit(EXIT_FAILURE,
3032                                 "Cannot create print-stats thread\n");
3033
3034                 /* Set thread_name for aid in debugging.  */
3035                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3036                 ret = rte_thread_setname(tid, thread_name);
3037                 if (ret != 0)
3038                         RTE_LOG(ERR, VHOST_CONFIG,
3039                                 "Cannot set print-stats name\n");
3040         }
3041
3042         /* Launch all data cores. */
3043         if (zero_copy == 0) {
3044                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3045                         rte_eal_remote_launch(switch_worker,
3046                                 mbuf_pool, lcore_id);
3047                 }
3048         } else {
3049                 uint32_t count_in_mempool, index, i;
3050                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3051                         /* For all RX and TX queues. */
3052                         count_in_mempool
3053                                 = rte_mempool_count(vpool_array[index].pool);
3054
3055                         /*
3056                          * Transfer all un-attached mbufs from vpool.pool
3057                          * to vpool.ring.
3058                          */
3059                         for (i = 0; i < count_in_mempool; i++) {
3060                                 struct rte_mbuf *mbuf
3061                                         = __rte_mbuf_raw_alloc(
3062                                                 vpool_array[index].pool);
3063                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3064                                                 (void *)mbuf);
3065                         }
3066
3067                         LOG_DEBUG(VHOST_CONFIG,
3068                                 "in main: mbuf count in mempool at initial "
3069                                 "is: %d\n", count_in_mempool);
3070                         LOG_DEBUG(VHOST_CONFIG,
3071                                 "in main: mbuf count in  ring at initial  is :"
3072                                 " %d\n",
3073                                 rte_ring_count(vpool_array[index].ring));
3074                 }
3075
3076                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3077                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3078                                 lcore_id);
3079         }
3080
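	/* Without the mergeable option, mask out mergeable RX buffer support. */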
3081         if (mergeable == 0)
3082                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3083
3084         /* Register vhost(cuse or user) driver to handle vhost messages. */
3085         ret = rte_vhost_driver_register((char *)&dev_basename);
3086         if (ret != 0)
3087                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3088
3089         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3090
3091         /* Start CUSE session. */
3092         rte_vhost_driver_session_start();
3093         return 0;
3094 }