examples/vhost: remove functions implemented in lib
dpdk.git: examples/vhost/main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52
53 #include "main.h"
54 #include "virtio-net.h"
55 #include "vhost-net-cdev.h"
56
57 #define MAX_QUEUES 128
58
59 /* the maximum number of external ports supported */
60 #define MAX_SUP_PORTS 1
61
62 /*
63  * Calculate the number of buffers needed per port
64  */
65 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +             \
66                                                         (num_switching_cores*MAX_PKT_BURST) +                   \
67                                                         (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
68                                                         (num_switching_cores*MBUF_CACHE_SIZE))
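/*
 * Illustrative sizing, assuming num_switching_cores = 2 and the default
 * values defined below (MAX_QUEUES = 128, RTE_TEST_RX_DESC_DEFAULT = 1024,
 * MAX_PKT_BURST = 32, RTE_TEST_TX_DESC_DEFAULT = 512, MBUF_CACHE_SIZE = 128):
 * 128*1024 + 2*32 + 2*512 + 2*128 = 132416 mbufs per port.
 */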
69
70 #define MBUF_CACHE_SIZE 128
71 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
72
73 /*
74  * The zero copy implementation requires no frame data buffers to be
75  * allocated by the host; the guest allocates the frame data buffers and
76  * vhost uses them directly.
77  */
78 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
79 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
80         + RTE_PKTMBUF_HEADROOM)
81 #define MBUF_CACHE_SIZE_ZCP 0
82
83 /*
84  * RX and TX Prefetch, Host, and Write-back threshold values should be
85  * carefully set for optimal performance. Consult the network
86  * controller's datasheet and supporting DPDK documentation for guidance
87  * on how these parameters should be set.
88  */
89 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
90 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
91 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
92
93 /*
94  * These default values are optimized for use with the Intel(R) 82599 10 GbE
95  * Controller and the DPDK ixgbe PMD. Consider using other values for other
96  * network controllers and/or network drivers.
97  */
98 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
99 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
100 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
101
102 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
103 #define MAX_MRG_PKT_BURST 16    /* Max burst for merge buffers. Set to 1 due to performance issue. */
104 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
105
106 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
107 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
108
109 #define JUMBO_FRAME_MAX_SIZE    0x2600
110
111 /* State of virtio device. */
112 #define DEVICE_MAC_LEARNING 0
113 #define DEVICE_RX                       1
114 #define DEVICE_SAFE_REMOVE      2
115
116 /* Config_core_flag status definitions. */
117 #define REQUEST_DEV_REMOVAL 1
118 #define ACK_DEV_REMOVAL 0
119
120 /* Configurable number of RX/TX ring descriptors */
121 #define RTE_TEST_RX_DESC_DEFAULT 1024
122 #define RTE_TEST_TX_DESC_DEFAULT 512
123
124 /*
125  * These two macros need refining for the legacy and DPDK based front ends:
126  * take the maximum number of available vring descriptors/entries from the
127  * guest, subtract MAX_PKT_BURST, then round to a power of 2.
128  */
129 /*
130  * For the legacy front end, 128 descriptors:
131  * half for virtio headers, the other half for mbufs.
132  */
133 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
134 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
135
136 /* Get first 4 bytes in mbuf headroom. */
137 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
138                 + sizeof(struct rte_mbuf)))
139
140 /* true if x is a power of 2 */
141 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
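/*
 * For example, POWEROF2(64) is true and POWEROF2(48) is false; note that
 * POWEROF2(0) also evaluates true with this formulation.
 */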
142
143 #define INVALID_PORT_ID 0xFF
144
145 /* Max number of devices. Limited by vmdq. */
146 #define MAX_DEVICES 64
147
148 /* Size of buffers used for snprintfs. */
149 #define MAX_PRINT_BUFF 6072
150
151 /* Maximum character device basename size. */
152 #define MAX_BASENAME_SZ 10
153
154 /* Maximum long option length for option parsing. */
155 #define MAX_LONG_OPT_SZ 64
156
157 /* Used to compare MAC addresses. */
158 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
159
160 /* Number of descriptors per cacheline. */
161 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
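/*
 * With a 64-byte cache line and the 16-byte struct vring_desc this works out
 * to 4 descriptors per cache line.
 */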
162
163 /* mask of enabled ports */
164 static uint32_t enabled_port_mask = 0;
165
166 /*Number of switching cores enabled*/
167 static uint32_t num_switching_cores = 0;
168
169 /* number of devices/queues to support*/
170 static uint32_t num_queues = 0;
171 uint32_t num_devices = 0;
172
173 /*
174  * Enable zero copy: packet buffers are DMA'd directly to/from the guest
175  * buffers referenced by the hardware descriptors. Disabled by default.
176  */
177 static uint32_t zero_copy;
178
179 /* Number of RX/TX ring descriptors to use; only applied when zero copy is enabled. */
180 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
181 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
182
183 /* Maximum number of ring descriptors; ixgbe, i40e and e1000 all support 4096. */
184 #define MAX_RING_DESC 4096
185
186 struct vpool {
187         struct rte_mempool *pool;
188         struct rte_ring *ring;
189         uint32_t buf_size;
190 } vpool_array[MAX_QUEUES+MAX_QUEUES];
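/*
 * The array is sized at 2 * MAX_QUEUES, which appears to provide one pool per
 * RX queue plus one per TX queue when zero copy is enabled; the exact layout
 * is established where the pools are created (not shown in this excerpt).
 */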
191
192 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
193 typedef enum {
194         VM2VM_DISABLED = 0,
195         VM2VM_SOFTWARE = 1,
196         VM2VM_HARDWARE = 2,
197         VM2VM_LAST
198 } vm2vm_type;
199 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
200
201 /* The type of host physical address translated from guest physical address. */
202 typedef enum {
203         PHYS_ADDR_CONTINUOUS = 0,
204         PHYS_ADDR_CROSS_SUBREG = 1,
205         PHYS_ADDR_INVALID = 2,
206         PHYS_ADDR_LAST
207 } hpa_type;
208
209 /* Enable stats. */
210 static uint32_t enable_stats = 0;
211 /* Enable retries on RX. */
212 static uint32_t enable_retry = 1;
213 /* Specify timeout (in useconds) between retries on RX. */
214 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
215 /* Specify the number of retries on RX. */
216 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
217
218 /* Character device basename. Can be set by user. */
219 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
220
221 /* Character device index. Can be set by user. */
222 static uint32_t dev_index = 0;
223
224 /* This can be set by the user so it is made available here. */
225 extern uint64_t VHOST_FEATURES;
226
227 /* Default configuration for rx and tx thresholds etc. */
228 static struct rte_eth_rxconf rx_conf_default = {
229         .rx_thresh = {
230                 .pthresh = RX_PTHRESH,
231                 .hthresh = RX_HTHRESH,
232                 .wthresh = RX_WTHRESH,
233         },
234         .rx_drop_en = 1,
235 };
236
237 /*
238  * These default values are optimized for use with the Intel(R) 82599 10 GbE
239  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
240  * network controllers and/or network drivers.
241  */
242 static struct rte_eth_txconf tx_conf_default = {
243         .tx_thresh = {
244                 .pthresh = TX_PTHRESH,
245                 .hthresh = TX_HTHRESH,
246                 .wthresh = TX_WTHRESH,
247         },
248         .tx_free_thresh = 0, /* Use PMD default values */
249         .tx_rs_thresh = 0, /* Use PMD default values */
250 };
251
252 /* Empty VMDQ configuration structure. Filled in programmatically. */
253 static struct rte_eth_conf vmdq_conf_default = {
254         .rxmode = {
255                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
256                 .split_hdr_size = 0,
257                 .header_split   = 0, /**< Header Split disabled */
258                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
259                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
260                 /*
261                  * This is necessary for 1G NICs such as the I350;
262                  * it fixes a bug where IPv4 forwarding in the guest cannot
263                  * forward packets from one virtio dev to another virtio dev.
264                  */
265                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
266                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
267                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
268         },
269
270         .txmode = {
271                 .mq_mode = ETH_MQ_TX_NONE,
272         },
273         .rx_adv_conf = {
274                 /*
275                  * should be overridden separately in code with
276                  * appropriate values
277                  */
278                 .vmdq_rx_conf = {
279                         .nb_queue_pools = ETH_8_POOLS,
280                         .enable_default_pool = 0,
281                         .default_pool = 0,
282                         .nb_pool_maps = 0,
283                         .pool_map = {{0, 0},},
284                 },
285         },
286 };
287
288 static unsigned lcore_ids[RTE_MAX_LCORE];
289 static uint8_t ports[RTE_MAX_ETHPORTS];
290 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
291
292 static const uint16_t external_pkt_default_vlan_tag = 2000;
293 const uint16_t vlan_tags[] = {
294         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
295         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
296         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
297         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
298         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
299         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
300         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
301         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
302 };
303
304 /* ethernet addresses of ports */
305 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
306
307 /* heads for the main used and free linked lists for the data path. */
308 static struct virtio_net_data_ll *ll_root_used = NULL;
309 static struct virtio_net_data_ll *ll_root_free = NULL;
310
311 /* Array of data core structures containing information on individual core linked lists. */
312 static struct lcore_info lcore_info[RTE_MAX_LCORE];
313
314 /* Used for queueing bursts of TX packets. */
315 struct mbuf_table {
316         unsigned len;
317         unsigned txq_id;
318         struct rte_mbuf *m_table[MAX_PKT_BURST];
319 };
320
321 /* TX queue for each data core. */
322 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
323
324 /* TX queue for each virtio device for zero copy. */
325 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
326
327 /* VLAN header struct used to insert VLAN tags on TX. */
328 struct vlan_ethhdr {
329         unsigned char   h_dest[ETH_ALEN];
330         unsigned char   h_source[ETH_ALEN];
331         __be16          h_vlan_proto;
332         __be16          h_vlan_TCI;
333         __be16          h_vlan_encapsulated_proto;
334 };
335
336 /* IPv4 Header */
337 struct ipv4_hdr {
338         uint8_t  version_ihl;           /**< version and header length */
339         uint8_t  type_of_service;       /**< type of service */
340         uint16_t total_length;          /**< length of packet */
341         uint16_t packet_id;             /**< packet ID */
342         uint16_t fragment_offset;       /**< fragmentation offset */
343         uint8_t  time_to_live;          /**< time to live */
344         uint8_t  next_proto_id;         /**< protocol ID */
345         uint16_t hdr_checksum;          /**< header checksum */
346         uint32_t src_addr;              /**< source address */
347         uint32_t dst_addr;              /**< destination address */
348 } __attribute__((__packed__));
349
350 /* Header lengths. */
351 #define VLAN_HLEN       4
352 #define VLAN_ETH_HLEN   18
353
354 /* Per-device statistics struct */
355 struct device_statistics {
356         uint64_t tx_total;
357         rte_atomic64_t rx_total_atomic;
358         uint64_t rx_total;
359         uint64_t tx;
360         rte_atomic64_t rx_atomic;
361         uint64_t rx;
362 } __rte_cache_aligned;
363 struct device_statistics dev_statistics[MAX_DEVICES];
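/*
 * The RX counters are atomic because a device's RX statistics may be updated
 * from another core during VM-to-VM delivery (see virtio_tx_local() below),
 * whereas the TX counters are only touched by the core that owns the
 * transmitting device.
 */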
364
365 /*
366  * Builds up the correct configuration for VMDQ VLAN pool map
367  * according to the pool & queue limits.
368  */
369 static inline int
370 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
371 {
372         struct rte_eth_vmdq_rx_conf conf;
373         unsigned i;
374
375         memset(&conf, 0, sizeof(conf));
376         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
377         conf.nb_pool_maps = num_devices;
378         conf.enable_loop_back =
379                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
380
381         for (i = 0; i < conf.nb_pool_maps; i++) {
382                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
383                 conf.pool_map[i].pools = (1UL << i);
384         }
385
386         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
387         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
388                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
389         return 0;
390 }
391
392 /*
393  * Validate the device number against the maximum pool number obtained from
394  * dev_info. If the device number is invalid, log an error message and
395  * return -1. Each device must have its own pool.
396  */
397 static inline int
398 validate_num_devices(uint32_t max_nb_devices)
399 {
400         if (num_devices > max_nb_devices) {
401                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
402                 return -1;
403         }
404         return 0;
405 }
406
407 /*
408  * Initialises a given port using global settings, with the RX buffers
409  * coming from the mempools in vpool_array.
410  */
411 static inline int
412 port_init(uint8_t port)
413 {
414         struct rte_eth_dev_info dev_info;
415         struct rte_eth_conf port_conf;
416         uint16_t rx_rings, tx_rings;
417         uint16_t rx_ring_size, tx_ring_size;
418         int retval;
419         uint16_t q;
420
421         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
422         rte_eth_dev_info_get (port, &dev_info);
423
424         /*configure the number of supported virtio devices based on VMDQ limits */
425         num_devices = dev_info.max_vmdq_pools;
426         num_queues = dev_info.max_rx_queues;
427
428         if (zero_copy) {
429                 rx_ring_size = num_rx_descriptor;
430                 tx_ring_size = num_tx_descriptor;
431                 tx_rings = dev_info.max_tx_queues;
432         } else {
433                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
434                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
435                 tx_rings = (uint16_t)rte_lcore_count();
436         }
437
438         retval = validate_num_devices(MAX_DEVICES);
439         if (retval < 0)
440                 return retval;
441
442         /* Get port configuration. */
443         retval = get_eth_conf(&port_conf, num_devices);
444         if (retval < 0)
445                 return retval;
446
447         if (port >= rte_eth_dev_count()) return -1;
448
449         rx_rings = (uint16_t)num_queues;
450         /* Configure ethernet device. */
451         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452         if (retval != 0)
453                 return retval;
454
455         /* Setup the queues. */
456         for (q = 0; q < rx_rings; q ++) {
457                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458                                                 rte_eth_dev_socket_id(port), &rx_conf_default,
459                                                 vpool_array[q].pool);
460                 if (retval < 0)
461                         return retval;
462         }
463         for (q = 0; q < tx_rings; q ++) {
464                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
465                                                 rte_eth_dev_socket_id(port), &tx_conf_default);
466                 if (retval < 0)
467                         return retval;
468         }
469
470         /* Start the device. */
471         retval  = rte_eth_dev_start(port);
472         if (retval < 0) {
473                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
474                 return retval;
475         }
476
477         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481                         (unsigned)port,
482                         vmdq_ports_eth_addr[port].addr_bytes[0],
483                         vmdq_ports_eth_addr[port].addr_bytes[1],
484                         vmdq_ports_eth_addr[port].addr_bytes[2],
485                         vmdq_ports_eth_addr[port].addr_bytes[3],
486                         vmdq_ports_eth_addr[port].addr_bytes[4],
487                         vmdq_ports_eth_addr[port].addr_bytes[5]);
488
489         return 0;
490 }
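/*
 * port_init() is expected to be called from main() (not shown in this
 * excerpt) once for each port enabled in the portmask, after the mempools in
 * vpool_array have been created.
 */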
491
492 /*
493  * Set character device basename.
494  */
495 static int
496 us_vhost_parse_basename(const char *q_arg)
497 {
498         /* parse basename string */
499
500         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
501                 return -1;
502         else
503                 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
504
505         return 0;
506 }
507
508 /*
509  * Parse the portmask provided at run time.
510  */
511 static int
512 parse_portmask(const char *portmask)
513 {
514         char *end = NULL;
515         unsigned long pm;
516
517         errno = 0;
518
519         /* parse hexadecimal string */
520         pm = strtoul(portmask, &end, 16);
521         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
522                 return -1;
523
524         if (pm == 0)
525                 return -1;
526
527         return pm;
528
529 }
530
531 /*
532  * Parse num options at run time.
533  */
534 static int
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536 {
537         char *end = NULL;
538         unsigned long num;
539
540         errno = 0;
541
542         /* parse unsigned int string */
543         num = strtoul(q_arg, &end, 10);
544         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545                 return -1;
546
547         if (num > max_valid_value)
548                 return -1;
549
550         return num;
551
552 }
553
554 /*
555  * Display usage
556  */
557 static void
558 us_vhost_usage(const char *prgname)
559 {
560         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
561         "               --vm2vm [0|1|2]\n"
562         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563         "               --dev-basename <name> --dev-index [0-N]\n"
564         "               --nb-devices ND\n"
565         "               -p PORTMASK: Set mask for ports to be used by application\n"
566         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567         "               --rx-retry [0|1]: disable/enable(default) retries on RX. Retries are attempted if the destination queue is full\n"
568         "               --rx-retry-delay [0-N]: timeout (in usec) between retries on RX. Only takes effect if RX retries are enabled\n"
569         "               --rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
570         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
572         "               --dev-basename: The basename to be used for the character device.\n"
573         "               --dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
574         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
575                         "zero copy\n"
576         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
577                         "used only when zero copy is enabled.\n"
578         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
579                         "used only when zero copy is enabled.\n",
580                prgname);
581 }
582
583 /*
584  * Parse the arguments given in the command line of the application.
585  */
586 static int
587 us_vhost_parse_args(int argc, char **argv)
588 {
589         int opt, ret;
590         int option_index;
591         unsigned i;
592         const char *prgname = argv[0];
593         static struct option long_option[] = {
594                 {"vm2vm", required_argument, NULL, 0},
595                 {"rx-retry", required_argument, NULL, 0},
596                 {"rx-retry-delay", required_argument, NULL, 0},
597                 {"rx-retry-num", required_argument, NULL, 0},
598                 {"mergeable", required_argument, NULL, 0},
599                 {"stats", required_argument, NULL, 0},
600                 {"dev-basename", required_argument, NULL, 0},
601                 {"dev-index", required_argument, NULL, 0},
602                 {"zero-copy", required_argument, NULL, 0},
603                 {"rx-desc-num", required_argument, NULL, 0},
604                 {"tx-desc-num", required_argument, NULL, 0},
605                 {NULL, 0, 0, 0},
606         };
607
608         /* Parse command line */
609         while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
610                 switch (opt) {
611                 /* Portmask */
612                 case 'p':
613                         enabled_port_mask = parse_portmask(optarg);
614                         if (enabled_port_mask == (uint32_t)-1) {
615                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
616                                 us_vhost_usage(prgname);
617                                 return -1;
618                         }
619                         break;
620
621                 case 0:
622                         /* Enable/disable vm2vm comms. */
623                         if (!strncmp(long_option[option_index].name, "vm2vm",
624                                 MAX_LONG_OPT_SZ)) {
625                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
626                                 if (ret == -1) {
627                                         RTE_LOG(INFO, VHOST_CONFIG,
628                                                 "Invalid argument for "
629                                                 "vm2vm [0|1|2]\n");
630                                         us_vhost_usage(prgname);
631                                         return -1;
632                                 } else {
633                                         vm2vm_mode = (vm2vm_type)ret;
634                                 }
635                         }
636
637                         /* Enable/disable retries on RX. */
638                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
639                                 ret = parse_num_opt(optarg, 1);
640                                 if (ret == -1) {
641                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
642                                         us_vhost_usage(prgname);
643                                         return -1;
644                                 } else {
645                                         enable_retry = ret;
646                                 }
647                         }
648
649                         /* Specify the retries delay time (in useconds) on RX. */
650                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
651                                 ret = parse_num_opt(optarg, INT32_MAX);
652                                 if (ret == -1) {
653                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
654                                         us_vhost_usage(prgname);
655                                         return -1;
656                                 } else {
657                                         burst_rx_delay_time = ret;
658                                 }
659                         }
660
661                         /* Specify the retries number on RX. */
662                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
663                                 ret = parse_num_opt(optarg, INT32_MAX);
664                                 if (ret == -1) {
665                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
666                                         us_vhost_usage(prgname);
667                                         return -1;
668                                 } else {
669                                         burst_rx_retry_num = ret;
670                                 }
671                         }
672
673                         /* Enable/disable RX mergeable buffers. */
674                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
675                                 ret = parse_num_opt(optarg, 1);
676                                 if (ret == -1) {
677                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
678                                         us_vhost_usage(prgname);
679                                         return -1;
680                                 } else {
681                                         if (ret) {
682                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
683                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
684                                                         = JUMBO_FRAME_MAX_SIZE;
685                                                 VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
686                                         }
687                                 }
688                         }
689
690                         /* Enable/disable stats. */
691                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
692                                 ret = parse_num_opt(optarg, INT32_MAX);
693                                 if (ret == -1) {
694                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
695                                         us_vhost_usage(prgname);
696                                         return -1;
697                                 } else {
698                                         enable_stats = ret;
699                                 }
700                         }
701
702                         /* Set character device basename. */
703                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
704                                 if (us_vhost_parse_basename(optarg) == -1) {
705                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
706                                         us_vhost_usage(prgname);
707                                         return -1;
708                                 }
709                         }
710
711                         /* Set character device index. */
712                         if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
713                                 ret = parse_num_opt(optarg, INT32_MAX);
714                                 if (ret == -1) {
715                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
716                                         us_vhost_usage(prgname);
717                                         return -1;
718                                 } else
719                                         dev_index = ret;
720                         }
721
722                         /* Enable/disable rx/tx zero copy. */
723                         if (!strncmp(long_option[option_index].name,
724                                 "zero-copy", MAX_LONG_OPT_SZ)) {
725                                 ret = parse_num_opt(optarg, 1);
726                                 if (ret == -1) {
727                                         RTE_LOG(INFO, VHOST_CONFIG,
728                                                 "Invalid argument"
729                                                 " for zero-copy [0|1]\n");
730                                         us_vhost_usage(prgname);
731                                         return -1;
732                                 } else
733                                         zero_copy = ret;
734
735                                 if (zero_copy) {
736 #ifdef RTE_MBUF_REFCNT
737                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
738                                         "zero copy vhost APP, please "
739                                         "disable RTE_MBUF_REFCNT\n"
740                                         "in config file and then rebuild DPDK "
741                                         "core lib!\n"
742                                         "Otherwise please disable zero copy "
743                                         "flag in command line!\n");
744                                         return -1;
745 #endif
746                                 }
747                         }
748
749                         /* Specify the descriptor number on RX. */
750                         if (!strncmp(long_option[option_index].name,
751                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
752                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
753                                 if ((ret == -1) || (!POWEROF2(ret))) {
754                                         RTE_LOG(INFO, VHOST_CONFIG,
755                                         "Invalid argument for rx-desc-num [0-N], "
756                                         "power of 2 required.\n");
757                                         us_vhost_usage(prgname);
758                                         return -1;
759                                 } else {
760                                         num_rx_descriptor = ret;
761                                 }
762                         }
763
764                         /* Specify the descriptor number on TX. */
765                         if (!strncmp(long_option[option_index].name,
766                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
767                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
768                                 if ((ret == -1) || (!POWEROF2(ret))) {
769                                         RTE_LOG(INFO, VHOST_CONFIG,
770                                         "Invalid argument for tx-desc-num [0-N], "
771                                         "power of 2 required.\n");
772                                         us_vhost_usage(prgname);
773                                         return -1;
774                                 } else {
775                                         num_tx_descriptor = ret;
776                                 }
777                         }
778
779                         break;
780
781                         /* Invalid option - print options. */
782                 default:
783                         us_vhost_usage(prgname);
784                         return -1;
785                 }
786         }
787
788         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
789                 if (enabled_port_mask & (1 << i))
790                         ports[num_ports++] = (uint8_t)i;
791         }
792
793         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
794                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
795                         "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
796                 return -1;
797         }
798
799         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
800                 RTE_LOG(INFO, VHOST_PORT,
801                         "Vhost zero copy doesn't support software vm2vm, "
802                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
803                 return -1;
804         }
805
806         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
807                 RTE_LOG(INFO, VHOST_PORT,
808                         "Vhost zero copy doesn't support jumbo frames, "
809                         "please specify '--mergeable 0' to disable the "
810                         "mergeable feature.\n");
811                 return -1;
812         }
813
814         return 0;
815 }
816
817 /*
818  * Update the global variable num_ports and the ports array according to the
819  * number of ports in the system, and return the number of valid ports.
820  */
821 static unsigned check_ports_num(unsigned nb_ports)
822 {
823         unsigned valid_num_ports = num_ports;
824         unsigned portid;
825
826         if (num_ports > nb_ports) {
827                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
828                         num_ports, nb_ports);
829                 num_ports = nb_ports;
830         }
831
832         for (portid = 0; portid < num_ports; portid ++) {
833                 if (ports[portid] >= nb_ports) {
834                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
835                                 ports[portid], (nb_ports - 1));
836                         ports[portid] = INVALID_PORT_ID;
837                         valid_num_ports--;
838                 }
839         }
840         return valid_num_ports;
841 }
842
843 /*
844  * Macro to print out packet contents. Wrapped in a debug define so that the
845  * data path is not affected when debug is disabled.
846  */
847 #ifdef DEBUG
848 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
849         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
850         unsigned int index;                                                                                                                                                                                             \
851         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
852                                                                                                                                                                                                                                         \
853         if ((header))                                                                                                                                                                                                   \
854                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
855         else                                                                                                                                                                                                                    \
856                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
857         for (index = 0; index < (size); index++) {                                                                                                                                              \
858                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
859                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
860         }                                                                                                                                                                                                                               \
861         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
862                                                                                                                                                                                                                                         \
863         LOG_DEBUG(VHOST_DATA, "%s", packet);                                                                                                                                                                    \
864 } while(0)
865 #else
866 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
867 #endif
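/*
 * A hypothetical use of the macro, for illustration only (actual callers live
 * in the data-path code further below and in the vhost library):
 *
 *     PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
 */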
868
869 /*
870  * Function to convert guest physical addresses to host physical addresses.
871  * This is used to convert virtio buffer addresses.
872  */
873 static inline uint64_t __attribute__((always_inline))
874 gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
875         uint32_t buf_len, hpa_type *addr_type)
876 {
877         struct virtio_memory_regions_hpa *region;
878         uint32_t regionidx;
879         uint64_t vhost_pa = 0;
880
881         *addr_type = PHYS_ADDR_INVALID;
882
883         for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) {
884                 region = &dev->mem->regions_hpa[regionidx];
885                 if ((guest_pa >= region->guest_phys_address) &&
886                         (guest_pa <= region->guest_phys_address_end)) {
887                         vhost_pa = region->host_phys_addr_offset + guest_pa;
888                         if (likely((guest_pa + buf_len - 1)
889                                 <= region->guest_phys_address_end))
890                                 *addr_type = PHYS_ADDR_CONTINUOUS;
891                         else
892                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
893                         break;
894                 }
895         }
896
897         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
898                 dev->device_fh, (void *)(uintptr_t)guest_pa,
899                 (void *)(uintptr_t)vhost_pa);
900
901         return vhost_pa;
902 }
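/*
 * Sketch of a typical call, for illustration only (desc here stands for a
 * vring descriptor being processed by the zero copy data path):
 *
 *     hpa_type addr_type;
 *     uint64_t buff_hpa = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);
 *     if (unlikely(addr_type == PHYS_ADDR_INVALID))
 *             ... drop the descriptor: it is not backed by a known region ...
 */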
903
904 /*
905  * Compares a packet destination MAC address to a device MAC address.
906  */
907 static inline int __attribute__((always_inline))
908 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
909 {
910         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
911 }
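/*
 * Both addresses are loaded as 64-bit words and MAC_ADDR_CMP masks the XOR
 * result down to the low 48 bits, so only the six MAC bytes are compared on
 * the little-endian targets this example runs on. The two bytes read past
 * each ether_addr are harmless here because the addresses are embedded in
 * larger structures (packet headers and device state).
 */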
912
913 /*
914  * This function learns the MAC address of the device and registers it, along with a
915  * VLAN tag, in a VMDQ pool.
916  */
917 static int
918 link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
919 {
920         struct ether_hdr *pkt_hdr;
921         struct virtio_net_data_ll *dev_ll;
922         int i, ret;
923
924         /* Learn MAC address of guest device from packet */
925         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
926
927         dev_ll = ll_root_used;
928
929         while (dev_ll != NULL) {
930                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) {
931                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
932                         return -1;
933                 }
934                 dev_ll = dev_ll->next;
935         }
936
937         for (i = 0; i < ETHER_ADDR_LEN; i++)
938                 dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
939
940         /* vlan_tag currently uses the device_id. */
941         dev->vlan_tag = vlan_tags[dev->device_fh];
942
943         /* Print out VMDQ registration info. */
944         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
945                 dev->device_fh,
946                 dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
947                 dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
948                 dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
949                 dev->vlan_tag);
950
951         /* Register the MAC address. */
952         ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
953         if (ret)
954                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
955                                         dev->device_fh);
956
957         /* Enable stripping of the vlan tag as we handle routing. */
958         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1);
959
960         /* Set device as ready for RX. */
961         dev->ready = DEVICE_RX;
962
963         return 0;
964 }
965
966 /*
967  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
968  * queue before disabling RX on the device.
969  */
970 static inline void
971 unlink_vmdq(struct virtio_net *dev)
972 {
973         unsigned i = 0;
974         unsigned rx_count;
975         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
976
977         if (dev->ready == DEVICE_RX) {
978                 /*clear MAC and VLAN settings*/
979                 rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
980                 for (i = 0; i < 6; i++)
981                         dev->mac_address.addr_bytes[i] = 0;
982
983                 dev->vlan_tag = 0;
984
985                 /*Clear out the receive buffers*/
986                 rx_count = rte_eth_rx_burst(ports[0],
987                                         (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
988
989                 while (rx_count) {
990                         for (i = 0; i < rx_count; i++)
991                                 rte_pktmbuf_free(pkts_burst[i]);
992
993                         rx_count = rte_eth_rx_burst(ports[0],
994                                         (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
995                 }
996
997                 dev->ready = DEVICE_MAC_LEARNING;
998         }
999 }
1000
1001 /*
1002  * Check if the packet destination MAC address is for a local device. If so, put
1003  * the packet on that device's RX queue and return 0; otherwise return -1.
1004  */
1005 static inline unsigned __attribute__((always_inline))
1006 virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
1007 {
1008         struct virtio_net_data_ll *dev_ll;
1009         struct ether_hdr *pkt_hdr;
1010         uint64_t ret = 0;
1011
1012         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1013
1014         /*get the used devices list*/
1015         dev_ll = ll_root_used;
1016
1017         while (dev_ll != NULL) {
1018                 if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1019                                           &dev_ll->dev->mac_address)) {
1020
1021                         /* Drop the packet if the TX packet is destined for the TX device. */
1022                         if (dev_ll->dev->device_fh == dev->device_fh) {
1023                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1024                                                         dev_ll->dev->device_fh);
1025                                 return 0;
1026                         }
1027
1028
1029                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);
1030
1031                         if (dev_ll->dev->remove) {
1032                                 /*drop the packet if the device is marked for removal*/
1033                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
1034                         } else {
1035                                 uint32_t mergeable =
1036                                         dev_ll->dev->features &
1037                                         (1 << VIRTIO_NET_F_MRG_RXBUF);
1038
1039                                 /*send the packet to the local virtio device*/
1040                                 if (likely(mergeable == 0))
1041                                         ret = virtio_dev_rx(dev_ll->dev, &m, 1);
1042                                 else
1043                                         ret = virtio_dev_merge_rx(dev_ll->dev,
1044                                                 &m, 1);
1045
1046                                 if (enable_stats) {
1047                                         rte_atomic64_add(
1048                                         &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
1049                                         1);
1050                                         rte_atomic64_add(
1051                                         &dev_statistics[dev_ll->dev->device_fh].rx_atomic,
1052                                         ret);
1053                                         dev_statistics[dev->device_fh].tx_total++;
1054                                         dev_statistics[dev->device_fh].tx += ret;
1055                                 }
1056                         }
1057
1058                         return 0;
1059                 }
1060                 dev_ll = dev_ll->next;
1061         }
1062
1063         return -1;
1064 }
1065
1066 /*
1067  * This function routes the TX packet to the correct interface. This may be a local device
1068  * or the physical port.
1069  */
1070 static inline void __attribute__((always_inline))
1071 virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1072 {
1073         struct mbuf_table *tx_q;
1074         struct vlan_ethhdr *vlan_hdr;
1075         struct rte_mbuf **m_table;
1076         struct rte_mbuf *mbuf, *prev;
1077         unsigned len, ret, offset = 0;
1078         const uint16_t lcore_id = rte_lcore_id();
1079         struct virtio_net_data_ll *dev_ll = ll_root_used;
1080         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1081
1082         /*check if destination is local VM*/
1083         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
1084                 return;
1085
1086         if (vm2vm_mode == VM2VM_HARDWARE) {
1087                 while (dev_ll != NULL) {
1088                         if ((dev_ll->dev->ready == DEVICE_RX)
1089                                 && ether_addr_cmp(&(pkt_hdr->d_addr),
1090                                 &dev_ll->dev->mac_address)) {
1091                                 /*
1092                                  * Drop the packet if the TX packet is
1093                                  * destined for the TX device.
1094                                  */
1095                                 if (dev_ll->dev->device_fh == dev->device_fh) {
1096                                         LOG_DEBUG(VHOST_DATA,
1097                                         "(%"PRIu64") TX: Source and destination"
1098                                         " MAC addresses are the same. Dropping "
1099                                         "packet.\n",
1100                                         dev_ll->dev->device_fh);
1101                                         return;
1102                                 }
1103                                 offset = 4;
1104                                 vlan_tag =
1105                                 (uint16_t)
1106                                 vlan_tags[(uint16_t)dev_ll->dev->device_fh];
1107
1108                                 LOG_DEBUG(VHOST_DATA,
1109                                 "(%"PRIu64") TX: pkt to local VM device id:"
1110                                 "(%"PRIu64") vlan tag: %d.\n",
1111                                 dev->device_fh, dev_ll->dev->device_fh,
1112                                 vlan_tag);
1113
1114                                 break;
1115                         }
1116                         dev_ll = dev_ll->next;
1117                 }
1118         }
1119
1120         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1121
1122         /*Add packet to the port tx queue*/
1123         tx_q = &lcore_tx_queue[lcore_id];
1124         len = tx_q->len;
1125
1126         /* Allocate an mbuf and populate the structure. */
1127         mbuf = rte_pktmbuf_alloc(mbuf_pool);
1128         if (unlikely(mbuf == NULL)) {
1129                 RTE_LOG(ERR, VHOST_DATA,
1130                         "Failed to allocate memory for mbuf.\n");
1131                 return;
1132         }
1133
1134         mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1135         mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1136         mbuf->nb_segs = m->nb_segs;
1137
1138         /* Copy ethernet header to mbuf. */
1139         rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1140                 rte_pktmbuf_mtod(m, const void *),
1141                 ETH_HLEN);
1142
1143
1144         /* Set up the VLAN header. Fields are converted to network byte order with htons(). */
1145         vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1146         vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1147         vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1148         vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1149
1150         /* Copy the remaining packet contents to the mbuf. */
1151         rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1152                 (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1153                 (m->data_len - ETH_HLEN));
1154
1155         /* Copy the remaining segments for the whole packet. */
1156         prev = mbuf;
1157         while (m->next) {
1158                 /* Allocate an mbuf and populate the structure. */
1159                 struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1160                 if (unlikely(next_mbuf == NULL)) {
1161                         rte_pktmbuf_free(mbuf);
1162                         RTE_LOG(ERR, VHOST_DATA,
1163                                 "Failed to allocate memory for mbuf.\n");
1164                         return;
1165                 }
1166
1167                 m = m->next;
1168                 prev->next = next_mbuf;
1169                 prev = next_mbuf;
1170                 next_mbuf->data_len = m->data_len;
1171
1172                 /* Copy data to next mbuf. */
1173                 rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1174                         rte_pktmbuf_mtod(m, const void *), m->data_len);
1175         }
1176
1177         tx_q->m_table[len] = mbuf;
1178         len++;
1179         if (enable_stats) {
1180                 dev_statistics[dev->device_fh].tx_total++;
1181                 dev_statistics[dev->device_fh].tx++;
1182         }
1183
1184         if (unlikely(len == MAX_PKT_BURST)) {
1185                 m_table = (struct rte_mbuf **)tx_q->m_table;
1186                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1187                 /* Free any buffers not handled by TX and update the port stats. */
1188                 if (unlikely(ret < len)) {
1189                         do {
1190                                 rte_pktmbuf_free(m_table[ret]);
1191                         } while (++ret < len);
1192                 }
1193
1194                 len = 0;
1195         }
1196
1197         tx_q->len = len;
1198         return;
1199 }
1200 /*
1201  * This function is called by each data core. It handles all RX/TX registered with the
1202  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1203  * with all devices in the main linked list.
1204  */
1205 static int
1206 switch_worker(void *arg)
1207 {
1208         struct rte_mempool *mbuf_pool = arg;
1209         struct virtio_net *dev = NULL;
1210         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1211         struct virtio_net_data_ll *dev_ll;
1212         struct mbuf_table *tx_q;
1213         volatile struct lcore_ll_info *lcore_ll;
1214         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
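        /*
         * For example, with a 2 GHz TSC this evaluates to roughly 200000
         * cycles, i.e. the TX queue is drained roughly every 100 us.
         */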
1215         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1216         unsigned ret, i;
1217         const uint16_t lcore_id = rte_lcore_id();
1218         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1219         uint16_t rx_count = 0;
1220         uint32_t mergeable = 0;
1221
1222         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1223         lcore_ll = lcore_info[lcore_id].lcore_ll;
1224         prev_tsc = 0;
1225
1226         tx_q = &lcore_tx_queue[lcore_id];
1227         for (i = 0; i < num_cores; i ++) {
1228                 if (lcore_ids[i] == lcore_id) {
1229                         tx_q->txq_id = i;
1230                         break;
1231                 }
1232         }
1233
1234         while(1) {
1235                 cur_tsc = rte_rdtsc();
1236                 /*
1237                  * TX burst queue drain
1238                  */
1239                 diff_tsc = cur_tsc - prev_tsc;
1240                 if (unlikely(diff_tsc > drain_tsc)) {
1241
1242                         if (tx_q->len) {
1243                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1244
1245                                 /*Tx any packets in the queue*/
1246                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1247                                                                            (struct rte_mbuf **)tx_q->m_table,
1248                                                                            (uint16_t)tx_q->len);
1249                                 if (unlikely(ret < tx_q->len)) {
1250                                         do {
1251                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1252                                         } while (++ret < tx_q->len);
1253                                 }
1254
1255                                 tx_q->len = 0;
1256                         }
1257
1258                         prev_tsc = cur_tsc;
1259
1260                 }
1261
1262                 rte_prefetch0(lcore_ll->ll_root_used);
1263                 /*
1264                  * Inform the configuration core that we have exited the linked list and that no devices are
1265                  * in use if requested.
1266                  */
1267                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1268                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1269
1270                 /*
1271                  * Process devices
1272                  */
1273                 dev_ll = lcore_ll->ll_root_used;
1274
1275                 while (dev_ll != NULL) {
1276                         /*get virtio device ID*/
1277                         dev = dev_ll->dev;
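                             /* Check the negotiated features to choose between
                              * the mergeable and non-mergeable RX/TX paths. */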
1278                         mergeable =
1279                                 dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
1280
1281                         if (dev->remove) {
1282                                 dev_ll = dev_ll->next;
1283                                 unlink_vmdq(dev);
1284                                 dev->ready = DEVICE_SAFE_REMOVE;
1285                                 continue;
1286                         }
1287                         if (likely(dev->ready == DEVICE_RX)) {
1288                                 /*Handle guest RX*/
1289                                 rx_count = rte_eth_rx_burst(ports[0],
1290                                         (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1291
1292                                 if (rx_count) {
1293                                         if (likely(mergeable == 0))
1294                                                 ret_count =
1295                                                         virtio_dev_rx(dev,
1296                                                         pkts_burst, rx_count);
1297                                         else
1298                                                 ret_count =
1299                                                         virtio_dev_merge_rx(dev,
1300                                                         pkts_burst, rx_count);
1301
1302                                         if (enable_stats) {
1303                                                 rte_atomic64_add(
1304                                                 &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
1305                                                 rx_count);
1306                                                 rte_atomic64_add(
1307                                                 &dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count);
1308                                         }
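                                             /* virtio_dev_rx() copied the packets
                                              * into guest buffers, so the host
                                              * mbufs can be freed immediately. */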
1309                                         while (likely(rx_count)) {
1310                                                 rx_count--;
1311                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1312                                         }
1313
1314                                 }
1315                         }
1316
1317                         if (!dev->remove) {
1318                                 /*Handle guest TX*/
1319                                 if (likely(mergeable == 0))
1320                                         virtio_dev_tx(dev, mbuf_pool);
1321                                 else
1322                                         virtio_dev_merge_tx(dev, mbuf_pool);
1323                         }
1324
1325                         /*move to the next device in the list*/
1326                         dev_ll = dev_ll->next;
1327                 }
1328         }
1329
1330         return 0;
1331 }
1332
1333 /*
1334  * This function gets the number of available ring entries for zero copy rx.
1335  * Only one thread will call this function for a particular virtio device,
1336  * so it is designed as a non-thread-safe function.
1337  */
1338 static inline uint32_t __attribute__((always_inline))
1339 get_available_ring_num_zcp(struct virtio_net *dev)
1340 {
1341         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1342         uint16_t avail_idx;
1343
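             /* Unsigned 16-bit subtraction handles ring index wrap-around; the
              * result is the number of entries the guest has made available
              * but that have not yet been reserved. */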
1344         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1345         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1346 }
1347
1348 /*
1349  * This function gets the available ring index for zero copy rx;
1350  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1351  * Only one thread will call this function for a particular virtio device,
1352  * so it is designed as a non-thread-safe function.
1353  */
1354 static inline uint32_t __attribute__((always_inline))
1355 get_available_ring_index_zcp(struct virtio_net *dev,
1356         uint16_t *res_base_idx, uint32_t count)
1357 {
1358         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1359         uint16_t avail_idx;
1360         uint32_t retry = 0;
1361         uint16_t free_entries;
1362
1363         *res_base_idx = vq->last_used_idx_res;
1364         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1365         free_entries = (avail_idx - *res_base_idx);
1366
1367         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1368                         "avail idx: %d, "
1369                         "res base idx:%d, free entries:%d\n",
1370                         dev->device_fh, avail_idx, *res_base_idx,
1371                         free_entries);
1372
1373         /*
1374          * If retry is enabled and the queue is full then we wait
1375          * and retry to avoid packet loss.
1376          */
1377         if (enable_retry && unlikely(count > free_entries)) {
1378                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1379                         rte_delay_us(burst_rx_delay_time);
1380                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1381                         free_entries = (avail_idx - *res_base_idx);
1382                         if (count <= free_entries)
1383                                 break;
1384                 }
1385         }
1386
1387         /*check that we have enough buffers*/
1388         if (unlikely(count > free_entries))
1389                 count = free_entries;
1390
1391         if (unlikely(count == 0)) {
1392                 LOG_DEBUG(VHOST_DATA,
1393                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1394                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1395                         dev->device_fh, avail_idx,
1396                         *res_base_idx, free_entries);
1397                 return 0;
1398         }
1399
1400         vq->last_used_idx_res = *res_base_idx + count;
1401
1402         return count;
1403 }
1404
1405 /*
1406  * This function puts a descriptor back on the used list.
1407  */
1408 static inline void __attribute__((always_inline))
1409 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1410 {
1411         uint16_t res_cur_idx = vq->last_used_idx;
1412         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1413         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
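             /* Ensure the used ring entry is written before the used index is
              * updated, so the guest never observes a stale entry. */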
1414         rte_compiler_barrier();
1415         *(volatile uint16_t *)&vq->used->idx += 1;
1416         vq->last_used_idx += 1;
1417
1418         /* Kick the guest if necessary. */
1419         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1420                 eventfd_write((int)vq->kickfd, 1);
1421 }
1422
1423 /*
1424  * This function gets an available descriptor from the virtio vring and an
1425  * unattached mbuf from vpool->ring, then attaches them together. It needs to
1426  * adjust the offsets for buff_addr and phys_addr according to the PMD
1427  * implementation, otherwise the frame data may land at the wrong location in the mbuf.
1428  */
1429 static inline void __attribute__((always_inline))
1430 attach_rxmbuf_zcp(struct virtio_net *dev)
1431 {
1432         uint16_t res_base_idx, desc_idx;
1433         uint64_t buff_addr, phys_addr;
1434         struct vhost_virtqueue *vq;
1435         struct vring_desc *desc;
1436         struct rte_mbuf *mbuf = NULL;
1437         struct vpool *vpool;
1438         hpa_type addr_type;
1439
1440         vpool = &vpool_array[dev->vmdq_rx_q];
1441         vq = dev->virtqueue[VIRTIO_RXQ];
1442
1443         do {
1444                 if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx,
1445                                 1) != 1))
1446                         return;
1447                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1448
1449                 desc = &vq->desc[desc_idx];
1450                 if (desc->flags & VRING_DESC_F_NEXT) {
1451                         desc = &vq->desc[desc->next];
1452                         buff_addr = gpa_to_vva(dev, desc->addr);
1453                         phys_addr = gpa_to_hpa(dev, desc->addr, desc->len,
1454                                         &addr_type);
1455                 } else {
1456                         buff_addr = gpa_to_vva(dev,
1457                                         desc->addr + vq->vhost_hlen);
1458                         phys_addr = gpa_to_hpa(dev,
1459                                         desc->addr + vq->vhost_hlen,
1460                                         desc->len, &addr_type);
1461                 }
1462
1463                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1464                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1465                                 " address found when attaching RX frame buffer"
1466                                 " address!\n", dev->device_fh);
1467                         put_desc_to_used_list_zcp(vq, desc_idx);
1468                         continue;
1469                 }
1470
1471                 /*
1472                  * Check if the frame buffer address from guest crosses
1473                  * sub-region or not.
1474                  */
1475                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1476                         RTE_LOG(ERR, VHOST_DATA,
1477                                 "(%"PRIu64") Frame buffer address cross "
1478                                 "sub-regioin found when attaching RX frame "
1479                                 "buffer address!\n",
1480                                 dev->device_fh);
1481                         put_desc_to_used_list_zcp(vq, desc_idx);
1482                         continue;
1483                 }
1484         } while (unlikely(phys_addr == 0));
1485
1486         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1487         if (unlikely(mbuf == NULL)) {
1488                 LOG_DEBUG(VHOST_DATA,
1489                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1490                         "ring_sc_dequeue fail.\n",
1491                         dev->device_fh);
1492                 put_desc_to_used_list_zcp(vq, desc_idx);
1493                 return;
1494         }
1495
1496         if (unlikely(vpool->buf_size > desc->len)) {
1497                 LOG_DEBUG(VHOST_DATA,
1498                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1499                         "length(%d) of descriptor idx: %d less than room "
1500                         "size required: %d\n",
1501                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1502                 put_desc_to_used_list_zcp(vq, desc_idx);
1503                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1504                 return;
1505         }
1506
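             /* Point the host mbuf directly at the guest buffer: rebase
              * buf_addr/buf_physaddr by the headroom and stash the descriptor
              * index in the headroom so it can be recovered on completion. */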
1507         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1508         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1509         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1510         mbuf->data_len = desc->len;
1511         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1512
1513         LOG_DEBUG(VHOST_DATA,
1514                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1515                 "descriptor idx:%d\n",
1516                 dev->device_fh, res_base_idx, desc_idx);
1517
1518         __rte_mbuf_raw_free(mbuf);
1519
1520         return;
1521 }
1522
1523 /*
1524  * Detach an attached packet mbuf -
1525  *  - restore original mbuf address and length values.
1526  *  - reset pktmbuf data and data_len to their default values.
1527  *  All other fields of the given packet mbuf will be left intact.
1528  *
1529  * @param m
1530  *   The attached packet mbuf.
1531  */
1532 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1533 {
1534         const struct rte_mempool *mp = m->pool;
1535         void *buf = RTE_MBUF_TO_BADDR(m);
1536         uint32_t buf_ofs;
1537         uint32_t buf_len = mp->elt_size - sizeof(*m);
1538         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1539
1540         m->buf_addr = buf;
1541         m->buf_len = (uint16_t)buf_len;
1542
1543         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1544                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1545         m->data_off = buf_ofs;
1546
1547         m->data_len = 0;
1548 }
1549
1550 /*
1551  * This function is called after packets have been transmitted. It fetches
1552  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1553  * also updates the used index and kicks the guest if necessary.
1554  */
1555 static inline uint32_t __attribute__((always_inline))
1556 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1557 {
1558         struct rte_mbuf *mbuf;
1559         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1560         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1561         uint32_t index = 0;
1562         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1563
1564         LOG_DEBUG(VHOST_DATA,
1565                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1566                 "clean is: %d\n",
1567                 dev->device_fh, mbuf_count);
1568         LOG_DEBUG(VHOST_DATA,
1569                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1570                 "clean  is : %d\n",
1571                 dev->device_fh, rte_ring_count(vpool->ring));
1572
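             /* For every mbuf currently sitting in the mempool: detach it from
              * the guest buffer, recycle it onto vpool->ring and return the
              * descriptor index saved in its headroom to the used ring. */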
1573         for (index = 0; index < mbuf_count; index++) {
1574                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1575                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1576                         pktmbuf_detach_zcp(mbuf);
1577                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1578
1579                 /* Update used index buffer information. */
1580                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1581                 vq->used->ring[used_idx].len = 0;
1582
1583                 used_idx = (used_idx + 1) & (vq->size - 1);
1584         }
1585
1586         LOG_DEBUG(VHOST_DATA,
1587                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1588                 "clean is: %d\n",
1589                 dev->device_fh, rte_mempool_count(vpool->pool));
1590         LOG_DEBUG(VHOST_DATA,
1591                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1592                 "clean  is : %d\n",
1593                 dev->device_fh, rte_ring_count(vpool->ring));
1594         LOG_DEBUG(VHOST_DATA,
1595                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1596                 "vq->last_used_idx:%d\n",
1597                 dev->device_fh, vq->last_used_idx);
1598
1599         vq->last_used_idx += mbuf_count;
1600
1601         LOG_DEBUG(VHOST_DATA,
1602                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1603                 "vq->last_used_idx:%d\n",
1604                 dev->device_fh, vq->last_used_idx);
1605
1606         rte_compiler_barrier();
1607
1608         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1609
1610         /* Kick guest if required. */
1611         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1612                 eventfd_write((int)vq->kickfd, 1);
1613
1614         return 0;
1615 }
1616
1617 /*
1618  * This function is called when a virtio device is destroyed. It fetches
1619  * mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1620  */
1621 static void mbuf_destroy_zcp(struct vpool *vpool)
1622 {
1623         struct rte_mbuf *mbuf = NULL;
1624         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1625
1626         LOG_DEBUG(VHOST_CONFIG,
1627                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1628                 "mbuf_destroy_zcp is: %d\n",
1629                 mbuf_count);
1630         LOG_DEBUG(VHOST_CONFIG,
1631                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1632                 "mbuf_destroy_zcp  is : %d\n",
1633                 rte_ring_count(vpool->ring));
1634
1635         for (index = 0; index < mbuf_count; index++) {
1636                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1637                 if (likely(mbuf != NULL)) {
1638                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1639                                 pktmbuf_detach_zcp(mbuf);
1640                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1641                 }
1642         }
1643
1644         LOG_DEBUG(VHOST_CONFIG,
1645                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1646                 "mbuf_destroy_zcp is: %d\n",
1647                 rte_mempool_count(vpool->pool));
1648         LOG_DEBUG(VHOST_CONFIG,
1649                 "in mbuf_destroy_zcp: mbuf count in ring after "
1650                 "mbuf_destroy_zcp is : %d\n",
1651                 rte_ring_count(vpool->ring));
1652 }
1653
1654 /*
1655  * This function fills in the virtio header for each packet and updates the used ring for zero copy RX.
1656  */
1657 static inline uint32_t __attribute__((always_inline))
1658 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1659         uint32_t count)
1660 {
1661         struct vhost_virtqueue *vq;
1662         struct vring_desc *desc;
1663         struct rte_mbuf *buff;
1664         /* The virtio_hdr is initialised to 0. */
1665         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1666                 = {{0, 0, 0, 0, 0, 0}, 0};
1667         uint64_t buff_hdr_addr = 0;
1668         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1669         uint32_t head_idx, packet_success = 0;
1670         uint16_t res_cur_idx;
1671
1672         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1673
1674         if (count == 0)
1675                 return 0;
1676
1677         vq = dev->virtqueue[VIRTIO_RXQ];
1678         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1679
1680         res_cur_idx = vq->last_used_idx;
1681         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1682                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1683
1684         /* Retrieve all of the head indexes first to avoid caching issues. */
1685         for (head_idx = 0; head_idx < count; head_idx++)
1686                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1687
1688         /*Prefetch descriptor index. */
1689         rte_prefetch0(&vq->desc[head[packet_success]]);
1690
1691         while (packet_success != count) {
1692                 /* Get descriptor from available ring */
1693                 desc = &vq->desc[head[packet_success]];
1694
1695                 buff = pkts[packet_success];
1696                 LOG_DEBUG(VHOST_DATA,
1697                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1698                         "pkt[%d] descriptor idx: %d\n",
1699                         dev->device_fh, packet_success,
1700                         MBUF_HEADROOM_UINT32(buff));
1701
1702                 PRINT_PACKET(dev,
1703                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1704                         + RTE_PKTMBUF_HEADROOM),
1705                         rte_pktmbuf_data_len(buff), 0);
1706
1707                 /* Buffer address translation for virtio header. */
1708                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1709                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1710
1711                 /*
1712                  * If the descriptors are chained the header and data are
1713                  * placed in separate buffers.
1714                  */
1715                 if (desc->flags & VRING_DESC_F_NEXT) {
1716                         desc->len = vq->vhost_hlen;
1717                         desc = &vq->desc[desc->next];
1718                         desc->len = rte_pktmbuf_data_len(buff);
1719                 } else {
1720                         desc->len = packet_len;
1721                 }
1722
1723                 /* Update used ring with desc information */
1724                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1725                         = head[packet_success];
1726                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1727                         = packet_len;
1728                 res_cur_idx++;
1729                 packet_success++;
1730
1731                 /* A header is required per buffer. */
1732                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1733                         (const void *)&virtio_hdr, vq->vhost_hlen);
1734
1735                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1736
1737                 if (likely(packet_success < count)) {
1738                         /* Prefetch descriptor index. */
1739                         rte_prefetch0(&vq->desc[head[packet_success]]);
1740                 }
1741         }
1742
1743         rte_compiler_barrier();
1744
1745         LOG_DEBUG(VHOST_DATA,
1746                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1747                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1748                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1749
1750         *(volatile uint16_t *)&vq->used->idx += count;
1751         vq->last_used_idx += count;
1752
1753         LOG_DEBUG(VHOST_DATA,
1754                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1755                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1756                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1757
1758         /* Kick the guest if necessary. */
1759         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1760                 eventfd_write((int)vq->kickfd, 1);
1761
1762         return count;
1763 }
1764
1765 /*
1766  * This function routes the TX packet to the correct interface.
1767  * This may be a local device or the physical port.
1768  */
1769 static inline void __attribute__((always_inline))
1770 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1771         uint32_t desc_idx, uint8_t need_copy)
1772 {
1773         struct mbuf_table *tx_q;
1774         struct rte_mbuf **m_table;
1775         struct rte_mbuf *mbuf = NULL;
1776         unsigned len, ret, offset = 0;
1777         struct vpool *vpool;
1778         struct virtio_net_data_ll *dev_ll = ll_root_used;
1779         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1780         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1781
1782         /*Add packet to the port tx queue*/
1783         tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
1784         len = tx_q->len;
1785
1786         /* Allocate an mbuf and populate the structure. */
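             /* TX vpools are stored after the MAX_QUEUES RX vpools in
              * vpool_array, hence the MAX_QUEUES offset. */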
1787         vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
1788         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1789         if (unlikely(mbuf == NULL)) {
1790                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1791                 RTE_LOG(ERR, VHOST_DATA,
1792                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1793                         dev->device_fh);
1794                 put_desc_to_used_list_zcp(vq, desc_idx);
1795                 return;
1796         }
1797
1798         if (vm2vm_mode == VM2VM_HARDWARE) {
1799                 /* Avoid using a vlan tag from any vm for an external pkt, such as
1800                  * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1801                  * selection: the MAC address marks it as an external pkt that
1802                  * should go to the network, while the vlan tag marks it as a
1803                  * vm2vm pkt that should be forwarded to another vm. The hardware
1804                  * cannot resolve such an ambiguity, so the pkt would be lost.
1805                  */
1806                 vlan_tag = external_pkt_default_vlan_tag;
1807                 while (dev_ll != NULL) {
1808                         if (likely(dev_ll->dev->ready == DEVICE_RX) &&
1809                                 ether_addr_cmp(&(pkt_hdr->d_addr),
1810                                 &dev_ll->dev->mac_address)) {
1811
1812                                 /*
1813                                  * Drop the packet if the TX packet is destined
1814                                  * for the TX device.
1815                                  */
1816                                 if (unlikely(dev_ll->dev->device_fh
1817                                         == dev->device_fh)) {
1818                                         LOG_DEBUG(VHOST_DATA,
1819                                         "(%"PRIu64") TX: Source and destination"
1820                                         "MAC addresses are the same. Dropping "
1821                                         "packet.\n",
1822                                         dev_ll->dev->device_fh);
1823                                         MBUF_HEADROOM_UINT32(mbuf)
1824                                                 = (uint32_t)desc_idx;
1825                                         __rte_mbuf_raw_free(mbuf);
1826                                         return;
1827                                 }
1828
1829                                 /*
1830                                  * Packet length offset 4 bytes for HW vlan
1831                                  * strip when L2 switch back.
1832                                  */
1833                                 offset = 4;
1834                                 vlan_tag =
1835                                 (uint16_t)
1836                                 vlan_tags[(uint16_t)dev_ll->dev->device_fh];
1837
1838                                 LOG_DEBUG(VHOST_DATA,
1839                                 "(%"PRIu64") TX: pkt to local VM device id:"
1840                                 "(%"PRIu64") vlan tag: %d.\n",
1841                                 dev->device_fh, dev_ll->dev->device_fh,
1842                                 vlan_tag);
1843
1844                                 break;
1845                         }
1846                         dev_ll = dev_ll->next;
1847                 }
1848         }
1849
1850         mbuf->nb_segs = m->nb_segs;
1851         mbuf->next = m->next;
1852         mbuf->data_len = m->data_len + offset;
1853         mbuf->pkt_len = mbuf->data_len;
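             /* need_copy is only set when the guest buffer crosses a memory
              * sub-region; otherwise the new mbuf simply aliases the guest
              * buffer for true zero copy TX. */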
1854         if (unlikely(need_copy)) {
1855                 /* Copy the packet contents to the mbuf. */
1856                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1857                         rte_pktmbuf_mtod(m, void *),
1858                         m->data_len);
1859         } else {
1860                 mbuf->data_off = m->data_off;
1861                 mbuf->buf_physaddr = m->buf_physaddr;
1862                 mbuf->buf_addr = m->buf_addr;
1863         }
1864         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1865         mbuf->vlan_tci = vlan_tag;
1866         mbuf->l2_len = sizeof(struct ether_hdr);
1867         mbuf->l3_len = sizeof(struct ipv4_hdr);
1868         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1869
1870         tx_q->m_table[len] = mbuf;
1871         len++;
1872
1873         LOG_DEBUG(VHOST_DATA,
1874                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1875                 dev->device_fh,
1876                 mbuf->nb_segs,
1877                 (mbuf->next == NULL) ? "null" : "non-null");
1878
1879         if (enable_stats) {
1880                 dev_statistics[dev->device_fh].tx_total++;
1881                 dev_statistics[dev->device_fh].tx++;
1882         }
1883
1884         if (unlikely(len == MAX_PKT_BURST)) {
1885                 m_table = (struct rte_mbuf **)tx_q->m_table;
1886                 ret = rte_eth_tx_burst(ports[0],
1887                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1888
1889                 /*
1890                  * Free any buffers not handled by TX and update
1891                  * the port stats.
1892                  */
1893                 if (unlikely(ret < len)) {
1894                         do {
1895                                 rte_pktmbuf_free(m_table[ret]);
1896                         } while (++ret < len);
1897                 }
1898
1899                 len = 0;
1900                 txmbuf_clean_zcp(dev, vpool);
1901         }
1902
1903         tx_q->len = len;
1904
1905         return;
1906 }
1907
1908 /*
1909  * This function transmits all available packets in the virtio TX queue for one
1910  * virtio-net device. If it is the first packet, it learns the MAC address and
1911  * sets up VMDQ.
1912  */
1913 static inline void __attribute__((always_inline))
1914 virtio_dev_tx_zcp(struct virtio_net *dev)
1915 {
1916         struct rte_mbuf m;
1917         struct vhost_virtqueue *vq;
1918         struct vring_desc *desc;
1919         uint64_t buff_addr = 0, phys_addr;
1920         uint32_t head[MAX_PKT_BURST];
1921         uint32_t i;
1922         uint16_t free_entries, packet_success = 0;
1923         uint16_t avail_idx;
1924         uint8_t need_copy = 0;
1925         hpa_type addr_type;
1926
1927         vq = dev->virtqueue[VIRTIO_TXQ];
1928         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1929
1930         /* If there are no available buffers then return. */
1931         if (vq->last_used_idx_res == avail_idx)
1932                 return;
1933
1934         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1935
1936         /* Prefetch available ring to retrieve head indexes. */
1937         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1938
1939         /* Get the number of free entries in the ring */
1940         free_entries = (avail_idx - vq->last_used_idx_res);
1941
1942         /* Limit to MAX_PKT_BURST. */
1943         free_entries
1944                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1945
1946         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1947                 dev->device_fh, free_entries);
1948
1949         /* Retrieve all of the head indexes first to avoid caching issues. */
1950         for (i = 0; i < free_entries; i++)
1951                 head[i]
1952                         = vq->avail->ring[(vq->last_used_idx_res + i)
1953                         & (vq->size - 1)];
1954
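             /* Reserve the entries now; vq->last_used_idx and the used ring
              * itself are advanced later (e.g. when the TX mbufs are cleaned). */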
1955         vq->last_used_idx_res += free_entries;
1956
1957         /* Prefetch descriptor index. */
1958         rte_prefetch0(&vq->desc[head[packet_success]]);
1959         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1960
1961         while (packet_success < free_entries) {
1962                 desc = &vq->desc[head[packet_success]];
1963
1964                 /* Discard first buffer as it is the virtio header */
1965                 desc = &vq->desc[desc->next];
1966
1967                 /* Buffer address translation. */
1968                 buff_addr = gpa_to_vva(dev, desc->addr);
1969                 phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);
1970
1971                 if (likely(packet_success < (free_entries - 1)))
1972                         /* Prefetch descriptor index. */
1973                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1974
1975                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1976                         RTE_LOG(ERR, VHOST_DATA,
1977                                 "(%"PRIu64") Invalid frame buffer address found"
1978                                 "when TX packets!\n",
1979                                 dev->device_fh);
1980                         packet_success++;
1981                         continue;
1982                 }
1983
1984                 /* Prefetch buffer address. */
1985                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1986
1987                 /*
1988                  * Setup dummy mbuf. This is copied to a real mbuf if
1989                  * transmitted out the physical port.
1990                  */
1991                 m.data_len = desc->len;
1992                 m.nb_segs = 1;
1993                 m.next = NULL;
1994                 m.data_off = 0;
1995                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1996                 m.buf_physaddr = phys_addr;
1997
1998                 /*
1999                  * Check if the frame buffer address from guest crosses
2000                  * sub-region or not.
2001                  */
2002                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2003                         RTE_LOG(ERR, VHOST_DATA,
2004                                 "(%"PRIu64") Frame buffer address cross "
2005                                 "sub-regioin found when attaching TX frame "
2006                                 "buffer address!\n",
2007                                 dev->device_fh);
2008                         need_copy = 1;
2009                 } else
2010                         need_copy = 0;
2011
2012                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2013
2014                 /*
2015                  * If this is the first received packet we need to learn
2016                  * the MAC and setup VMDQ
2017                  */
2018                 if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) {
2019                         if (dev->remove || (link_vmdq(dev, &m) == -1)) {
2020                                 /*
2021                                  * Discard frame if device is scheduled for
2022                                  * removal or a duplicate MAC address is found.
2023                                  */
2024                                 packet_success += free_entries;
2025                                 vq->last_used_idx += packet_success;
2026                                 break;
2027                         }
2028                 }
2029
2030                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2031                 packet_success++;
2032         }
2033 }
2034
2035 /*
2036  * This function is called by each data core. It handles all RX/TX registered
2037  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2038  * addresses are compared with all devices in the main linked list.
2039  */
2040 static int
2041 switch_worker_zcp(__attribute__((unused)) void *arg)
2042 {
2043         struct virtio_net *dev = NULL;
2044         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2045         struct virtio_net_data_ll *dev_ll;
2046         struct mbuf_table *tx_q;
2047         volatile struct lcore_ll_info *lcore_ll;
2048         const uint64_t drain_tsc
2049                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2050                 * BURST_TX_DRAIN_US;
2051         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2052         unsigned ret;
2053         const uint16_t lcore_id = rte_lcore_id();
2054         uint16_t count_in_ring, rx_count = 0;
2055
2056         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2057
2058         lcore_ll = lcore_info[lcore_id].lcore_ll;
2059         prev_tsc = 0;
2060
2061         while (1) {
2062                 cur_tsc = rte_rdtsc();
2063
2064                 /* TX burst queue drain */
2065                 diff_tsc = cur_tsc - prev_tsc;
2066                 if (unlikely(diff_tsc > drain_tsc)) {
2067                         /*
2068                          * Get mbuf from vpool.pool and detach mbuf and
2069                          * put back into vpool.ring.
2070                          */
2071                         dev_ll = lcore_ll->ll_root_used;
2072                         while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2073                                 /* Get virtio device ID */
2074                                 dev = dev_ll->dev;
2075
2076                                 if (likely(!dev->remove)) {
2077                                         tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2078                                         if (tx_q->len) {
2079                                                 LOG_DEBUG(VHOST_DATA,
2080                                                 "TX queue drained after timeout"
2081                                                 " with burst size %u\n",
2082                                                 tx_q->len);
2083
2084                                                 /*
2085                                                  * Tx any packets in the queue
2086                                                  */
2087                                                 ret = rte_eth_tx_burst(
2088                                                         ports[0],
2089                                                         (uint16_t)tx_q->txq_id,
2090                                                         (struct rte_mbuf **)
2091                                                         tx_q->m_table,
2092                                                         (uint16_t)tx_q->len);
2093                                                 if (unlikely(ret < tx_q->len)) {
2094                                                         do {
2095                                                                 rte_pktmbuf_free(
2096                                                                         tx_q->m_table[ret]);
2097                                                         } while (++ret < tx_q->len);
2098                                                 }
2099                                                 tx_q->len = 0;
2100
2101                                                 txmbuf_clean_zcp(dev,
2102                                                         &vpool_array[MAX_QUEUES+dev->vmdq_rx_q]);
2103                                         }
2104                                 }
2105                                 dev_ll = dev_ll->next;
2106                         }
2107                         prev_tsc = cur_tsc;
2108                 }
2109
2110                 rte_prefetch0(lcore_ll->ll_root_used);
2111
2112                 /*
2113                  * Inform the configuration core that we have exited the linked
2114                  * list and that no devices are in use if requested.
2115                  */
2116                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2117                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2118
2119                 /* Process devices */
2120                 dev_ll = lcore_ll->ll_root_used;
2121
2122                 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2123                         dev = dev_ll->dev;
2124                         if (unlikely(dev->remove)) {
2125                                 dev_ll = dev_ll->next;
2126                                 unlink_vmdq(dev);
2127                                 dev->ready = DEVICE_SAFE_REMOVE;
2128                                 continue;
2129                         }
2130
2131                         if (likely(dev->ready == DEVICE_RX)) {
2132                                 uint32_t index = dev->vmdq_rx_q;
2133                                 uint16_t i;
2134                                 count_in_ring
2135                                 = rte_ring_count(vpool_array[index].ring);
2136                                 uint16_t free_entries
2137                                 = (uint16_t)get_available_ring_num_zcp(dev);
2138
2139                                 /*
2140                                  * Attach all mbufs in vpool.ring and put back
2141                                  * into vpool.pool.
2142                                  */
2143                                 for (i = 0;
2144                                 i < RTE_MIN(free_entries,
2145                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2146                                 i++)
2147                                         attach_rxmbuf_zcp(dev);
2148
2149                                 /* Handle guest RX */
2150                                 rx_count = rte_eth_rx_burst(ports[0],
2151                                         (uint16_t)dev->vmdq_rx_q, pkts_burst,
2152                                         MAX_PKT_BURST);
2153
2154                                 if (rx_count) {
2155                                         ret_count = virtio_dev_rx_zcp(dev,
2156                                                         pkts_burst, rx_count);
2157                                         if (enable_stats) {
2158                                                 dev_statistics[dev->device_fh].rx_total
2159                                                         += rx_count;
2160                                                 dev_statistics[dev->device_fh].rx
2161                                                         += ret_count;
2162                                         }
2163                                         while (likely(rx_count)) {
2164                                                 rx_count--;
2165                                                 pktmbuf_detach_zcp(
2166                                                         pkts_burst[rx_count]);
2167                                                 rte_ring_sp_enqueue(
2168                                                         vpool_array[index].ring,
2169                                                         (void *)pkts_burst[rx_count]);
2170                                         }
2171                                 }
2172                         }
2173
2174                         if (likely(!dev->remove))
2175                                 /* Handle guest TX */
2176                                 virtio_dev_tx_zcp(dev);
2177
2178                         /* Move to the next device in the list */
2179                         dev_ll = dev_ll->next;
2180                 }
2181         }
2182
2183         return 0;
2184 }
2185
2186
2187 /*
2188  * Add an entry to a used linked list. A free entry must first be found
2189  * in the free linked list using get_data_ll_free_entry();
2190  */
2191 static void
2192 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2193         struct virtio_net_data_ll *ll_dev)
2194 {
2195         struct virtio_net_data_ll *ll = *ll_root_addr;
2196
2197         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2198         ll_dev->next = NULL;
2199         rte_compiler_barrier();
2200
2201         /* If ll == NULL then this is the first device. */
2202         if (ll) {
2203                 /* Increment to the tail of the linked list. */
2204                 while ((ll->next != NULL) )
2205                         ll = ll->next;
2206
2207                 ll->next = ll_dev;
2208         } else {
2209                 *ll_root_addr = ll_dev;
2210         }
2211 }
2212
2213 /*
2214  * Remove an entry from a used linked list. The entry must then be added to
2215  * the free linked list using put_data_ll_free_entry().
2216  */
2217 static void
2218 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2219         struct virtio_net_data_ll *ll_dev,
2220         struct virtio_net_data_ll *ll_dev_last)
2221 {
2222         struct virtio_net_data_ll *ll = *ll_root_addr;
2223
2224         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2225                 return;
2226
2227         if (ll_dev == ll)
2228                 *ll_root_addr = ll_dev->next;
2229         else
2230                 if (likely(ll_dev_last != NULL))
2231                         ll_dev_last->next = ll_dev->next;
2232                 else
2233                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2234 }
2235
2236 /*
2237  * Find and return an entry from the free linked list.
2238  */
2239 static struct virtio_net_data_ll *
2240 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2241 {
2242         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2243         struct virtio_net_data_ll *ll_dev;
2244
2245         if (ll_free == NULL)
2246                 return NULL;
2247
2248         ll_dev = ll_free;
2249         *ll_root_addr = ll_free->next;
2250
2251         return ll_dev;
2252 }
2253
2254 /*
2255  * Place an entry back on to the free linked list.
2256  */
2257 static void
2258 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2259         struct virtio_net_data_ll *ll_dev)
2260 {
2261         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2262
2263         if (ll_dev == NULL)
2264                 return;
2265
2266         ll_dev->next = ll_free;
2267         *ll_root_addr = ll_dev;
2268 }
2269
2270 /*
2271  * Creates a linked list of a given size.
2272  */
2273 static struct virtio_net_data_ll *
2274 alloc_data_ll(uint32_t size)
2275 {
2276         struct virtio_net_data_ll *ll_new;
2277         uint32_t i;
2278
2279         /* Malloc and then chain the linked list. */
2280         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2281         if (ll_new == NULL) {
2282                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2283                 return NULL;
2284         }
2285
2286         for (i = 0; i < size - 1; i++) {
2287                 ll_new[i].dev = NULL;
2288                 ll_new[i].next = &ll_new[i+1];
2289         }
2290         ll_new[i].next = NULL;
2291
2292         return (ll_new);
2293 }
2294
2295 /*
2296  * Create the main linked list along with each individual core's linked list. A used and a free list
2297  * are created to manage entries.
2298  */
2299 static int
2300 init_data_ll (void)
2301 {
2302         int lcore;
2303
2304         RTE_LCORE_FOREACH_SLAVE(lcore) {
2305                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2306                 if (lcore_info[lcore].lcore_ll == NULL) {
2307                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2308                         return -1;
2309                 }
2310
2311                 lcore_info[lcore].lcore_ll->device_num = 0;
2312                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2313                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
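                     /* Give each core enough free entries for its share of the
                      * devices, rounding up when they do not divide evenly. */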
2314                 if (num_devices % num_switching_cores)
2315                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2316                 else
2317                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2318         }
2319
2320         /* Allocate devices up to a maximum of MAX_DEVICES. */
2321         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2322
2323         return 0;
2324 }
2325
2326 /*
2327  * Set virtqueue flags so that we do not receive interrupts.
2328  */
2329 static void
2330 set_irq_status (struct virtio_net *dev)
2331 {
2332         dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2333         dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2334 }
2335
2336 /*
2337  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2338  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2339  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2340  */
2341 static void
2342 destroy_device (volatile struct virtio_net *dev)
2343 {
2344         struct virtio_net_data_ll *ll_lcore_dev_cur;
2345         struct virtio_net_data_ll *ll_main_dev_cur;
2346         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2347         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2348         int lcore;
2349
2350         dev->flags &= ~VIRTIO_DEV_RUNNING;
2351
2352         /*set the remove flag. */
2353         dev->remove = 1;
2354
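             /* Wait for the data core to finish with the device; the worker
              * sets ready to DEVICE_SAFE_REMOVE once it has unlinked the
              * device from VMDQ. */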
2355         while(dev->ready != DEVICE_SAFE_REMOVE) {
2356                 rte_pause();
2357         }
2358
2359         /* Search for entry to be removed from lcore ll */
2360         ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
2361         while (ll_lcore_dev_cur != NULL) {
2362                 if (ll_lcore_dev_cur->dev == dev) {
2363                         break;
2364                 } else {
2365                         ll_lcore_dev_last = ll_lcore_dev_cur;
2366                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2367                 }
2368         }
2369
2370         if (ll_lcore_dev_cur == NULL) {
2371                 RTE_LOG(ERR, VHOST_CONFIG,
2372                         "(%"PRIu64") Failed to find the dev to be destroy.\n",
2373                         dev->device_fh);
2374                 return;
2375         }
2376
2377         /* Search for entry to be removed from main ll */
2378         ll_main_dev_cur = ll_root_used;
2379         ll_main_dev_last = NULL;
2380         while (ll_main_dev_cur != NULL) {
2381                 if (ll_main_dev_cur->dev == dev) {
2382                         break;
2383                 } else {
2384                         ll_main_dev_last = ll_main_dev_cur;
2385                         ll_main_dev_cur = ll_main_dev_cur->next;
2386                 }
2387         }
2388
2389         /* Remove entries from the lcore and main ll. */
2390         rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2391         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2392
2393         /* Set the dev_removal_flag on each lcore. */
2394         RTE_LCORE_FOREACH_SLAVE(lcore) {
2395                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2396         }
2397
2398         /*
2399          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2400          * they can no longer access the device removed from the linked lists and that the devices
2401          * are no longer in use.
2402          */
2403         RTE_LCORE_FOREACH_SLAVE(lcore) {
2404                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2405                         rte_pause();
2406                 }
2407         }
2408
2409         /* Add the entries back to the lcore and main free ll.*/
2410         put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2411         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2412
2413         /* Decrement number of device on the lcore. */
2414         lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
2415
2416         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2417
2418         if (zero_copy) {
2419                 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2420
2421                 /* Stop the RX queue. */
2422                 if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
2423                         LOG_DEBUG(VHOST_CONFIG,
2424                                 "(%"PRIu64") In destroy_device: Failed to stop "
2425                                 "rx queue:%d\n",
2426                                 dev->device_fh,
2427                                 dev->vmdq_rx_q);
2428                 }
2429
2430                 LOG_DEBUG(VHOST_CONFIG,
2431                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2432                         "mempool back to ring for RX queue: %d\n",
2433                         dev->device_fh, dev->vmdq_rx_q);
2434
2435                 mbuf_destroy_zcp(vpool);
2436
2437                 /* Stop the TX queue. */
2438                 if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
2439                         LOG_DEBUG(VHOST_CONFIG,
2440                                 "(%"PRIu64") In destroy_device: Failed to "
2441                                 "stop tx queue:%d\n",
2442                                 dev->device_fh, dev->vmdq_rx_q);
2443                 }
2444
2445                 vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES];
2446
2447                 LOG_DEBUG(VHOST_CONFIG,
2448                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2449                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2450                         dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES),
2451                         dev->device_fh);
2452
2453                 mbuf_destroy_zcp(vpool);
2454         }
2455
2456 }
2457
2458 /*
2459  * A new device is added to a data core. First the device is added to the main linked list
2460  * and then allocated to a specific data core.
2461  */
2462 static int
2463 new_device (struct virtio_net *dev)
2464 {
2465         struct virtio_net_data_ll *ll_dev;
2466         int lcore, core_add = 0;
2467         uint32_t device_num_min = num_devices;
2468
2469         /* Add device to main ll */
2470         ll_dev = get_data_ll_free_entry(&ll_root_free);
2471         if (ll_dev == NULL) {
2472                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2473                         "of %d devices per core has been reached\n",
2474                         dev->device_fh, num_devices);
2475                 return -1;
2476         }
2477         ll_dev->dev = dev;
2478         add_data_ll_entry(&ll_root_used, ll_dev);
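             /* Each device owns a dedicated VMDQ RX queue, derived from its
              * device id and the number of queues available per device. */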
2479         ll_dev->dev->vmdq_rx_q
2480                 = ll_dev->dev->device_fh * (num_queues / num_devices);
2481
2482         if (zero_copy) {
2483                 uint32_t index = ll_dev->dev->vmdq_rx_q;
2484                 uint32_t count_in_ring, i;
2485                 struct mbuf_table *tx_q;
2486
2487                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2488
2489                 LOG_DEBUG(VHOST_CONFIG,
2490                         "(%"PRIu64") in new_device: mbuf count in mempool "
2491                         "before attach is: %d\n",
2492                         dev->device_fh,
2493                         rte_mempool_count(vpool_array[index].pool));
2494                 LOG_DEBUG(VHOST_CONFIG,
2495                         "(%"PRIu64") in new_device: mbuf count in  ring "
2496                         "before attach  is : %d\n",
2497                         dev->device_fh, count_in_ring);
2498
2499                 /*
2500                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2501                  */
2502                 for (i = 0; i < count_in_ring; i++)
2503                         attach_rxmbuf_zcp(dev);
2504
2505                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2506                         "mempool after attach is: %d\n",
2507                         dev->device_fh,
2508                         rte_mempool_count(vpool_array[index].pool));
2509                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2510                         "ring after attach  is : %d\n",
2511                         dev->device_fh,
2512                         rte_ring_count(vpool_array[index].ring));
2513
2514                 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2515                 tx_q->txq_id = dev->vmdq_rx_q;
2516
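                /*
                 * The RX/TX queues were configured with deferred start (set in
                 * MAIN when zero copy is enabled), so start them only now that
                 * guest mbufs are available for this queue.
                 */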
2517                 if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
2518                         struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2519
2520                         LOG_DEBUG(VHOST_CONFIG,
2521                                 "(%"PRIu64") In new_device: Failed to start "
2522                                 "tx queue:%d\n",
2523                                 dev->device_fh, dev->vmdq_rx_q);
2524
2525                         mbuf_destroy_zcp(vpool);
2526                         return -1;
2527                 }
2528
2529                 if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
2530                         struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2531
2532                         LOG_DEBUG(VHOST_CONFIG,
2533                                 "(%"PRIu64") In new_device: Failed to start "
2534                                 "rx queue:%d\n",
2535                                 dev->device_fh, dev->vmdq_rx_q);
2536
2537                         /* Stop the TX queue. */
2538                         if (rte_eth_dev_tx_queue_stop(ports[0],
2539                                 dev->vmdq_rx_q) != 0) {
2540                                 LOG_DEBUG(VHOST_CONFIG,
2541                                         "(%"PRIu64") In new_device: Failed to "
2542                                         "stop tx queue:%d\n",
2543                                         dev->device_fh, dev->vmdq_rx_q);
2544                         }
2545
2546                         mbuf_destroy_zcp(vpool);
2547                         return -1;
2548                 }
2549
2550         }
2551
2552         /* Reset the ready flag. */
2553         dev->ready = DEVICE_MAC_LEARNING;
2554         dev->remove = 0;
2555
2556         /* Assign the device to the data core currently handling the fewest devices. */
2557         RTE_LCORE_FOREACH_SLAVE(lcore) {
2558                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2559                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2560                         core_add = lcore;
2561                 }
2562         }
2563         /* Add device to lcore ll */
2564         ll_dev->dev->coreid = core_add;
2565         ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2566         if (ll_dev == NULL) {
2567                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2568                 dev->ready = DEVICE_SAFE_REMOVE;
2569                 destroy_device(dev);
2570                 return -1;
2571         }
2572         ll_dev->dev = dev;
2573         add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2574
2575         /* Initialize device stats */
2576         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2577
2578         /* Disable notifications. */
2579         set_irq_status(dev);
2580         lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
2581         dev->flags |= VIRTIO_DEV_RUNNING;
2582
2583         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
2584
2585         return 0;
2586 }
2587
2588 /*
2589  * These callbacks allow devices to be added to a data core when their
2590  * configuration has been fully completed.
2591  */
2592 static const struct virtio_net_device_ops virtio_net_device_ops =
2593 {
2594         .new_device =  new_device,
2595         .destroy_device = destroy_device,
2596 };
2597
2598 /*
2599  * This thread wakes up periodically to print statistics, if the user has
2600  * enabled them.
2601  */
2602 static void *
2603 print_stats(__attribute__((unused)) void *arg)
2604 {
2605         struct virtio_net_data_ll *dev_ll;
2606         uint64_t tx_dropped, rx_dropped;
2607         uint64_t tx, tx_total, rx, rx_total;
2608         uint32_t device_fh;
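        /*
         * ANSI escape sequences: clr clears the screen (ESC[2J) and top_left
         * moves the cursor to row 1, column 1 (ESC[1;1H).
         */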
2609         const char clr[] = { 27, '[', '2', 'J', '\0' };
2610         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2611
2612         while(1) {
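                /*
                 * enable_stats holds the interval requested on the command
                 * line, so it doubles as the refresh period in seconds.
                 */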
2613                 sleep(enable_stats);
2614
2615                 /* Clear screen and move to top left */
2616                 printf("%s%s", clr, top_left);
2617
2618                 printf("\nDevice statistics ====================================");
2619
2620                 dev_ll = ll_root_used;
2621                 while (dev_ll != NULL) {
2622                         device_fh = (uint32_t)dev_ll->dev->device_fh;
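                        /*
                         * Dropped counts are derived: total offered minus
                         * successfully delivered. RX counters are rte_atomic64
                         * in copy mode and plain uint64_t in zero-copy mode.
                         */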
2623                         tx_total = dev_statistics[device_fh].tx_total;
2624                         tx = dev_statistics[device_fh].tx;
2625                         tx_dropped = tx_total - tx;
2626                         if (zero_copy == 0) {
2627                                 rx_total = rte_atomic64_read(
2628                                         &dev_statistics[device_fh].rx_total_atomic);
2629                                 rx = rte_atomic64_read(
2630                                         &dev_statistics[device_fh].rx_atomic);
2631                         } else {
2632                                 rx_total = dev_statistics[device_fh].rx_total;
2633                                 rx = dev_statistics[device_fh].rx;
2634                         }
2635                         rx_dropped = rx_total - rx;
2636
2637                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2638                                         "\nTX total:            %"PRIu64""
2639                                         "\nTX dropped:          %"PRIu64""
2640                                         "\nTX successful:               %"PRIu64""
2641                                         "\nRX total:            %"PRIu64""
2642                                         "\nRX dropped:          %"PRIu64""
2643                                         "\nRX successful:               %"PRIu64"",
2644                                         device_fh,
2645                                         tx_total,
2646                                         tx_dropped,
2647                                         tx,
2648                                         rx_total,
2649                                         rx_dropped,
2650                                         rx);
2651
2652                         dev_ll = dev_ll->next;
2653                 }
2654                 printf("\n======================================================\n");
2655         }
2656 }
2657
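/*
 * For zero copy, create one mempool and one ring per queue slot in
 * vpool_array. The ring holds mbufs that are not currently attached to a
 * guest descriptor; buf_size records the usable data room of each mbuf
 * (headroom excluded).
 */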
2658 static void
2659 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2660         char *ring_name, uint32_t nb_mbuf)
2661 {
2662         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2663         vpool_array[index].pool
2664                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2665                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2666                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2667                 rte_pktmbuf_init, NULL, socket, 0);
2668         if (vpool_array[index].pool != NULL) {
2669                 vpool_array[index].ring
2670                         = rte_ring_create(ring_name,
2671                                 rte_align32pow2(nb_mbuf + 1),
2672                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2673                 if (likely(vpool_array[index].ring != NULL)) {
2674                         LOG_DEBUG(VHOST_CONFIG,
2675                                 "in setup_mempool_tbl: mbuf count in "
2676                                 "mempool is: %d\n",
2677                                 rte_mempool_count(vpool_array[index].pool));
2678                         LOG_DEBUG(VHOST_CONFIG,
2679                                 "in setup_mempool_tbl: mbuf count in "
2680                                 "ring is: %d\n",
2681                                 rte_ring_count(vpool_array[index].ring));
2682                 } else {
2683                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2684                                 ring_name);
2685                 }
2686
2687                 /* The usable buffer size excludes the mbuf headroom. */
2688                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2689         } else {
2690                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2691         }
2692 }
2693
2694
2695 /*
2696  * Main function: performs initialisation and launches the per-lcore functions.
2697  * The CUSE device is also registered here to handle the IOCTLs.
2698  */
2699 int
2700 MAIN(int argc, char *argv[])
2701 {
2702         struct rte_mempool *mbuf_pool = NULL;
2703         unsigned lcore_id, core_id = 0;
2704         unsigned nb_ports, valid_num_ports;
2705         int ret;
2706         uint8_t portid, queue_id = 0;
2707         static pthread_t tid;
2708
2709         /* init EAL */
2710         ret = rte_eal_init(argc, argv);
2711         if (ret < 0)
2712                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2713         argc -= ret;
2714         argv += ret;
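        /*
         * rte_eal_init() consumed the EAL arguments; what remains in argv is
         * parsed by us_vhost_parse_args() below.
         */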
2715
2716         /* parse app arguments */
2717         ret = us_vhost_parse_args(argc, argv);
2718         if (ret < 0)
2719                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2720
2721         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2722                 if (rte_lcore_is_enabled(lcore_id))
2723                         lcore_ids[core_id ++] = lcore_id;
2724
2725         if (rte_lcore_count() > RTE_MAX_LCORE)
2726                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
2727
2728         /* Set the number of switching cores; the master lcore is reserved for the CUSE session loop. */
2729         num_switching_cores = rte_lcore_count()-1;
2730
2731         /* Get the number of physical ports. */
2732         nb_ports = rte_eth_dev_count();
2733         if (nb_ports > RTE_MAX_ETHPORTS)
2734                 nb_ports = RTE_MAX_ETHPORTS;
2735
2736         /*
2737          * Update the global variable num_ports and the global array ports[],
2738          * and derive valid_num_ports from the number of ports found on the system.
2739          */
2740         valid_num_ports = check_ports_num(nb_ports);
2741
2742         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2743                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2744                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2745                 return -1;
2746         }
2747
2748         if (zero_copy == 0) {
2749                 /* Create the mbuf pool. */
2750                 mbuf_pool = rte_mempool_create(
2751                                 "MBUF_POOL",
2752                                 NUM_MBUFS_PER_PORT
2753                                 * valid_num_ports,
2754                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2755                                 sizeof(struct rte_pktmbuf_pool_private),
2756                                 rte_pktmbuf_pool_init, NULL,
2757                                 rte_pktmbuf_init, NULL,
2758                                 rte_socket_id(), 0);
2759                 if (mbuf_pool == NULL)
2760                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2761
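                /*
                 * In copy mode all queues share the single mbuf pool; the
                 * per-queue rings in vpool_array are only created for zero
                 * copy (see setup_mempool_tbl()) and are left unset here.
                 */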
2762                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2763                         vpool_array[queue_id].pool = mbuf_pool;
2764
2765                 if (vm2vm_mode == VM2VM_HARDWARE) {
2766                         /* Enable VT loop back to let L2 switch to do it. */
2767                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2768                         LOG_DEBUG(VHOST_CONFIG,
2769                                 "Enable loop back for L2 switch in vmdq.\n");
2770                 }
2771         } else {
2772                 uint32_t nb_mbuf;
2773                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2774                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2775
2776                 /*
2777                  * Zero copy defers RX/TX queue start until the guest has
2778                  * finished its startup and the packet buffers from that guest
2779                  * are available.
2780                  */
2781                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2782                 rx_conf_default.rx_drop_en = 0;
2783                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2784                 nb_mbuf = num_rx_descriptor
2785                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2786                         + num_switching_cores * MAX_PKT_BURST;
2787
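                /*
                 * Create one RX mempool/ring pair per VMDQ queue, sized to
                 * cover the RX descriptor ring plus per-core bursts and the
                 * per-core mempool cache.
                 */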
2788                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2789                         snprintf(pool_name, sizeof(pool_name),
2790                                 "rxmbuf_pool_%u", queue_id);
2791                         snprintf(ring_name, sizeof(ring_name),
2792                                 "rxmbuf_ring_%u", queue_id);
2793                         setup_mempool_tbl(rte_socket_id(), queue_id,
2794                                 pool_name, ring_name, nb_mbuf);
2795                 }
2796
2797                 nb_mbuf = num_tx_descriptor
2798                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2799                                 + num_switching_cores * MAX_PKT_BURST;
2800
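                /*
                 * TX pools occupy the upper half of vpool_array: a device's TX
                 * pool lives at index vmdq_rx_q + MAX_QUEUES, matching the
                 * lookup in destroy_device().
                 */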
2801                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2802                         snprintf(pool_name, sizeof(pool_name),
2803                                 "txmbuf_pool_%u", queue_id);
2804                         snprintf(ring_name, sizeof(ring_name),
2805                                 "txmbuf_ring_%u", queue_id);
2806                         setup_mempool_tbl(rte_socket_id(),
2807                                 (queue_id + MAX_QUEUES),
2808                                 pool_name, ring_name, nb_mbuf);
2809                 }
2810
2811                 if (vm2vm_mode == VM2VM_HARDWARE) {
2812                         /* Enable VT loop back to let L2 switch to do it. */
2813                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2814                         LOG_DEBUG(VHOST_CONFIG,
2815                                 "Enable loop back for L2 switch in vmdq.\n");
2816                 }
2817         }
2818         /* Set log level. */
2819         rte_set_log_level(LOG_LEVEL);
2820
2821         /* initialize all ports */
2822         for (portid = 0; portid < nb_ports; portid++) {
2823                 /* skip ports that are not enabled */
2824                 if ((enabled_port_mask & (1 << portid)) == 0) {
2825                         RTE_LOG(INFO, VHOST_PORT,
2826                                 "Skipping disabled port %d\n", portid);
2827                         continue;
2828                 }
2829                 if (port_init(portid) != 0)
2830                         rte_exit(EXIT_FAILURE,
2831                                 "Cannot initialize network ports\n");
2832         }
2833
2834         /* Initialise all linked lists. */
2835         if (init_data_ll() == -1)
2836                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2837
2838         /* Initialize device stats */
2839         memset(&dev_statistics, 0, sizeof(dev_statistics));
2840
2841         /* Enable stats if the user option is set. */
2842         if (enable_stats)
2843                 pthread_create(&tid, NULL, print_stats, NULL);
2844
2845         /* Launch all data cores. */
2846         if (zero_copy == 0) {
2847                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2848                         rte_eal_remote_launch(switch_worker,
2849                                 mbuf_pool, lcore_id);
2850                 }
2851         } else {
2852                 uint32_t count_in_mempool, index, i;
2853                 for (index = 0; index < 2*MAX_QUEUES; index++) {
2854                         /* For all RX and TX queues. */
2855                         count_in_mempool
2856                                 = rte_mempool_count(vpool_array[index].pool);
2857
2858                         /*
2859                          * Transfer all unattached mbufs from vpool.pool
2860                          * to vpool.ring.
2861                          */
2862                         for (i = 0; i < count_in_mempool; i++) {
2863                                 struct rte_mbuf *mbuf
2864                                         = __rte_mbuf_raw_alloc(
2865                                                 vpool_array[index].pool);
2866                                 rte_ring_sp_enqueue(vpool_array[index].ring,
2867                                                 (void *)mbuf);
2868                         }
2869
2870                         LOG_DEBUG(VHOST_CONFIG,
2871                                 "in MAIN: initial mbuf count in mempool "
2872                                 "is: %d\n", count_in_mempool);
2873                         LOG_DEBUG(VHOST_CONFIG,
2874                                 "in MAIN: initial mbuf count in ring is:"
2875                                 " %d\n",
2876                                 rte_ring_count(vpool_array[index].ring));
2877                 }
2878
2879                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
2880                         rte_eal_remote_launch(switch_worker_zcp, NULL,
2881                                 lcore_id);
2882         }
2883
2884         /* Register CUSE device to handle IOCTLs. */
2885         ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks());
2886         if (ret != 0)
2887                 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
2888
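        /*
         * Register the new_device/destroy_device callbacks defined above with
         * the vhost library.
         */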
2889         init_virtio_net(&virtio_net_device_ops);
2890
2891         /* Start the CUSE session loop on the master lcore to service vhost IOCTLs. */
2892         start_cuse_session_loop();
2893         return 0;
2894
2895 }
2896