examples/vhost: add vhost dev struct
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52
53 #include "main.h"
54 #include "virtio-net.h"
55 #include "vhost-net-cdev.h"
56
57 #define MAX_QUEUES 128
58
59 /* the maximum number of external ports supported */
60 #define MAX_SUP_PORTS 1
61
62 /*
63  * Calculate the number of buffers needed per port
64  */
65 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +             \
66                                                         (num_switching_cores*MAX_PKT_BURST) +                   \
67                                                         (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
68                                                         (num_switching_cores*MBUF_CACHE_SIZE))
69
70 #define MBUF_CACHE_SIZE 128
71 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
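/*
 * Worked example (illustrative only, assuming two switching cores):
 * NUM_MBUFS_PER_PORT = 128*1024 + 2*32 + 2*512 + 2*128
 *                    = 131072 + 64 + 1024 + 256 = 132416 mbufs per port,
 * each of MBUF_SIZE bytes (2048B data room + mbuf header + headroom).
 */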
72
73 /*
74  * No frame data buffers allocated from the host are required for the zero
75  * copy implementation; the guest allocates the frame data buffers and vhost
76  * uses them directly.
77  */
78 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
79 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
80         + RTE_PKTMBUF_HEADROOM)
81 #define MBUF_CACHE_SIZE_ZCP 0
82
83 /*
84  * RX and TX Prefetch, Host, and Write-back threshold values should be
85  * carefully set for optimal performance. Consult the network
86  * controller's datasheet and supporting DPDK documentation for guidance
87  * on how these parameters should be set.
88  */
89 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
90 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
91 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
92
93 /*
94  * These default values are optimized for use with the Intel(R) 82599 10 GbE
95  * Controller and the DPDK ixgbe PMD. Consider using other values for other
96  * network controllers and/or network drivers.
97  */
98 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
99 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
100 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
101
102 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
103 #define MAX_MRG_PKT_BURST 16    /* Max burst for merge buffers. Set to 1 due to performance issue. */
104 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
105
106 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
107 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
108
109 #define JUMBO_FRAME_MAX_SIZE    0x2600
110
111 /* State of virtio device. */
112 #define DEVICE_MAC_LEARNING 0
113 #define DEVICE_RX                       1
114 #define DEVICE_SAFE_REMOVE      2
115
116 /* Config_core_flag status definitions. */
117 #define REQUEST_DEV_REMOVAL 1
118 #define ACK_DEV_REMOVAL 0
119
120 /* Configurable number of RX/TX ring descriptors */
121 #define RTE_TEST_RX_DESC_DEFAULT 1024
122 #define RTE_TEST_TX_DESC_DEFAULT 512
123
124 /*
125  * These 2 macros need refining for the legacy and DPDK based front ends:
126  * max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
127  * then adjusted to a power of 2.
128  */
129 /*
130  * For the legacy front end there are 128 descriptors:
131  * half for virtio headers, the other half for mbufs.
132  */
133 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
134 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
135
136 /* Get first 4 bytes in mbuf headroom. */
137 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
138                 + sizeof(struct rte_mbuf)))
139
140 /* true if x is a power of 2 */
141 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
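/*
 * For example, POWEROF2(64) is true and POWEROF2(48) is false; note that the
 * expression also evaluates true for x == 0.
 */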
142
143 #define INVALID_PORT_ID 0xFF
144
145 /* Max number of devices. Limited by vmdq. */
146 #define MAX_DEVICES 64
147
148 /* Size of buffers used for snprintfs. */
149 #define MAX_PRINT_BUFF 6072
150
151 /* Maximum character device basename size. */
152 #define MAX_BASENAME_SZ 10
153
154 /* Maximum long option length for option parsing. */
155 #define MAX_LONG_OPT_SZ 64
156
157 /* Used to compare MAC addresses. */
158 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
159
160 /* Number of descriptors per cacheline. */
161 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
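/*
 * For example, with a 64 byte cache line and a 16 byte struct vring_desc
 * (the usual case), DESC_PER_CACHELINE evaluates to 4.
 */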
162
163 /* mask of enabled ports */
164 static uint32_t enabled_port_mask = 0;
165
166 /* Number of switching cores enabled. */
167 static uint32_t num_switching_cores = 0;
168
169 /* Number of devices/queues to support. */
170 static uint32_t num_queues = 0;
171 uint32_t num_devices = 0;
172
173 /*
174  * Enable zero copy: packet buffers DMA directly to/from the HW descriptors.
175  * Disabled by default.
176  */
177 static uint32_t zero_copy;
178
179 /* Number of descriptors to use. */
180 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
181 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
182
183 /* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
184 #define MAX_RING_DESC 4096
185
186 struct vpool {
187         struct rte_mempool *pool;
188         struct rte_ring *ring;
189         uint32_t buf_size;
190 } vpool_array[MAX_QUEUES+MAX_QUEUES];
191
192 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
193 typedef enum {
194         VM2VM_DISABLED = 0,
195         VM2VM_SOFTWARE = 1,
196         VM2VM_HARDWARE = 2,
197         VM2VM_LAST
198 } vm2vm_type;
199 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
200
201 /* The type of host physical address translated from guest physical address. */
202 typedef enum {
203         PHYS_ADDR_CONTINUOUS = 0,
204         PHYS_ADDR_CROSS_SUBREG = 1,
205         PHYS_ADDR_INVALID = 2,
206         PHYS_ADDR_LAST
207 } hpa_type;
208
209 /* Enable stats. */
210 static uint32_t enable_stats = 0;
211 /* Enable retries on RX. */
212 static uint32_t enable_retry = 1;
213 /* Specify timeout (in microseconds) between retries on RX. */
214 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
215 /* Specify the number of retries on RX. */
216 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
217
218 /* Character device basename. Can be set by user. */
219 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
220
221 /* Character device index. Can be set by user. */
222 static uint32_t dev_index = 0;
223
224 /* This can be set by the user so it is made available here. */
225 extern uint64_t VHOST_FEATURES;
226
227 /* Default configuration for rx and tx thresholds etc. */
228 static struct rte_eth_rxconf rx_conf_default = {
229         .rx_thresh = {
230                 .pthresh = RX_PTHRESH,
231                 .hthresh = RX_HTHRESH,
232                 .wthresh = RX_WTHRESH,
233         },
234         .rx_drop_en = 1,
235 };
236
237 /*
238  * These default values are optimized for use with the Intel(R) 82599 10 GbE
239  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
240  * network controllers and/or network drivers.
241  */
242 static struct rte_eth_txconf tx_conf_default = {
243         .tx_thresh = {
244                 .pthresh = TX_PTHRESH,
245                 .hthresh = TX_HTHRESH,
246                 .wthresh = TX_WTHRESH,
247         },
248         .tx_free_thresh = 0, /* Use PMD default values */
249         .tx_rs_thresh = 0, /* Use PMD default values */
250 };
251
252 /* Empty VMDQ configuration structure. Filled in programmatically. */
253 static struct rte_eth_conf vmdq_conf_default = {
254         .rxmode = {
255                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
256                 .split_hdr_size = 0,
257                 .header_split   = 0, /**< Header Split disabled */
258                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
259                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
260                 /*
261                  * This is necessary for 1G NICs such as the I350; it fixes a bug
262                  * where IPv4 forwarding in the guest cannot forward packets from
263                  * one virtio dev to another virtio dev.
264                  */
265                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
266                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
267                 .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
268         },
269
270         .txmode = {
271                 .mq_mode = ETH_MQ_TX_NONE,
272         },
273         .rx_adv_conf = {
274                 /*
275                  * should be overridden separately in code with
276                  * appropriate values
277                  */
278                 .vmdq_rx_conf = {
279                         .nb_queue_pools = ETH_8_POOLS,
280                         .enable_default_pool = 0,
281                         .default_pool = 0,
282                         .nb_pool_maps = 0,
283                         .pool_map = {{0, 0},},
284                 },
285         },
286 };
287
288 static unsigned lcore_ids[RTE_MAX_LCORE];
289 static uint8_t ports[RTE_MAX_ETHPORTS];
290 static unsigned num_ports = 0; /**< The number of ports specified in command line */
291
292 static const uint16_t external_pkt_default_vlan_tag = 2000;
293 const uint16_t vlan_tags[] = {
294         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
295         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
296         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
297         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
298         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
299         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
300         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
301         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
302 };
303
304 /* ethernet addresses of ports */
305 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
306
307 /* heads for the main used and free linked lists for the data path. */
308 static struct virtio_net_data_ll *ll_root_used = NULL;
309 static struct virtio_net_data_ll *ll_root_free = NULL;
310
311 /* Array of data core structures containing information on individual core linked lists. */
312 static struct lcore_info lcore_info[RTE_MAX_LCORE];
313
314 /* Used for queueing bursts of TX packets. */
315 struct mbuf_table {
316         unsigned len;
317         unsigned txq_id;
318         struct rte_mbuf *m_table[MAX_PKT_BURST];
319 };
320
321 /* TX queue for each data core. */
322 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
323
324 /* TX queue for each virtio device for zero copy. */
325 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
326
327 /* Vlan header struct used to insert vlan tags on TX. */
328 struct vlan_ethhdr {
329         unsigned char   h_dest[ETH_ALEN];
330         unsigned char   h_source[ETH_ALEN];
331         __be16          h_vlan_proto;
332         __be16          h_vlan_TCI;
333         __be16          h_vlan_encapsulated_proto;
334 };
335
336 /* IPv4 Header */
337 struct ipv4_hdr {
338         uint8_t  version_ihl;           /**< version and header length */
339         uint8_t  type_of_service;       /**< type of service */
340         uint16_t total_length;          /**< length of packet */
341         uint16_t packet_id;             /**< packet ID */
342         uint16_t fragment_offset;       /**< fragmentation offset */
343         uint8_t  time_to_live;          /**< time to live */
344         uint8_t  next_proto_id;         /**< protocol ID */
345         uint16_t hdr_checksum;          /**< header checksum */
346         uint32_t src_addr;              /**< source address */
347         uint32_t dst_addr;              /**< destination address */
348 } __attribute__((__packed__));
349
350 /* Header lengths. */
351 #define VLAN_HLEN       4
352 #define VLAN_ETH_HLEN   18
353
354 /* Per-device statistics struct */
355 struct device_statistics {
356         uint64_t tx_total;
357         rte_atomic64_t rx_total_atomic;
358         uint64_t rx_total;
359         uint64_t tx;
360         rte_atomic64_t rx_atomic;
361         uint64_t rx;
362 } __rte_cache_aligned;
363 struct device_statistics dev_statistics[MAX_DEVICES];
364
365 /*
366  * Builds up the correct configuration for VMDQ VLAN pool map
367  * according to the pool & queue limits.
368  */
369 static inline int
370 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
371 {
372         struct rte_eth_vmdq_rx_conf conf;
373         unsigned i;
374
375         memset(&conf, 0, sizeof(conf));
376         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
377         conf.nb_pool_maps = num_devices;
378         conf.enable_loop_back =
379                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
380
381         for (i = 0; i < conf.nb_pool_maps; i++) {
382                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
383                 conf.pool_map[i].pools = (1UL << i);
384         }
385
386         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
387         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
388                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
389         return 0;
390 }
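/*
 * Illustrative sketch only (not part of the application flow; the helper name
 * is hypothetical): shows how get_eth_conf() assigns one VMDQ pool per device,
 * with pool i answering to vlan_tags[i].
 */
static inline void
example_vmdq_pool_map(void)
{
        struct rte_eth_conf conf;
        unsigned i;

        if (get_eth_conf(&conf, 8) != 0)
                return;

        for (i = 0; i < conf.rx_adv_conf.vmdq_rx_conf.nb_pool_maps; i++)
                RTE_LOG(INFO, VHOST_CONFIG, "vlan %u -> pool mask 0x%"PRIx64"\n",
                        (unsigned)conf.rx_adv_conf.vmdq_rx_conf.pool_map[i].vlan_id,
                        conf.rx_adv_conf.vmdq_rx_conf.pool_map[i].pools);
}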
391
392 /*
393  * Validate the device number against the max pool number obtained from
394  * dev_info. If the device number is invalid, print an error message and
395  * return -1. Each device must have its own pool.
396  */
397 static inline int
398 validate_num_devices(uint32_t max_nb_devices)
399 {
400         if (num_devices > max_nb_devices) {
401                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
402                 return -1;
403         }
404         return 0;
405 }
406
407 /*
408  * Initialises a given port using global settings and with the RX buffers
409  * coming from the vpool_array mempools set up for each queue.
410  */
411 static inline int
412 port_init(uint8_t port)
413 {
414         struct rte_eth_dev_info dev_info;
415         struct rte_eth_conf port_conf;
416         uint16_t rx_rings, tx_rings;
417         uint16_t rx_ring_size, tx_ring_size;
418         int retval;
419         uint16_t q;
420
421         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
422         rte_eth_dev_info_get (port, &dev_info);
423
424         /* Configure the number of supported virtio devices based on VMDQ limits. */
425         num_devices = dev_info.max_vmdq_pools;
426         num_queues = dev_info.max_rx_queues;
427
428         if (zero_copy) {
429                 rx_ring_size = num_rx_descriptor;
430                 tx_ring_size = num_tx_descriptor;
431                 tx_rings = dev_info.max_tx_queues;
432         } else {
433                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
434                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
435                 tx_rings = (uint16_t)rte_lcore_count();
436         }
437
438         retval = validate_num_devices(MAX_DEVICES);
439         if (retval < 0)
440                 return retval;
441
442         /* Get port configuration. */
443         retval = get_eth_conf(&port_conf, num_devices);
444         if (retval < 0)
445                 return retval;
446
447         if (port >= rte_eth_dev_count()) return -1;
448
449         rx_rings = (uint16_t)num_queues;
450         /* Configure ethernet device. */
451         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452         if (retval != 0)
453                 return retval;
454
455         /* Setup the queues. */
456         for (q = 0; q < rx_rings; q ++) {
457                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458                                                 rte_eth_dev_socket_id(port), &rx_conf_default,
459                                                 vpool_array[q].pool);
460                 if (retval < 0)
461                         return retval;
462         }
463         for (q = 0; q < tx_rings; q ++) {
464                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
465                                                 rte_eth_dev_socket_id(port), &tx_conf_default);
466                 if (retval < 0)
467                         return retval;
468         }
469
470         /* Start the device. */
471         retval  = rte_eth_dev_start(port);
472         if (retval < 0) {
473                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
474                 return retval;
475         }
476
477         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481                         (unsigned)port,
482                         vmdq_ports_eth_addr[port].addr_bytes[0],
483                         vmdq_ports_eth_addr[port].addr_bytes[1],
484                         vmdq_ports_eth_addr[port].addr_bytes[2],
485                         vmdq_ports_eth_addr[port].addr_bytes[3],
486                         vmdq_ports_eth_addr[port].addr_bytes[4],
487                         vmdq_ports_eth_addr[port].addr_bytes[5]);
488
489         return 0;
490 }
491
492 /*
493  * Set character device basename.
494  */
495 static int
496 us_vhost_parse_basename(const char *q_arg)
497 {
498         /* parse the basename string */
499
500         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
501                 return -1;
502         else
503                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
504
505         return 0;
506 }
507
508 /*
509  * Parse the portmask provided at run time.
510  */
511 static int
512 parse_portmask(const char *portmask)
513 {
514         char *end = NULL;
515         unsigned long pm;
516
517         errno = 0;
518
519         /* parse hexadecimal string */
520         pm = strtoul(portmask, &end, 16);
521         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
522                 return -1;
523
524         if (pm == 0)
525                 return -1;
526
527         return pm;
528
529 }
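/* For example, "-p 0x1" parses to 1 and enables port 0 only. */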
530
531 /*
532  * Parse num options at run time.
533  */
534 static int
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536 {
537         char *end = NULL;
538         unsigned long num;
539
540         errno = 0;
541
542         /* parse unsigned int string */
543         num = strtoul(q_arg, &end, 10);
544         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545                 return -1;
546
547         if (num > max_valid_value)
548                 return -1;
549
550         return num;
551
552 }
553
554 /*
555  * Display usage
556  */
557 static void
558 us_vhost_usage(const char *prgname)
559 {
560         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
561         "               --vm2vm [0|1|2]\n"
562         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563         "               --dev-basename <name> --dev-index [0-N]\n"
564         "               --nb-devices ND\n"
565         "               -p PORTMASK: Set mask for ports to be used by application\n"
566         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567         "               --rx-retry [0|1]: disable/enable(default) retries on RX. Retries happen if the destination queue is full\n"
568         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective if RX retries are enabled\n"
569         "               --rx-retry-num [0-N]: the number of retries on RX. Only effective if RX retries are enabled\n"
570         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
572         "               --dev-basename: The basename to be used for the character device.\n"
573         "               --dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
574         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
575                         "zero copy\n"
576         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
577                         "used only when zero copy is enabled.\n"
578         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
579                         "used only when zero copy is enabled.\n",
580                prgname);
581 }
582
583 /*
584  * Parse the arguments given in the command line of the application.
585  */
586 static int
587 us_vhost_parse_args(int argc, char **argv)
588 {
589         int opt, ret;
590         int option_index;
591         unsigned i;
592         const char *prgname = argv[0];
593         static struct option long_option[] = {
594                 {"vm2vm", required_argument, NULL, 0},
595                 {"rx-retry", required_argument, NULL, 0},
596                 {"rx-retry-delay", required_argument, NULL, 0},
597                 {"rx-retry-num", required_argument, NULL, 0},
598                 {"mergeable", required_argument, NULL, 0},
599                 {"stats", required_argument, NULL, 0},
600                 {"dev-basename", required_argument, NULL, 0},
601                 {"dev-index", required_argument, NULL, 0},
602                 {"zero-copy", required_argument, NULL, 0},
603                 {"rx-desc-num", required_argument, NULL, 0},
604                 {"tx-desc-num", required_argument, NULL, 0},
605                 {NULL, 0, 0, 0},
606         };
607
608         /* Parse command line */
609         while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
610                 switch (opt) {
611                 /* Portmask */
612                 case 'p':
613                         enabled_port_mask = parse_portmask(optarg);
614                         if (enabled_port_mask == 0) {
615                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
616                                 us_vhost_usage(prgname);
617                                 return -1;
618                         }
619                         break;
620
621                 case 0:
622                         /* Enable/disable vm2vm comms. */
623                         if (!strncmp(long_option[option_index].name, "vm2vm",
624                                 MAX_LONG_OPT_SZ)) {
625                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
626                                 if (ret == -1) {
627                                         RTE_LOG(INFO, VHOST_CONFIG,
628                                                 "Invalid argument for "
629                                                 "vm2vm [0|1|2]\n");
630                                         us_vhost_usage(prgname);
631                                         return -1;
632                                 } else {
633                                         vm2vm_mode = (vm2vm_type)ret;
634                                 }
635                         }
636
637                         /* Enable/disable retries on RX. */
638                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
639                                 ret = parse_num_opt(optarg, 1);
640                                 if (ret == -1) {
641                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
642                                         us_vhost_usage(prgname);
643                                         return -1;
644                                 } else {
645                                         enable_retry = ret;
646                                 }
647                         }
648
649                         /* Specify the retry delay time (in microseconds) on RX. */
650                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
651                                 ret = parse_num_opt(optarg, INT32_MAX);
652                                 if (ret == -1) {
653                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
654                                         us_vhost_usage(prgname);
655                                         return -1;
656                                 } else {
657                                         burst_rx_delay_time = ret;
658                                 }
659                         }
660
661                         /* Specify the number of retries on RX. */
662                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
663                                 ret = parse_num_opt(optarg, INT32_MAX);
664                                 if (ret == -1) {
665                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
666                                         us_vhost_usage(prgname);
667                                         return -1;
668                                 } else {
669                                         burst_rx_retry_num = ret;
670                                 }
671                         }
672
673                         /* Enable/disable RX mergeable buffers. */
674                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
675                                 ret = parse_num_opt(optarg, 1);
676                                 if (ret == -1) {
677                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
678                                         us_vhost_usage(prgname);
679                                         return -1;
680                                 } else {
681                                         if (ret) {
682                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
683                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
684                                                         = JUMBO_FRAME_MAX_SIZE;
685                                                 VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
686                                         }
687                                 }
688                         }
689
690                         /* Enable/disable stats. */
691                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
692                                 ret = parse_num_opt(optarg, INT32_MAX);
693                                 if (ret == -1) {
694                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
695                                         us_vhost_usage(prgname);
696                                         return -1;
697                                 } else {
698                                         enable_stats = ret;
699                                 }
700                         }
701
702                         /* Set character device basename. */
703                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
704                                 if (us_vhost_parse_basename(optarg) == -1) {
705                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
706                                         us_vhost_usage(prgname);
707                                         return -1;
708                                 }
709                         }
710
711                         /* Set character device index. */
712                         if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
713                                 ret = parse_num_opt(optarg, INT32_MAX);
714                                 if (ret == -1) {
715                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
716                                         us_vhost_usage(prgname);
717                                         return -1;
718                                 } else
719                                         dev_index = ret;
720                         }
721
722                         /* Enable/disable rx/tx zero copy. */
723                         if (!strncmp(long_option[option_index].name,
724                                 "zero-copy", MAX_LONG_OPT_SZ)) {
725                                 ret = parse_num_opt(optarg, 1);
726                                 if (ret == -1) {
727                                         RTE_LOG(INFO, VHOST_CONFIG,
728                                                 "Invalid argument"
729                                                 " for zero-copy [0|1]\n");
730                                         us_vhost_usage(prgname);
731                                         return -1;
732                                 } else
733                                         zero_copy = ret;
734
735                                 if (zero_copy) {
736 #ifdef RTE_MBUF_REFCNT
737                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
738                                         "zero copy vhost APP, please "
739                                         "disable RTE_MBUF_REFCNT\n"
740                                         "in config file and then rebuild DPDK "
741                                         "core lib!\n"
742                                         "Otherwise please disable zero copy "
743                                         "flag in command line!\n");
744                                         return -1;
745 #endif
746                                 }
747                         }
748
749                         /* Specify the descriptor number on RX. */
750                         if (!strncmp(long_option[option_index].name,
751                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
752                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
753                                 if ((ret == -1) || (!POWEROF2(ret))) {
754                                         RTE_LOG(INFO, VHOST_CONFIG,
755                                         "Invalid argument for rx-desc-num [0-N], "
756                                         "power of 2 required.\n");
757                                         us_vhost_usage(prgname);
758                                         return -1;
759                                 } else {
760                                         num_rx_descriptor = ret;
761                                 }
762                         }
763
764                         /* Specify the descriptor number on TX. */
765                         if (!strncmp(long_option[option_index].name,
766                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
767                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
768                                 if ((ret == -1) || (!POWEROF2(ret))) {
769                                         RTE_LOG(INFO, VHOST_CONFIG,
770                                         "Invalid argument for tx-desc-num [0-N], "
771                                         "power of 2 required.\n");
772                                         us_vhost_usage(prgname);
773                                         return -1;
774                                 } else {
775                                         num_tx_descriptor = ret;
776                                 }
777                         }
778
779                         break;
780
781                         /* Invalid option - print options. */
782                 default:
783                         us_vhost_usage(prgname);
784                         return -1;
785                 }
786         }
787
788         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
789                 if (enabled_port_mask & (1 << i))
790                         ports[num_ports++] = (uint8_t)i;
791         }
792
793         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
794                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
795                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
796                 return -1;
797         }
798
799         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
800                 RTE_LOG(INFO, VHOST_PORT,
801                         "Vhost zero copy doesn't support software vm2vm, "
802                         "please specify '--vm2vm 2' to use hardware vm2vm.\n");
803                 return -1;
804         }
805
806         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
807                 RTE_LOG(INFO, VHOST_PORT,
808                         "Vhost zero copy doesn't support jumbo frames, "
809                         "please specify '--mergeable 0' to disable the "
810                         "mergeable feature.\n");
811                 return -1;
812         }
813
814         return 0;
815 }
816
817 /*
818  * Update the global variable num_ports and the array ports according to the
819  * number of system ports, and return the number of valid ports.
820  */
821 static unsigned check_ports_num(unsigned nb_ports)
822 {
823         unsigned valid_num_ports = num_ports;
824         unsigned portid;
825
826         if (num_ports > nb_ports) {
827                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
828                         num_ports, nb_ports);
829                 num_ports = nb_ports;
830         }
831
832         for (portid = 0; portid < num_ports; portid ++) {
833                 if (ports[portid] >= nb_ports) {
834                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
835                                 ports[portid], (nb_ports - 1));
836                         ports[portid] = INVALID_PORT_ID;
837                         valid_num_ports--;
838                 }
839         }
840         return valid_num_ports;
841 }
842
843 /*
844  * Macro to print out packet contents. Wrapped in debug define so that the
845  * data path is not affected when debug is disabled.
846  */
847 #ifdef DEBUG
848 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
849         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
850         unsigned int index;                                                                                                                                                                                             \
851         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
852                                                                                                                                                                                                                                         \
853         if ((header))                                                                                                                                                                                                   \
854                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
855         else                                                                                                                                                                                                                    \
856                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
857         for (index = 0; index < (size); index++) {                                                                                                                                              \
858                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
859                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
860         }                                                                                                                                                                                                                               \
861         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
862                                                                                                                                                                                                                                         \
863         LOG_DEBUG(VHOST_DATA, "%s", packet);                                                                                                                                                                    \
864 } while(0)
865 #else
866 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
867 #endif
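/*
 * Typical (illustrative) use from the data path, assuming buff_addr points at
 * a guest buffer already mapped into this process:
 *     PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(m), 0);
 */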
868
869 /*
870  * Function to convert guest physical addresses to vhost physical addresses.
871  * This is used to convert virtio buffer addresses.
872  */
873 static inline uint64_t __attribute__((always_inline))
874 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
875         uint32_t buf_len, hpa_type *addr_type)
876 {
877         struct virtio_memory_regions_hpa *region;
878         uint32_t regionidx;
879         uint64_t vhost_pa = 0;
880
881         *addr_type = PHYS_ADDR_INVALID;
882
883         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
884                 region = &vdev->regions_hpa[regionidx];
885                 if ((guest_pa >= region->guest_phys_address) &&
886                         (guest_pa <= region->guest_phys_address_end)) {
887                         vhost_pa = region->host_phys_addr_offset + guest_pa;
888                         if (likely((guest_pa + buf_len - 1)
889                                 <= region->guest_phys_address_end))
890                                 *addr_type = PHYS_ADDR_CONTINUOUS;
891                         else
892                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
893                         break;
894                 }
895         }
896
897         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
898                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
899                 (void *)(uintptr_t)vhost_pa);
900
901         return vhost_pa;
902 }
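/*
 * Minimal usage sketch (hypothetical helper, not called by the application):
 * translate a guest buffer address and only accept it when the whole buffer
 * lies in one contiguous host physical region.
 */
static inline uint64_t
example_gpa_to_hpa_checked(struct vhost_dev *vdev, uint64_t guest_pa,
        uint32_t buf_len)
{
        hpa_type addr_type;
        uint64_t hpa = gpa_to_hpa(vdev, guest_pa, buf_len, &addr_type);

        if (addr_type != PHYS_ADDR_CONTINUOUS)
                return 0; /* caller must split the copy or drop the buffer */
        return hpa;
}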
903
904 /*
905  * Compares a packet destination MAC address to a device MAC address.
906  */
907 static inline int __attribute__((always_inline))
908 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
909 {
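        /*
         * Loads 8 bytes from each 6 byte address and masks off the extra
         * 2 bytes, so only the 48 MAC bits take part in the comparison.
         */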
910         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
911 }
912
913 /*
914  * This function learns the MAC address of the device and registers this along with a
915  * vlan tag to a VMDQ.
916  */
917 static int
918 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
919 {
920         struct ether_hdr *pkt_hdr;
921         struct virtio_net_data_ll *dev_ll;
922         struct virtio_net *dev = vdev->dev;
923         int i, ret;
924
925         /* Learn MAC address of guest device from packet */
926         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
927
928         dev_ll = ll_root_used;
929
930         while (dev_ll != NULL) {
931                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
932                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
933                         return -1;
934                 }
935                 dev_ll = dev_ll->next;
936         }
937
938         for (i = 0; i < ETHER_ADDR_LEN; i++)
939                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
940
941         /* vlan_tag currently uses the device_id. */
942         vdev->vlan_tag = vlan_tags[dev->device_fh];
943
944         /* Print out VMDQ registration info. */
945         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
946                 dev->device_fh,
947                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
948                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
949                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
950                 vdev->vlan_tag);
951
952         /* Register the MAC address. */
953         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
954         if (ret)
955                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
956                                         dev->device_fh);
957
958         /* Enable stripping of the vlan tag as we handle routing. */
959         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
960
961         /* Set device as ready for RX. */
962         vdev->ready = DEVICE_RX;
963
964         return 0;
965 }
966
967 /*
968  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
969  * queue before disabling RX on the device.
970  */
971 static inline void
972 unlink_vmdq(struct vhost_dev *vdev)
973 {
974         unsigned i = 0;
975         unsigned rx_count;
976         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
977
978         if (vdev->ready == DEVICE_RX) {
979                 /*clear MAC and VLAN settings*/
980                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
981                 for (i = 0; i < 6; i++)
982                         vdev->mac_address.addr_bytes[i] = 0;
983
984                 vdev->vlan_tag = 0;
985
986                 /*Clear out the receive buffers*/
987                 rx_count = rte_eth_rx_burst(ports[0],
988                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
989
990                 while (rx_count) {
991                         for (i = 0; i < rx_count; i++)
992                                 rte_pktmbuf_free(pkts_burst[i]);
993
994                         rx_count = rte_eth_rx_burst(ports[0],
995                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
996                 }
997
998                 vdev->ready = DEVICE_MAC_LEARNING;
999         }
1000 }
1001
1002 /*
1003  * Check if the packet destination MAC address is for a local device. If so then put
1004  * the packet on that device's RX queue. If not then return.
1005  */
1006 static inline unsigned __attribute__((always_inline))
1007 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1008 {
1009         struct virtio_net_data_ll *dev_ll;
1010         struct ether_hdr *pkt_hdr;
1011         uint64_t ret = 0;
1012         struct virtio_net *dev = vdev->dev;
1013         struct virtio_net *tdev; /* destination virtio device */
1014
1015         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1016
1017         /*get the used devices list*/
1018         dev_ll = ll_root_used;
1019
1020         while (dev_ll != NULL) {
1021                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1022                                           &dev_ll->vdev->mac_address)) {
1023
1024                         /* Drop the packet if the TX packet is destined for the TX device. */
1025                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1026                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1027                                                         dev->device_fh);
1028                                 return 0;
1029                         }
1030                         tdev = dev_ll->vdev->dev;
1031
1032
1033                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1034
1035                         if (dev_ll->vdev->remove) {
1036                                 /*drop the packet if the device is marked for removal*/
1037                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1038                         } else {
1039                                 uint32_t mergeable =
1040                                         tdev->features &
1041                                         (1 << VIRTIO_NET_F_MRG_RXBUF);
1042
1043                                 /*send the packet to the local virtio device*/
1044                                 if (likely(mergeable == 0))
1045                                         ret = virtio_dev_rx(tdev, &m, 1);
1046                                 else
1047                                         ret = virtio_dev_merge_rx(tdev,
1048                                                 &m, 1);
1049
1050                                 if (enable_stats) {
1051                                         rte_atomic64_add(
1052                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1053                                         1);
1054                                         rte_atomic64_add(
1055                                         &dev_statistics[tdev->device_fh].rx_atomic,
1056                                         ret);
1057                                         dev_statistics[tdev->device_fh].tx_total++;
1058                                         dev_statistics[tdev->device_fh].tx += ret;
1059                                 }
1060                         }
1061
1062                         return 0;
1063                 }
1064                 dev_ll = dev_ll->next;
1065         }
1066
1067         return -1;
1068 }
1069
1070 /*
1071  * This function routes the TX packet to the correct interface. This may be a local device
1072  * or the physical port.
1073  */
1074 static inline void __attribute__((always_inline))
1075 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1076 {
1077         struct mbuf_table *tx_q;
1078         struct vlan_ethhdr *vlan_hdr;
1079         struct rte_mbuf **m_table;
1080         struct rte_mbuf *mbuf, *prev;
1081         unsigned len, ret, offset = 0;
1082         const uint16_t lcore_id = rte_lcore_id();
1083         struct virtio_net_data_ll *dev_ll = ll_root_used;
1084         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1085         struct virtio_net *dev = vdev->dev;
1086
1087         /*check if destination is local VM*/
1088         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1089                 return;
1090
1091         if (vm2vm_mode == VM2VM_HARDWARE) {
1092                 while (dev_ll != NULL) {
1093                         if ((dev_ll->vdev->ready == DEVICE_RX)
1094                                 && ether_addr_cmp(&(pkt_hdr->d_addr),
1095                                 &dev_ll->vdev->mac_address)) {
1096                                 /*
1097                                  * Drop the packet if the TX packet is
1098                                  * destined for the TX device.
1099                                  */
1100                                 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1101                                         LOG_DEBUG(VHOST_DATA,
1102                                         "(%"PRIu64") TX: Source and destination"
1103                                         " MAC addresses are the same. Dropping "
1104                                         "packet.\n",
1105                                         dev_ll->vdev->device_fh);
1106                                         return;
1107                                 }
1108                                 offset = 4;
1109                                 vlan_tag =
1110                                 (uint16_t)
1111                                 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1112
1113                                 LOG_DEBUG(VHOST_DATA,
1114                                 "(%"PRIu64") TX: pkt to local VM device id:"
1115                                 "(%"PRIu64") vlan tag: %d.\n",
1116                                 dev->device_fh, dev_ll->vdev->dev->device_fh,
1117                                 vlan_tag);
1118
1119                                 break;
1120                         }
1121                         dev_ll = dev_ll->next;
1122                 }
1123         }
1124
1125         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1126
1127         /*Add packet to the port tx queue*/
1128         tx_q = &lcore_tx_queue[lcore_id];
1129         len = tx_q->len;
1130
1131         /* Allocate an mbuf and populate the structure. */
1132         mbuf = rte_pktmbuf_alloc(mbuf_pool);
1133         if (unlikely(mbuf == NULL)) {
1134                 RTE_LOG(ERR, VHOST_DATA,
1135                         "Failed to allocate memory for mbuf.\n");
1136                 return;
1137         }
1138
1139         mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1140         mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1141         mbuf->nb_segs = m->nb_segs;
1142
1143         /* Copy ethernet header to mbuf. */
1144         rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1145                 rte_pktmbuf_mtod(m, const void *),
1146                 ETH_HLEN);
1147
1148
1149         /* Set up the vlan header. Fields are converted to network byte order with htons(). */
1150         vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1151         vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1152         vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1153         vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1154
1155         /* Copy the remaining packet contents to the mbuf. */
1156         rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1157                 (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1158                 (m->data_len - ETH_HLEN));
1159
1160         /* Copy the remaining segments for the whole packet. */
1161         prev = mbuf;
1162         while (m->next) {
1163                 /* Allocate an mbuf and populate the structure. */
1164                 struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1165                 if (unlikely(next_mbuf == NULL)) {
1166                         rte_pktmbuf_free(mbuf);
1167                         RTE_LOG(ERR, VHOST_DATA,
1168                                 "Failed to allocate memory for mbuf.\n");
1169                         return;
1170                 }
1171
1172                 m = m->next;
1173                 prev->next = next_mbuf;
1174                 prev = next_mbuf;
1175                 next_mbuf->data_len = m->data_len;
1176
1177                 /* Copy data to next mbuf. */
1178                 rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1179                         rte_pktmbuf_mtod(m, const void *), m->data_len);
1180         }
1181
1182         tx_q->m_table[len] = mbuf;
1183         len++;
1184         if (enable_stats) {
1185                 dev_statistics[dev->device_fh].tx_total++;
1186                 dev_statistics[dev->device_fh].tx++;
1187         }
1188
1189         if (unlikely(len == MAX_PKT_BURST)) {
1190                 m_table = (struct rte_mbuf **)tx_q->m_table;
1191                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1192                 /* Free any buffers not handled by TX and update the port stats. */
1193                 if (unlikely(ret < len)) {
1194                         do {
1195                                 rte_pktmbuf_free(m_table[ret]);
1196                         } while (++ret < len);
1197                 }
1198
1199                 len = 0;
1200         }
1201
1202         tx_q->len = len;
1203         return;
1204 }
1205 /*
1206  * This function is called by each data core. It handles all RX/TX registered with the
1207  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1208  * with all devices in the main linked list.
1209  */
1210 static int
1211 switch_worker(__attribute__((unused)) void *arg)
1212 {
1213         struct rte_mempool *mbuf_pool = arg;
1214         struct virtio_net *dev = NULL;
1215         struct vhost_dev *vdev = NULL;
1216         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1217         struct virtio_net_data_ll *dev_ll;
1218         struct mbuf_table *tx_q;
1219         volatile struct lcore_ll_info *lcore_ll;
1220         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1221         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1222         unsigned ret, i;
1223         const uint16_t lcore_id = rte_lcore_id();
1224         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1225         uint16_t rx_count = 0;
1226         uint32_t mergeable = 0;
1227
1228         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1229         lcore_ll = lcore_info[lcore_id].lcore_ll;
1230         prev_tsc = 0;
1231
1232         tx_q = &lcore_tx_queue[lcore_id];
1233         for (i = 0; i < num_cores; i ++) {
1234                 if (lcore_ids[i] == lcore_id) {
1235                         tx_q->txq_id = i;
1236                         break;
1237                 }
1238         }
1239
1240         while(1) {
1241                 cur_tsc = rte_rdtsc();
1242                 /*
1243                  * TX burst queue drain
1244                  */
1245                 diff_tsc = cur_tsc - prev_tsc;
1246                 if (unlikely(diff_tsc > drain_tsc)) {
1247
1248                         if (tx_q->len) {
1249                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1250
1251                                 /*Tx any packets in the queue*/
1252                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1253                                                                            (struct rte_mbuf **)tx_q->m_table,
1254                                                                            (uint16_t)tx_q->len);
1255                                 if (unlikely(ret < tx_q->len)) {
1256                                         do {
1257                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1258                                         } while (++ret < tx_q->len);
1259                                 }
1260
1261                                 tx_q->len = 0;
1262                         }
1263
1264                         prev_tsc = cur_tsc;
1265
1266                 }
1267
1268                 rte_prefetch0(lcore_ll->ll_root_used);
1269                 /*
1270                  * Inform the configuration core that we have exited the linked list and that no devices are
1271                  * in use if requested.
1272                  */
1273                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1274                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1275
1276                 /*
1277                  * Process devices
1278                  */
1279                 dev_ll = lcore_ll->ll_root_used;
1280
1281                 while (dev_ll != NULL) {
1282                         /*get virtio device ID*/
1283                         vdev = dev_ll->vdev;
1284                         dev = vdev->dev;
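                             /* Check whether the guest negotiated mergeable RX buffers; this selects the RX/TX path used below. */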
1285                         mergeable =
1286                                 dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
1287
1288                         if (vdev->remove) {
1289                                 dev_ll = dev_ll->next;
1290                                 unlink_vmdq(vdev);
1291                                 vdev->ready = DEVICE_SAFE_REMOVE;
1292                                 continue;
1293                         }
1294                         if (likely(vdev->ready == DEVICE_RX)) {
1295                                 /*Handle guest RX*/
1296                                 rx_count = rte_eth_rx_burst(ports[0],
1297                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1298
1299                                 if (rx_count) {
1300                                         if (likely(mergeable == 0))
1301                                                 ret_count =
1302                                                         virtio_dev_rx(dev,
1303                                                         pkts_burst, rx_count);
1304                                         else
1305                                                 ret_count =
1306                                                         virtio_dev_merge_rx(dev,
1307                                                         pkts_burst, rx_count);
1308
1309                                         if (enable_stats) {
1310                                                 rte_atomic64_add(
1311                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1312                                                 rx_count);
1313                                                 rte_atomic64_add(
1314                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1315                                         }
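                                             /* virtio_dev_rx copied the packets into the guest's ring, so the host mbufs can now be freed. */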
1316                                         while (likely(rx_count)) {
1317                                                 rx_count--;
1318                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1319                                         }
1320
1321                                 }
1322                         }
1323
1324                         if (!vdev->remove) {
1325                                 /*Handle guest TX*/
1326                                 if (likely(mergeable == 0))
1327                                         virtio_dev_tx(dev, mbuf_pool);
1328                                 else
1329                                         virtio_dev_merge_tx(dev, mbuf_pool);
1330                         }
1331
1332                         /*move to the next device in the list*/
1333                         dev_ll = dev_ll->next;
1334                 }
1335         }
1336
1337         return 0;
1338 }
1339
1340 /*
1341  * This function gets the number of available ring entries for zero copy RX.
1342  * Only one thread will call this function for a particular virtio device,
1343  * so it is designed as a non-thread-safe function.
1344  */
1345 static inline uint32_t __attribute__((always_inline))
1346 get_available_ring_num_zcp(struct virtio_net *dev)
1347 {
1348         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1349         uint16_t avail_idx;
1350
1351         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
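             /* avail->idx is free-running; the difference gives the entries the guest has made available but we have not yet reserved. */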
1352         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1353 }
1354
1355 /*
1356  * This function gets the available ring index for zero copy RX;
1357  * it retries up to 'burst_rx_retry_num' times until it gets enough ring entries.
1358  * Only one thread will call this function for a particular virtio device,
1359  * so it is designed as a non-thread-safe function.
1360  */
1361 static inline uint32_t __attribute__((always_inline))
1362 get_available_ring_index_zcp(struct virtio_net *dev,
1363         uint16_t *res_base_idx, uint32_t count)
1364 {
1365         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1366         uint16_t avail_idx;
1367         uint32_t retry = 0;
1368         uint16_t free_entries;
1369
1370         *res_base_idx = vq->last_used_idx_res;
1371         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1372         free_entries = (avail_idx - *res_base_idx);
1373
1374         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1375                         "avail idx: %d, "
1376                         "res base idx:%d, free entries:%d\n",
1377                         dev->device_fh, avail_idx, *res_base_idx,
1378                         free_entries);
1379
1380         /*
1381          * If retry is enabled and the queue is full then we wait
1382          * and retry to avoid packet loss.
1383          */
1384         if (enable_retry && unlikely(count > free_entries)) {
1385                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1386                         rte_delay_us(burst_rx_delay_time);
1387                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1388                         free_entries = (avail_idx - *res_base_idx);
1389                         if (count <= free_entries)
1390                                 break;
1391                 }
1392         }
1393
1394         /*check that we have enough buffers*/
1395         if (unlikely(count > free_entries))
1396                 count = free_entries;
1397
1398         if (unlikely(count == 0)) {
1399                 LOG_DEBUG(VHOST_DATA,
1400                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1401                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1402                         dev->device_fh, avail_idx,
1403                         *res_base_idx, free_entries);
1404                 return 0;
1405         }
1406
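             /* Reserve the entries so the next call starts after them. */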
1407         vq->last_used_idx_res = *res_base_idx + count;
1408
1409         return count;
1410 }
1411
1412 /*
1413  * This function puts a descriptor back on the used list.
1414  */
1415 static inline void __attribute__((always_inline))
1416 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1417 {
1418         uint16_t res_cur_idx = vq->last_used_idx;
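             /* The vring size is a power of two, so masking with (size - 1) wraps the used ring index. */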
1419         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1420         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
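             /* Ensure the used ring entry is written before the used index update becomes visible to the guest. */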
1421         rte_compiler_barrier();
1422         *(volatile uint16_t *)&vq->used->idx += 1;
1423         vq->last_used_idx += 1;
1424
1425         /* Kick the guest if necessary. */
1426         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1427                 eventfd_write((int)vq->kickfd, 1);
1428 }
1429
1430 /*
1431  * This function gets an available descriptor from the virtio vring and an
1432  * un-attached mbuf from vpool->ring, and then attaches them together. It needs
1433  * to adjust the offset for buff_addr and phys_addr according to the PMD
1434  * implementation, otherwise the frame data may be placed at the wrong location in the mbuf.
1435  */
1436 static inline void __attribute__((always_inline))
1437 attach_rxmbuf_zcp(struct virtio_net *dev)
1438 {
1439         uint16_t res_base_idx, desc_idx;
1440         uint64_t buff_addr, phys_addr;
1441         struct vhost_virtqueue *vq;
1442         struct vring_desc *desc;
1443         struct rte_mbuf *mbuf = NULL;
1444         struct vpool *vpool;
1445         hpa_type addr_type;
1446         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1447
1448         vpool = &vpool_array[vdev->vmdq_rx_q];
1449         vq = dev->virtqueue[VIRTIO_RXQ];
1450
1451         do {
1452                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1453                                 1) != 1))
1454                         return;
1455                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1456
1457                 desc = &vq->desc[desc_idx];
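                     /*
                      * When the descriptor is chained, the first entry carries the virtio
                      * header and the frame data lives in the next descriptor; otherwise
                      * skip past the header within the single buffer.
                      */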
1458                 if (desc->flags & VRING_DESC_F_NEXT) {
1459                         desc = &vq->desc[desc->next];
1460                         buff_addr = gpa_to_vva(dev, desc->addr);
1461                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1462                                         &addr_type);
1463                 } else {
1464                         buff_addr = gpa_to_vva(dev,
1465                                         desc->addr + vq->vhost_hlen);
1466                         phys_addr = gpa_to_hpa(vdev,
1467                                         desc->addr + vq->vhost_hlen,
1468                                         desc->len, &addr_type);
1469                 }
1470
1471                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1472                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1473                                 " address found when attaching RX frame buffer"
1474                                 " address!\n", dev->device_fh);
1475                         put_desc_to_used_list_zcp(vq, desc_idx);
1476                         continue;
1477                 }
1478
1479                 /*
1480                  * Check if the frame buffer address from guest crosses
1481                  * sub-region or not.
1482                  */
1483                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1484                         RTE_LOG(ERR, VHOST_DATA,
1485                                 "(%"PRIu64") Frame buffer address cross "
1486                                 "sub-regioin found when attaching RX frame "
1487                                 "buffer address!\n",
1488                                 dev->device_fh);
1489                         put_desc_to_used_list_zcp(vq, desc_idx);
1490                         continue;
1491                 }
1492         } while (unlikely(phys_addr == 0));
1493
1494         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1495         if (unlikely(mbuf == NULL)) {
1496                 LOG_DEBUG(VHOST_DATA,
1497                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1498                         "ring_sc_dequeue fail.\n",
1499                         dev->device_fh);
1500                 put_desc_to_used_list_zcp(vq, desc_idx);
1501                 return;
1502         }
1503
1504         if (unlikely(vpool->buf_size > desc->len)) {
1505                 LOG_DEBUG(VHOST_DATA,
1506                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1507                         "length(%d) of descriptor idx: %d less than room "
1508                         "size required: %d\n",
1509                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1510                 put_desc_to_used_list_zcp(vq, desc_idx);
1511                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1512                 return;
1513         }
1514
1515         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1516         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1517         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1518         mbuf->data_len = desc->len;
1519         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1520
1521         LOG_DEBUG(VHOST_DATA,
1522                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1523                 "descriptor idx:%d\n",
1524                 dev->device_fh, res_base_idx, desc_idx);
1525
1526         __rte_mbuf_raw_free(mbuf);
1527
1528         return;
1529 }
1530
1531 /*
1532  * Detach an attached packet mbuf -
1533  *  - restore original mbuf address and length values.
1534  *  - reset pktmbuf data and data_len to their default values.
1535  *  All other fields of the given packet mbuf will be left intact.
1536  *
1537  * @param m
1538  *   The attached packet mbuf.
1539  */
1540 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1541 {
1542         const struct rte_mempool *mp = m->pool;
1543         void *buf = RTE_MBUF_TO_BADDR(m);
1544         uint32_t buf_ofs;
1545         uint32_t buf_len = mp->elt_size - sizeof(*m);
1546         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1547
1548         m->buf_addr = buf;
1549         m->buf_len = (uint16_t)buf_len;
1550
1551         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1552                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1553         m->data_off = buf_ofs;
1554
1555         m->data_len = 0;
1556 }
1557
1558 /*
1559  * This function is called after packets have been transmitted. It fetches mbufs
1560  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates the
1561  * used index and kicks the guest if necessary.
1562  */
1563 static inline uint32_t __attribute__((always_inline))
1564 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1565 {
1566         struct rte_mbuf *mbuf;
1567         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1568         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1569         uint32_t index = 0;
1570         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1571
1572         LOG_DEBUG(VHOST_DATA,
1573                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1574                 "clean is: %d\n",
1575                 dev->device_fh, mbuf_count);
1576         LOG_DEBUG(VHOST_DATA,
1577                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1578                 "clean  is : %d\n",
1579                 dev->device_fh, rte_ring_count(vpool->ring));
1580
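             /*
              * For each mbuf the PMD has released back to the pool: detach the guest
              * buffer, recycle the mbuf via vpool->ring, and return its descriptor
              * index to the used ring.
              */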
1581         for (index = 0; index < mbuf_count; index++) {
1582                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1583                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1584                         pktmbuf_detach_zcp(mbuf);
1585                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1586
1587                 /* Update used index buffer information. */
1588                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1589                 vq->used->ring[used_idx].len = 0;
1590
1591                 used_idx = (used_idx + 1) & (vq->size - 1);
1592         }
1593
1594         LOG_DEBUG(VHOST_DATA,
1595                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1596                 "clean is: %d\n",
1597                 dev->device_fh, rte_mempool_count(vpool->pool));
1598         LOG_DEBUG(VHOST_DATA,
1599                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1600                 "clean  is : %d\n",
1601                 dev->device_fh, rte_ring_count(vpool->ring));
1602         LOG_DEBUG(VHOST_DATA,
1603                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1604                 "vq->last_used_idx:%d\n",
1605                 dev->device_fh, vq->last_used_idx);
1606
1607         vq->last_used_idx += mbuf_count;
1608
1609         LOG_DEBUG(VHOST_DATA,
1610                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1611                 "vq->last_used_idx:%d\n",
1612                 dev->device_fh, vq->last_used_idx);
1613
1614         rte_compiler_barrier();
1615
1616         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1617
1618         /* Kick guest if required. */
1619         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1620                 eventfd_write((int)vq->kickfd, 1);
1621
1622         return 0;
1623 }
1624
1625 /*
1626  * This function is called when a virtio device is destroyed.
1627  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1628  */
1629 static void mbuf_destroy_zcp(struct vpool *vpool)
1630 {
1631         struct rte_mbuf *mbuf = NULL;
1632         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1633
1634         LOG_DEBUG(VHOST_CONFIG,
1635                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1636                 "mbuf_destroy_zcp is: %d\n",
1637                 mbuf_count);
1638         LOG_DEBUG(VHOST_CONFIG,
1639                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1640                 "mbuf_destroy_zcp  is : %d\n",
1641                 rte_ring_count(vpool->ring));
1642
1643         for (index = 0; index < mbuf_count; index++) {
1644                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1645                 if (likely(mbuf != NULL)) {
1646                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1647                                 pktmbuf_detach_zcp(mbuf);
1648                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1649                 }
1650         }
1651
1652         LOG_DEBUG(VHOST_CONFIG,
1653                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1654                 "mbuf_destroy_zcp is: %d\n",
1655                 rte_mempool_count(vpool->pool));
1656         LOG_DEBUG(VHOST_CONFIG,
1657                 "in mbuf_destroy_zcp: mbuf count in ring after "
1658                 "mbuf_destroy_zcp is : %d\n",
1659                 rte_ring_count(vpool->ring));
1660 }
1661
1662 /*
1663  * This function updates the used ring entries and the used index for zero copy RX packets.
1664  */
1665 static inline uint32_t __attribute__((always_inline))
1666 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1667         uint32_t count)
1668 {
1669         struct vhost_virtqueue *vq;
1670         struct vring_desc *desc;
1671         struct rte_mbuf *buff;
1672         /* The virtio_hdr is initialised to 0. */
1673         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1674                 = {{0, 0, 0, 0, 0, 0}, 0};
1675         uint64_t buff_hdr_addr = 0;
1676         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1677         uint32_t head_idx, packet_success = 0;
1678         uint16_t res_cur_idx;
1679
1680         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1681
1682         if (count == 0)
1683                 return 0;
1684
1685         vq = dev->virtqueue[VIRTIO_RXQ];
1686         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1687
1688         res_cur_idx = vq->last_used_idx;
1689         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1690                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1691
1692         /* Retrieve all of the head indexes first to avoid caching issues. */
1693         for (head_idx = 0; head_idx < count; head_idx++)
1694                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1695
1696         /*Prefetch descriptor index. */
1697         rte_prefetch0(&vq->desc[head[packet_success]]);
1698
1699         while (packet_success != count) {
1700                 /* Get descriptor from available ring */
1701                 desc = &vq->desc[head[packet_success]];
1702
1703                 buff = pkts[packet_success];
1704                 LOG_DEBUG(VHOST_DATA,
1705                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1706                         "pkt[%d] descriptor idx: %d\n",
1707                         dev->device_fh, packet_success,
1708                         MBUF_HEADROOM_UINT32(buff));
1709
1710                 PRINT_PACKET(dev,
1711                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1712                         + RTE_PKTMBUF_HEADROOM),
1713                         rte_pktmbuf_data_len(buff), 0);
1714
1715                 /* Buffer address translation for virtio header. */
1716                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1717                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1718
1719                 /*
1720                  * If the descriptors are chained the header and data are
1721                  * placed in separate buffers.
1722                  */
1723                 if (desc->flags & VRING_DESC_F_NEXT) {
1724                         desc->len = vq->vhost_hlen;
1725                         desc = &vq->desc[desc->next];
1726                         desc->len = rte_pktmbuf_data_len(buff);
1727                 } else {
1728                         desc->len = packet_len;
1729                 }
1730
1731                 /* Update used ring with desc information */
1732                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1733                         = head[packet_success];
1734                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1735                         = packet_len;
1736                 res_cur_idx++;
1737                 packet_success++;
1738
1739                 /* A header is required per buffer. */
1740                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1741                         (const void *)&virtio_hdr, vq->vhost_hlen);
1742
1743                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1744
1745                 if (likely(packet_success < count)) {
1746                         /* Prefetch descriptor index. */
1747                         rte_prefetch0(&vq->desc[head[packet_success]]);
1748                 }
1749         }
1750
1751         rte_compiler_barrier();
1752
1753         LOG_DEBUG(VHOST_DATA,
1754                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1755                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1756                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1757
1758         *(volatile uint16_t *)&vq->used->idx += count;
1759         vq->last_used_idx += count;
1760
1761         LOG_DEBUG(VHOST_DATA,
1762                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1763                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1764                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1765
1766         /* Kick the guest if necessary. */
1767         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1768                 eventfd_write((int)vq->kickfd, 1);
1769
1770         return count;
1771 }
1772
1773 /*
1774  * This function routes the TX packet to the correct interface.
1775  * This may be a local device or the physical port.
1776  */
1777 static inline void __attribute__((always_inline))
1778 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1779         uint32_t desc_idx, uint8_t need_copy)
1780 {
1781         struct mbuf_table *tx_q;
1782         struct rte_mbuf **m_table;
1783         struct rte_mbuf *mbuf = NULL;
1784         unsigned len, ret, offset = 0;
1785         struct vpool *vpool;
1786         struct virtio_net_data_ll *dev_ll = ll_root_used;
1787         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1788         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1789         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1790
1791         /*Add packet to the port tx queue*/
1792         tx_q = &tx_queue_zcp[vmdq_rx_q];
1793         len = tx_q->len;
1794
1795         /* Allocate an mbuf and populate the structure. */
1796         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1797         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1798         if (unlikely(mbuf == NULL)) {
1799                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1800                 RTE_LOG(ERR, VHOST_DATA,
1801                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1802                         dev->device_fh);
1803                 put_desc_to_used_list_zcp(vq, desc_idx);
1804                 return;
1805         }
1806
1807         if (vm2vm_mode == VM2VM_HARDWARE) {
1808                 /* Avoid using a VLAN tag from any VM for an external packet, such as
1809                  * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1810                  * selection: the MAC address marks it as an external packet that
1811                  * should go out to the network, while the VLAN tag marks it as a
1812                  * VM-to-VM packet that should be forwarded to another VM. The hardware
1813                  * cannot resolve such an ambiguous situation, so the packet would be lost.
1814                  */
1815                 vlan_tag = external_pkt_default_vlan_tag;
1816                 while (dev_ll != NULL) {
1817                         if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1818                                 ether_addr_cmp(&(pkt_hdr->d_addr),
1819                                 &dev_ll->vdev->mac_address)) {
1820
1821                                 /*
1822                                  * Drop the packet if the TX packet is destined
1823                                  * for the TX device.
1824                                  */
1825                                 if (unlikely(dev_ll->vdev->dev->device_fh
1826                                         == dev->device_fh)) {
1827                                         LOG_DEBUG(VHOST_DATA,
1828                                         "(%"PRIu64") TX: Source and destination"
1829                                         "MAC addresses are the same. Dropping "
1830                                         "packet.\n",
1831                                         dev_ll->vdev->dev->device_fh);
1832                                         MBUF_HEADROOM_UINT32(mbuf)
1833                                                 = (uint32_t)desc_idx;
1834                                         __rte_mbuf_raw_free(mbuf);
1835                                         return;
1836                                 }
1837
1838                                 /*
1839                                  * Offset the packet length by 4 bytes to account for
1840                                  * the VLAN header the HW strips when L2-switching back.
1841                                  */
1842                                 offset = 4;
1843                                 vlan_tag =
1844                                 (uint16_t)
1845                                 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1846
1847                                 LOG_DEBUG(VHOST_DATA,
1848                                 "(%"PRIu64") TX: pkt to local VM device id:"
1849                                 "(%"PRIu64") vlan tag: %d.\n",
1850                                 dev->device_fh, dev_ll->vdev->dev->device_fh,
1851                                 vlan_tag);
1852
1853                                 break;
1854                         }
1855                         dev_ll = dev_ll->next;
1856                 }
1857         }
1858
1859         mbuf->nb_segs = m->nb_segs;
1860         mbuf->next = m->next;
1861         mbuf->data_len = m->data_len + offset;
1862         mbuf->pkt_len = mbuf->data_len;
1863         if (unlikely(need_copy)) {
1864                 /* Copy the packet contents to the mbuf. */
1865                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1866                         rte_pktmbuf_mtod(m, void *),
1867                         m->data_len);
1868         } else {
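                     /* Zero copy: point the new mbuf directly at the guest buffer. */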
1869                 mbuf->data_off = m->data_off;
1870                 mbuf->buf_physaddr = m->buf_physaddr;
1871                 mbuf->buf_addr = m->buf_addr;
1872         }
1873         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1874         mbuf->vlan_tci = vlan_tag;
1875         mbuf->l2_len = sizeof(struct ether_hdr);
1876         mbuf->l3_len = sizeof(struct ipv4_hdr);
1877         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1878
1879         tx_q->m_table[len] = mbuf;
1880         len++;
1881
1882         LOG_DEBUG(VHOST_DATA,
1883                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1884                 dev->device_fh,
1885                 mbuf->nb_segs,
1886                 (mbuf->next == NULL) ? "null" : "non-null");
1887
1888         if (enable_stats) {
1889                 dev_statistics[dev->device_fh].tx_total++;
1890                 dev_statistics[dev->device_fh].tx++;
1891         }
1892
1893         if (unlikely(len == MAX_PKT_BURST)) {
1894                 m_table = (struct rte_mbuf **)tx_q->m_table;
1895                 ret = rte_eth_tx_burst(ports[0],
1896                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1897
1898                 /*
1899                  * Free any buffers not handled by TX and update
1900                  * the port stats.
1901                  */
1902                 if (unlikely(ret < len)) {
1903                         do {
1904                                 rte_pktmbuf_free(m_table[ret]);
1905                         } while (++ret < len);
1906                 }
1907
1908                 len = 0;
1909                 txmbuf_clean_zcp(dev, vpool);
1910         }
1911
1912         tx_q->len = len;
1913
1914         return;
1915 }
1916
1917 /*
1918  * This function transmits all available packets in the virtio TX queue for one
1919  * virtio-net device. On the first packet it learns the MAC address and
1920  * sets up VMDQ.
1921  */
1922 static inline void __attribute__((always_inline))
1923 virtio_dev_tx_zcp(struct virtio_net *dev)
1924 {
1925         struct rte_mbuf m;
1926         struct vhost_virtqueue *vq;
1927         struct vring_desc *desc;
1928         uint64_t buff_addr = 0, phys_addr;
1929         uint32_t head[MAX_PKT_BURST];
1930         uint32_t i;
1931         uint16_t free_entries, packet_success = 0;
1932         uint16_t avail_idx;
1933         uint8_t need_copy = 0;
1934         hpa_type addr_type;
1935         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1936
1937         vq = dev->virtqueue[VIRTIO_TXQ];
1938         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1939
1940         /* If there are no available buffers then return. */
1941         if (vq->last_used_idx_res == avail_idx)
1942                 return;
1943
1944         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1945
1946         /* Prefetch available ring to retrieve head indexes. */
1947         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1948
1949         /* Get the number of free entries in the ring */
1950         free_entries = (avail_idx - vq->last_used_idx_res);
1951
1952         /* Limit to MAX_PKT_BURST. */
1953         free_entries
1954                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1955
1956         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1957                 dev->device_fh, free_entries);
1958
1959         /* Retrieve all of the head indexes first to avoid caching issues. */
1960         for (i = 0; i < free_entries; i++)
1961                 head[i]
1962                         = vq->avail->ring[(vq->last_used_idx_res + i)
1963                         & (vq->size - 1)];
1964
1965         vq->last_used_idx_res += free_entries;
1966
1967         /* Prefetch descriptor index. */
1968         rte_prefetch0(&vq->desc[head[packet_success]]);
1969         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1970
1971         while (packet_success < free_entries) {
1972                 desc = &vq->desc[head[packet_success]];
1973
1974                 /* Discard first buffer as it is the virtio header */
1975                 desc = &vq->desc[desc->next];
1976
1977                 /* Buffer address translation. */
1978                 buff_addr = gpa_to_vva(dev, desc->addr);
1979                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1980
1981                 if (likely(packet_success < (free_entries - 1)))
1982                         /* Prefetch descriptor index. */
1983                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1984
1985                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1986                         RTE_LOG(ERR, VHOST_DATA,
1987                                 "(%"PRIu64") Invalid frame buffer address found"
1988                                 "when TX packets!\n",
1989                                 dev->device_fh);
1990                         packet_success++;
1991                         continue;
1992                 }
1993
1994                 /* Prefetch buffer address. */
1995                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1996
1997                 /*
1998                  * Setup dummy mbuf. This is copied to a real mbuf if
1999                  * transmitted out the physical port.
2000                  */
2001                 m.data_len = desc->len;
2002                 m.nb_segs = 1;
2003                 m.next = NULL;
2004                 m.data_off = 0;
2005                 m.buf_addr = (void *)(uintptr_t)buff_addr;
2006                 m.buf_physaddr = phys_addr;
2007
2008                 /*
2009                  * Check if the frame buffer address from guest crosses
2010                  * sub-region or not.
2011                  */
2012                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2013                         RTE_LOG(ERR, VHOST_DATA,
2014                                 "(%"PRIu64") Frame buffer address cross "
2015                                 "sub-regioin found when attaching TX frame "
2016                                 "buffer address!\n",
2017                                 dev->device_fh);
2018                         need_copy = 1;
2019                 } else
2020                         need_copy = 0;
2021
2022                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2023
2024                 /*
2025                  * If this is the first received packet we need to learn
2026                  * the MAC and setup VMDQ
2027                  */
2028                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2029                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2030                                 /*
2031                                  * Discard frame if device is scheduled for
2032                                  * removal or a duplicate MAC address is found.
2033                                  */
2034                                 packet_success += free_entries;
2035                                 vq->last_used_idx += packet_success;
2036                                 break;
2037                         }
2038                 }
2039
2040                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2041                 packet_success++;
2042         }
2043 }
2044
2045 /*
2046  * This function is called by each data core. It handles all RX/TX registered
2047  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2048  * addresses are compared with all devices in the main linked list.
2049  */
2050 static int
2051 switch_worker_zcp(__attribute__((unused)) void *arg)
2052 {
2053         struct virtio_net *dev = NULL;
2054         struct vhost_dev  *vdev = NULL;
2055         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2056         struct virtio_net_data_ll *dev_ll;
2057         struct mbuf_table *tx_q;
2058         volatile struct lcore_ll_info *lcore_ll;
2059         const uint64_t drain_tsc
2060                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2061                 * BURST_TX_DRAIN_US;
2062         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2063         unsigned ret;
2064         const uint16_t lcore_id = rte_lcore_id();
2065         uint16_t count_in_ring, rx_count = 0;
2066
2067         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2068
2069         lcore_ll = lcore_info[lcore_id].lcore_ll;
2070         prev_tsc = 0;
2071
2072         while (1) {
2073                 cur_tsc = rte_rdtsc();
2074
2075                 /* TX burst queue drain */
2076                 diff_tsc = cur_tsc - prev_tsc;
2077                 if (unlikely(diff_tsc > drain_tsc)) {
2078                         /*
2079                          * Get mbufs from vpool.pool, detach them and
2080                          * put them back into vpool.ring.
2081                          */
2082                         dev_ll = lcore_ll->ll_root_used;
2083                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2084                                 /* Get virtio device ID */
2085                                 vdev = dev_ll->vdev;
2086                                 dev = vdev->dev;
2087
2088                                 if (likely(!vdev->remove)) {
2089                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2090                                         if (tx_q->len) {
2091                                                 LOG_DEBUG(VHOST_DATA,
2092                                                 "TX queue drained after timeout"
2093                                                 " with burst size %u\n",
2094                                                 tx_q->len);
2095
2096                                                 /*
2097                                                  * Tx any packets in the queue
2098                                                  */
2099                                                 ret = rte_eth_tx_burst(
2100                                                         ports[0],
2101                                                         (uint16_t)tx_q->txq_id,
2102                                                         (struct rte_mbuf **)
2103                                                         tx_q->m_table,
2104                                                         (uint16_t)tx_q->len);
2105                                                 if (unlikely(ret < tx_q->len)) {
2106                                                         do {
2107                                                                 rte_pktmbuf_free(
2108                                                                         tx_q->m_table[ret]);
2109                                                         } while (++ret < tx_q->len);
2110                                                 }
2111                                                 tx_q->len = 0;
2112
2113                                                 txmbuf_clean_zcp(dev,
2114                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2115                                         }
2116                                 }
2117                                 dev_ll = dev_ll->next;
2118                         }
2119                         prev_tsc = cur_tsc;
2120                 }
2121
2122                 rte_prefetch0(lcore_ll->ll_root_used);
2123
2124                 /*
2125                  * Inform the configuration core that we have exited the linked
2126                  * list and that no devices are in use if requested.
2127                  */
2128                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2129                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2130
2131                 /* Process devices */
2132                 dev_ll = lcore_ll->ll_root_used;
2133
2134                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2135                         vdev = dev_ll->vdev;
2136                         dev  = vdev->dev;
2137                         if (unlikely(vdev->remove)) {
2138                                 dev_ll = dev_ll->next;
2139                                 unlink_vmdq(vdev);
2140                                 vdev->ready = DEVICE_SAFE_REMOVE;
2141                                 continue;
2142                         }
2143
2144                         if (likely(vdev->ready == DEVICE_RX)) {
2145                                 uint32_t index = vdev->vmdq_rx_q;
2146                                 uint16_t i;
2147                                 count_in_ring
2148                                 = rte_ring_count(vpool_array[index].ring);
2149                                 uint16_t free_entries
2150                                 = (uint16_t)get_available_ring_num_zcp(dev);
2151
2152                                 /*
2153                                  * Attach all mbufs in vpool.ring and put back
2154                                  * into vpool.pool.
2155                                  */
2156                                 for (i = 0;
2157                                 i < RTE_MIN(free_entries,
2158                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2159                                 i++)
2160                                         attach_rxmbuf_zcp(dev);
2161
2162                                 /* Handle guest RX */
2163                                 rx_count = rte_eth_rx_burst(ports[0],
2164                                         vdev->vmdq_rx_q, pkts_burst,
2165                                         MAX_PKT_BURST);
2166
2167                                 if (rx_count) {
2168                                         ret_count = virtio_dev_rx_zcp(dev,
2169                                                         pkts_burst, rx_count);
2170                                         if (enable_stats) {
2171                                                 dev_statistics[dev->device_fh].rx_total
2172                                                         += rx_count;
2173                                                 dev_statistics[dev->device_fh].rx
2174                                                         += ret_count;
2175                                         }
2176                                         while (likely(rx_count)) {
2177                                                 rx_count--;
2178                                                 pktmbuf_detach_zcp(
2179                                                         pkts_burst[rx_count]);
2180                                                 rte_ring_sp_enqueue(
2181                                                         vpool_array[index].ring,
2182                                                         (void *)pkts_burst[rx_count]);
2183                                         }
2184                                 }
2185                         }
2186
2187                         if (likely(!vdev->remove))
2188                                 /* Handle guest TX */
2189                                 virtio_dev_tx_zcp(dev);
2190
2191                         /* Move to the next device in the list */
2192                         dev_ll = dev_ll->next;
2193                 }
2194         }
2195
2196         return 0;
2197 }
2198
2199
2200 /*
2201  * Add an entry to a used linked list. A free entry must first be found
2202  * in the free linked list using get_data_ll_free_entry();
2203  */
2204 static void
2205 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2206         struct virtio_net_data_ll *ll_dev)
2207 {
2208         struct virtio_net_data_ll *ll = *ll_root_addr;
2209
2210         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2211         ll_dev->next = NULL;
2212         rte_compiler_barrier();
2213
2214         /* If ll == NULL then this is the first device. */
2215         if (ll) {
2216                 /* Increment to the tail of the linked list. */
2217                 while (ll->next != NULL)
2218                         ll = ll->next;
2219
2220                 ll->next = ll_dev;
2221         } else {
2222                 *ll_root_addr = ll_dev;
2223         }
2224 }
2225
2226 /*
2227  * Remove an entry from a used linked list. The entry must then be added to
2228  * the free linked list using put_data_ll_free_entry().
2229  */
2230 static void
2231 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2232         struct virtio_net_data_ll *ll_dev,
2233         struct virtio_net_data_ll *ll_dev_last)
2234 {
2235         struct virtio_net_data_ll *ll = *ll_root_addr;
2236
2237         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2238                 return;
2239
2240         if (ll_dev == ll)
2241                 *ll_root_addr = ll_dev->next;
2242         else
2243                 if (likely(ll_dev_last != NULL))
2244                         ll_dev_last->next = ll_dev->next;
2245                 else
2246                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2247 }
2248
2249 /*
2250  * Find and return an entry from the free linked list.
2251  */
2252 static struct virtio_net_data_ll *
2253 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2254 {
2255         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2256         struct virtio_net_data_ll *ll_dev;
2257
2258         if (ll_free == NULL)
2259                 return NULL;
2260
2261         ll_dev = ll_free;
2262         *ll_root_addr = ll_free->next;
2263
2264         return ll_dev;
2265 }
2266
2267 /*
2268  * Place an entry back on to the free linked list.
2269  */
2270 static void
2271 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2272         struct virtio_net_data_ll *ll_dev)
2273 {
2274         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2275
2276         if (ll_dev == NULL)
2277                 return;
2278
2279         ll_dev->next = ll_free;
2280         *ll_root_addr = ll_dev;
2281 }
2282
2283 /*
2284  * Creates a linked list of a given size.
2285  */
2286 static struct virtio_net_data_ll *
2287 alloc_data_ll(uint32_t size)
2288 {
2289         struct virtio_net_data_ll *ll_new;
2290         uint32_t i;
2291
2292         /* Malloc and then chain the linked list. */
2293         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2294         if (ll_new == NULL) {
2295                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2296                 return NULL;
2297         }
2298
2299         for (i = 0; i < size - 1; i++) {
2300                 ll_new[i].vdev = NULL;
2301                 ll_new[i].next = &ll_new[i+1];
2302         }
2303         ll_new[i].next = NULL;
2304
2305         return (ll_new);
2306 }
2307
2308 /*
2309  * Create the main linked list along with each individual core's linked list. A used and a free list
2310  * are created to manage entries.
2311  */
2312 static int
2313 init_data_ll (void)
2314 {
2315         int lcore;
2316
2317         RTE_LCORE_FOREACH_SLAVE(lcore) {
2318                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2319                 if (lcore_info[lcore].lcore_ll == NULL) {
2320                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2321                         return -1;
2322                 }
2323
2324                 lcore_info[lcore].lcore_ll->device_num = 0;
2325                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2326                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2327                 if (num_devices % num_switching_cores)
2328                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2329                 else
2330                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2331         }
2332
2333         /* Allocate devices up to a maximum of MAX_DEVICES. */
2334         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2335
2336         return 0;
2337 }
2338
2339 /*
2340  * Set virtqueue flags so that we do not receive interrupts.
2341  */
2342 static void
2343 set_irq_status (struct virtio_net *dev)
2344 {
2345         dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2346         dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2347 }
2348
2349 /*
2350  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2351  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2352  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2353  */
2354 static void
2355 destroy_device (volatile struct virtio_net *dev)
2356 {
2357         struct virtio_net_data_ll *ll_lcore_dev_cur;
2358         struct virtio_net_data_ll *ll_main_dev_cur;
2359         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2360         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2361         struct vhost_dev *vdev;
2362         int lcore;
2363
2364         dev->flags &= ~VIRTIO_DEV_RUNNING;
2365
2366         vdev = (struct vhost_dev *)dev->priv;
2367         /*set the remove flag. */
2368         vdev->remove = 1;
2369         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2370                 rte_pause();
2371         }
2372
2373         /* Search for entry to be removed from lcore ll */
2374         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2375         while (ll_lcore_dev_cur != NULL) {
2376                 if (ll_lcore_dev_cur->vdev == vdev) {
2377                         break;
2378                 } else {
2379                         ll_lcore_dev_last = ll_lcore_dev_cur;
2380                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2381                 }
2382         }
2383
2384         if (ll_lcore_dev_cur == NULL) {
2385                 RTE_LOG(ERR, VHOST_CONFIG,
2386                         "(%"PRIu64") Failed to find the dev to be destroy.\n",
2387                         dev->device_fh);
2388                 return;
2389         }
2390
2391         /* Search for entry to be removed from main ll */
2392         ll_main_dev_cur = ll_root_used;
2393         ll_main_dev_last = NULL;
2394         while (ll_main_dev_cur != NULL) {
2395                 if (ll_main_dev_cur->vdev == vdev) {
2396                         break;
2397                 } else {
2398                         ll_main_dev_last = ll_main_dev_cur;
2399                         ll_main_dev_cur = ll_main_dev_cur->next;
2400                 }
2401         }
2402
2403         /* Remove entries from the lcore and main ll. */
2404         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2405         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2406
2407         /* Set the dev_removal_flag on each lcore. */
2408         RTE_LCORE_FOREACH_SLAVE(lcore) {
2409                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2410         }
2411
2412         /*
2413          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2414          * they can no longer access the device removed from the linked lists and that the devices
2415          * are no longer in use.
2416          */
2417         RTE_LCORE_FOREACH_SLAVE(lcore) {
2418                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2419                         rte_pause();
2420                 }
2421         }
2422
2423         /* Add the entries back to the lcore and main free ll.*/
2424         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2425         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2426
2427         /* Decrement number of device on the lcore. */
2428         lcore_info[vdev->coreid].lcore_ll->device_num--;
2429
2430         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2431
2432         if (zero_copy) {
2433                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2434
2435                 /* Stop the RX queue. */
2436                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2437                         LOG_DEBUG(VHOST_CONFIG,
2438                                 "(%"PRIu64") In destroy_device: Failed to stop "
2439                                 "rx queue:%d\n",
2440                                 dev->device_fh,
2441                                 vdev->vmdq_rx_q);
2442                 }
2443
2444                 LOG_DEBUG(VHOST_CONFIG,
2445                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2446                         "mempool back to ring for RX queue: %d\n",
2447                         dev->device_fh, vdev->vmdq_rx_q);
2448
2449                 mbuf_destroy_zcp(vpool);
2450
2451                 /* Stop the TX queue. */
2452                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2453                         LOG_DEBUG(VHOST_CONFIG,
2454                                 "(%"PRIu64") In destroy_device: Failed to "
2455                                 "stop tx queue:%d\n",
2456                                 dev->device_fh, vdev->vmdq_rx_q);
2457                 }
2458
2459                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2460
2461                 LOG_DEBUG(VHOST_CONFIG,
2462                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2463                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2464                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2465                         dev->device_fh);
2466
2467                 mbuf_destroy_zcp(vpool);
2468         }
2469         rte_free(vdev);
2470
2471 }
2472
2473 /*
2474  * A new device is added to a data core. First the device is added to the main linked list
2475  * and then allocated to a specific data core.
2476  */
2477 static int
2478 new_device (struct virtio_net *dev)
2479 {
2480         struct virtio_net_data_ll *ll_dev;
2481         int lcore, core_add = 0;
2482         uint32_t device_num_min = num_devices;
2483         struct vhost_dev *vdev;
2484
2485         vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2486         if (vdev == NULL) {
2487                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2488                         dev->device_fh);
2489                 return -1;
2490         }
2491         vdev->dev = dev;
2492         dev->priv = vdev;
2493
2494         /* Add device to main ll */
2495         ll_dev = get_data_ll_free_entry(&ll_root_free);
2496         if (ll_dev == NULL) {
2497                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2498                         "of %d devices per core has been reached\n",
2499                         dev->device_fh, num_devices);
2500                 rte_free(vdev);
2501                 return -1;
2502         }
2503         ll_dev->vdev = vdev;
2504         add_data_ll_entry(&ll_root_used, ll_dev);
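             /* Statically map this device to a VMDQ RX queue based on its device id. */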
2505         vdev->vmdq_rx_q
2506                 = dev->device_fh * (num_queues / num_devices);
2507
2508         if (zero_copy) {
2509                 uint32_t index = vdev->vmdq_rx_q;
2510                 uint32_t count_in_ring, i;
2511                 struct mbuf_table *tx_q;
2512
2513                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2514
2515                 LOG_DEBUG(VHOST_CONFIG,
2516                         "(%"PRIu64") in new_device: mbuf count in mempool "
2517                         "before attach is: %d\n",
2518                         dev->device_fh,
2519                         rte_mempool_count(vpool_array[index].pool));
2520                 LOG_DEBUG(VHOST_CONFIG,
2521                         "(%"PRIu64") in new_device: mbuf count in ring "
2522                         "before attach is: %d\n",
2523                         dev->device_fh, count_in_ring);
2524
2525                 /*
2526                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2527                  */
2528                 for (i = 0; i < count_in_ring; i++)
2529                         attach_rxmbuf_zcp(dev);
2530
2531                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2532                         "mempool after attach is: %d\n",
2533                         dev->device_fh,
2534                         rte_mempool_count(vpool_array[index].pool));
2535                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2536                         "ring after attach is: %d\n",
2537                         dev->device_fh,
2538                         rte_ring_count(vpool_array[index].ring));
2539
2540                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2541                 tx_q->txq_id = vdev->vmdq_rx_q;
2542
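
                     /*
                      * The RX/TX queues were configured with deferred start for
                      * zero copy (see MAIN), so they are started only now that
                      * the guest's buffers have been attached.
                      */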
2543                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2544                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2545
2546                         LOG_DEBUG(VHOST_CONFIG,
2547                                 "(%"PRIu64") In new_device: Failed to start "
2548                                 "tx queue:%d\n",
2549                                 dev->device_fh, vdev->vmdq_rx_q);
2550
2551                         mbuf_destroy_zcp(vpool);
2552                         rte_free(vdev);
2553                         return -1;
2554                 }
2555
2556                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2557                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2558
2559                         LOG_DEBUG(VHOST_CONFIG,
2560                                 "(%"PRIu64") In new_device: Failed to start "
2561                                 "rx queue:%d\n",
2562                                 dev->device_fh, vdev->vmdq_rx_q);
2563
2564                         /* Stop the TX queue. */
2565                         if (rte_eth_dev_tx_queue_stop(ports[0],
2566                                 vdev->vmdq_rx_q) != 0) {
2567                                 LOG_DEBUG(VHOST_CONFIG,
2568                                         "(%"PRIu64") In new_device: Failed to "
2569                                         "stop tx queue:%d\n",
2570                                         dev->device_fh, vdev->vmdq_rx_q);
2571                         }
2572
2573                         mbuf_destroy_zcp(vpool);
2574                         rte_free(vdev);
2575                         return -1;
2576                 }
2577
2578         }
2579
2580         /* Reset ready flag; the device starts in the MAC-learning state. */
2581         vdev->ready = DEVICE_MAC_LEARNING;
2582         vdev->remove = 0;
2583
2584         /* Find a suitable lcore to add the device. */
2585         RTE_LCORE_FOREACH_SLAVE(lcore) {
2586                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2587                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2588                         core_add = lcore;
2589                 }
2590         }
2591         /* Add device to lcore ll */
2592         ll_dev->dev->coreid = core_add;
2593         ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2594         if (ll_dev == NULL) {
2595                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2596                 vdev->ready = DEVICE_SAFE_REMOVE;
2597                 destroy_device(dev);
2598                 rte_free(vdev);
2599                 return -1;
2600         }
2601         ll_dev->vdev = vdev;
2602         vdev->coreid = core_add;
2603
2604         add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2605
2606         /* Initialize device stats */
2607         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2608
2609         /* Disable guest notifications; the queues are polled by the data core. */
2610         set_irq_status(dev);
2611         lcore_info[vdev->coreid].lcore_ll->device_num++;
2612         dev->flags |= VIRTIO_DEV_RUNNING;
2613
2614         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2615
2616         return 0;
2617 }
2618
2619 /*
2620  * These callbacks allow devices to be added to the data core when configuration
2621  * has fully completed.
2622  */
2623 static const struct virtio_net_device_ops virtio_net_device_ops =
2624 {
2625         .new_device =  new_device,
2626         .destroy_device = destroy_device,
2627 };
2628
2629 /*
2630  * This thread wakes up periodically to print statistics if the user has
2631  * enabled them.
2632  */
2633 static void
2634 print_stats(void)
2635 {
2636         struct virtio_net_data_ll *dev_ll;
2637         uint64_t tx_dropped, rx_dropped;
2638         uint64_t tx, tx_total, rx, rx_total;
2639         uint32_t device_fh;
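             /* ANSI escape sequences: clear the screen and move the cursor to the top-left corner. */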
2640         const char clr[] = { 27, '[', '2', 'J', '\0' };
2641         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2642
2643         while(1) {
2644                 sleep(enable_stats);
2645
2646                 /* Clear screen and move to top left */
2647                 printf("%s%s", clr, top_left);
2648
2649                 printf("\nDevice statistics ====================================");
2650
2651                 dev_ll = ll_root_used;
2652                 while (dev_ll != NULL) {
2653                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2654                         tx_total = dev_statistics[device_fh].tx_total;
2655                         tx = dev_statistics[device_fh].tx;
2656                         tx_dropped = tx_total - tx;
2657                         if (zero_copy == 0) {
2658                                 rx_total = rte_atomic64_read(
2659                                         &dev_statistics[device_fh].rx_total_atomic);
2660                                 rx = rte_atomic64_read(
2661                                         &dev_statistics[device_fh].rx_atomic);
2662                         } else {
2663                                 rx_total = dev_statistics[device_fh].rx_total;
2664                                 rx = dev_statistics[device_fh].rx;
2665                         }
2666                         rx_dropped = rx_total - rx;
2667
2668                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2669                                         "\nTX total:            %"PRIu64""
2670                                         "\nTX dropped:          %"PRIu64""
2671                                         "\nTX successful:               %"PRIu64""
2672                                         "\nRX total:            %"PRIu64""
2673                                         "\nRX dropped:          %"PRIu64""
2674                                         "\nRX successful:               %"PRIu64"",
2675                                         device_fh,
2676                                         tx_total,
2677                                         tx_dropped,
2678                                         tx,
2679                                         rx_total,
2680                                         rx_dropped,
2681                                         rx);
2682
2683                         dev_ll = dev_ll->next;
2684                 }
2685                 printf("\n======================================================\n");
2686         }
2687 }
2688
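     /*
      * Create an mbuf mempool and a companion ring for one zero-copy queue. The
      * ring holds the mbufs that are currently not attached to guest buffers.
      */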
2689 static void
2690 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2691         char *ring_name, uint32_t nb_mbuf)
2692 {
2693         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2694         vpool_array[index].pool
2695                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2696                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2697                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2698                 rte_pktmbuf_init, NULL, socket, 0);
2699         if (vpool_array[index].pool != NULL) {
2700                 vpool_array[index].ring
2701                         = rte_ring_create(ring_name,
2702                                 rte_align32pow2(nb_mbuf + 1),
2703                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2704                 if (likely(vpool_array[index].ring != NULL)) {
2705                         LOG_DEBUG(VHOST_CONFIG,
2706                                 "in setup_mempool_tbl: mbuf count in "
2707                                 "mempool is: %d\n",
2708                                 rte_mempool_count(vpool_array[index].pool));
2709                         LOG_DEBUG(VHOST_CONFIG,
2710                                 "in setup_mempool_tbl: mbuf count in "
2711                                 "ring is: %d\n",
2712                                 rte_ring_count(vpool_array[index].ring));
2713                 } else {
2714                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2715                                 ring_name);
2716                 }
2717
2718                 /* Need to account for the headroom. */
2719                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2720         } else {
2721                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2722         }
2723 }
2724
2725
2726 /*
2727  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2728  * device is also registered here to handle the IOCTLs.
2729  */
2730 int
2731 MAIN(int argc, char *argv[])
2732 {
2733         struct rte_mempool *mbuf_pool = NULL;
2734         unsigned lcore_id, core_id = 0;
2735         unsigned nb_ports, valid_num_ports;
2736         int ret;
2737         uint8_t portid, queue_id = 0;
2738         static pthread_t tid;
2739
2740         /* init EAL */
2741         ret = rte_eal_init(argc, argv);
2742         if (ret < 0)
2743                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2744         argc -= ret;
2745         argv += ret;
2746
2747         /* parse app arguments */
2748         ret = us_vhost_parse_args(argc, argv);
2749         if (ret < 0)
2750                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2751
2752         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2753                 if (rte_lcore_is_enabled(lcore_id))
2754                         lcore_ids[core_id ++] = lcore_id;
2755
2756         if (rte_lcore_count() > RTE_MAX_LCORE)
2757                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2758
2759         /* Set the number of switching cores available. */
2760         num_switching_cores = rte_lcore_count()-1;
2761
2762         /* Get the number of physical ports. */
2763         nb_ports = rte_eth_dev_count();
2764         if (nb_ports > RTE_MAX_ETHPORTS)
2765                 nb_ports = RTE_MAX_ETHPORTS;
2766
2767         /*
2768          * Update the global variable num_ports and the global array ports[],
2769          * and get the number of valid ports according to the system port count.
2770          */
2771         valid_num_ports = check_ports_num(nb_ports);
2772
2773         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2774                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2775                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2776                 return -1;
2777         }
2778
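             /*
              * Create the mbuf pools: a single pool shared by all queues when
              * copying packets, or one pool plus ring per queue in zero-copy mode.
              */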
2779         if (zero_copy == 0) {
2780                 /* Create the mbuf pool. */
2781                 mbuf_pool = rte_mempool_create(
2782                                 "MBUF_POOL",
2783                                 NUM_MBUFS_PER_PORT
2784                                 * valid_num_ports,
2785                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2786                                 sizeof(struct rte_pktmbuf_pool_private),
2787                                 rte_pktmbuf_pool_init, NULL,
2788                                 rte_pktmbuf_init, NULL,
2789                                 rte_socket_id(), 0);
2790                 if (mbuf_pool == NULL)
2791                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2792
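                     /* Without zero copy, every queue shares the single mbuf pool created above. */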
2793                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2794                         vpool_array[queue_id].pool = mbuf_pool;
2795
2796                 if (vm2vm_mode == VM2VM_HARDWARE) {
2797                         /* Enable VT loop back to let L2 switch to do it. */
2798                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2799                         LOG_DEBUG(VHOST_CONFIG,
2800                                 "Enable loop back for L2 switch in vmdq.\n");
2801                 }
2802         } else {
2803                 uint32_t nb_mbuf;
2804                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2805                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2806
2807                 /*
2808                  * Zero copy defers queue RX/TX start to the time when guest
2809                  * finishes its startup and packet buffers from that guest are
2810                  * available.
2811                  */
2812                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2813                 rx_conf_default.rx_drop_en = 0;
2814                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
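
                     /*
                      * Size each RX mempool for the RX descriptors plus the
                      * per-core cache and burst requirements.
                      */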
2815                 nb_mbuf = num_rx_descriptor
2816                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2817                         + num_switching_cores * MAX_PKT_BURST;
2818
2819                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2820                         snprintf(pool_name, sizeof(pool_name),
2821                                 "rxmbuf_pool_%u", queue_id);
2822                         snprintf(ring_name, sizeof(ring_name),
2823                                 "rxmbuf_ring_%u", queue_id);
2824                         setup_mempool_tbl(rte_socket_id(), queue_id,
2825                                 pool_name, ring_name, nb_mbuf);
2826                 }
2827
2828                 nb_mbuf = num_tx_descriptor
2829                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2830                                 + num_switching_cores * MAX_PKT_BURST;
2831
2832                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2833                         snprintf(pool_name, sizeof(pool_name),
2834                                 "txmbuf_pool_%u", queue_id);
2835                         snprintf(ring_name, sizeof(ring_name),
2836                                 "txmbuf_ring_%u", queue_id);
2837                         setup_mempool_tbl(rte_socket_id(),
2838                                 (queue_id + MAX_QUEUES),
2839                                 pool_name, ring_name, nb_mbuf);
2840                 }
2841
2842                 if (vm2vm_mode == VM2VM_HARDWARE) {
2843                         /* Enable VT loop back to let L2 switch to do it. */
2844                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2845                         LOG_DEBUG(VHOST_CONFIG,
2846                                 "Enable loop back for L2 switch in vmdq.\n");
2847                 }
2848         }
2849         /* Set log level. */
2850         rte_set_log_level(LOG_LEVEL);
2851
2852         /* initialize all ports */
2853         for (portid = 0; portid < nb_ports; portid++) {
2854                 /* skip ports that are not enabled */
2855                 if ((enabled_port_mask & (1 << portid)) == 0) {
2856                         RTE_LOG(INFO, VHOST_PORT,
2857                                 "Skipping disabled port %d\n", portid);
2858                         continue;
2859                 }
2860                 if (port_init(portid) != 0)
2861                         rte_exit(EXIT_FAILURE,
2862                                 "Cannot initialize network ports\n");
2863         }
2864
2865         /* Initialise all linked lists. */
2866         if (init_data_ll() == -1)
2867                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2868
2869         /* Initialize device stats */
2870         memset(&dev_statistics, 0, sizeof(dev_statistics));
2871
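             /*
              * print_stats() loops forever, so it runs in its own pthread; this
              * thread goes on to run the CUSE session loop below.
              */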
2872         /* Enable stats if the user option is set. */
2873         if (enable_stats)
2874                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
2875
2876         /* Launch all data cores. */
2877         if (zero_copy == 0) {
2878                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2879                         rte_eal_remote_launch(switch_worker,
2880                                 mbuf_pool, lcore_id);
2881                 }
2882         } else {
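                     /*
                      * Zero copy: before launching the data cores, seed each
                      * vpool ring with all of the mbufs from its mempool.
                      */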
2883                 uint32_t count_in_mempool, index, i;
2884                 for (index = 0; index < 2*MAX_QUEUES; index++) {
2885                         /* For all RX and TX queues. */
2886                         count_in_mempool
2887                                 = rte_mempool_count(vpool_array[index].pool);
2888
2889                         /*
2890                          * Transfer all unattached mbufs from vpool.pool
2891                          * to vpool.ring.
2892                          */
2893                         for (i = 0; i < count_in_mempool; i++) {
2894                                 struct rte_mbuf *mbuf
2895                                         = __rte_mbuf_raw_alloc(
2896                                                 vpool_array[index].pool);
2897                                 rte_ring_sp_enqueue(vpool_array[index].ring,
2898                                                 (void *)mbuf);
2899                         }
2900
2901                         LOG_DEBUG(VHOST_CONFIG,
2902                                 "in MAIN: initial mbuf count in mempool "
2903                                 "is: %d\n", count_in_mempool);
2904                         LOG_DEBUG(VHOST_CONFIG,
2905                                 "in MAIN: initial mbuf count in ring is:"
2906                                 " %d\n",
2907                                 rte_ring_count(vpool_array[index].ring));
2908                 }
2909
2910                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
2911                         rte_eal_remote_launch(switch_worker_zcp, NULL,
2912                                 lcore_id);
2913         }
2914
2915         /* Register CUSE device to handle IOCTLs. */
2916         ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks());
2917         if (ret != 0)
2918                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
2919
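             /*
              * Register the new_device/destroy_device callbacks so devices are
              * added to and removed from the data cores as guests come and go.
              */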
2920         init_virtio_net(&virtio_net_device_ops);
2921
2922         /* Start CUSE session. */
2923         start_cuse_session_loop();
2924         return 0;
2925
2926 }
2927