examples/vhost: avoid inserting vlan twice
[dpdk.git] examples/vhost/main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 512
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) + \
65                             (num_switching_cores * MAX_PKT_BURST) + \
66                             (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
67                             (num_switching_cores * MBUF_CACHE_SIZE))
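/*
 * A worked example under assumed values (not authoritative): with the
 * defaults below (MAX_QUEUES 512, 1024 RX descriptors) and, say, 8
 * switching cores, this comes to 512*1024 + 8*32 + 8*512 + 8*128 =
 * 529,664 mbufs per port. RTE_TEST_RX_DESC_DEFAULT and MBUF_CACHE_SIZE are
 * defined further down, which is fine: macros expand at their point of use.
 */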
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71
72 /*
73  * No frame data buffers allocated by the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers, and
75  * vhost uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79         + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81
82 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
84
85 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
87
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX                       1
93 #define DEVICE_SAFE_REMOVE      2
94
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102
103 /*
104  * These two macros need refining for the legacy and DPDK based front ends:
105  * max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
106  * then rounded down to a power of 2.
107  */
108 /*
109  * For the legacy front end: 128 descriptors,
110  * half for virtio headers, the other half for mbufs.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117                 + sizeof(struct rte_mbuf)))
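/*
 * This aliases the first word of the headroom, which starts immediately
 * after the rte_mbuf struct. The zero copy path uses it as a per-mbuf
 * scratch slot (presumably to remember the originating vring descriptor
 * index).
 */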
118
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
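/*
 * A power of two has a single set bit, so x & (x-1) clears that bit and
 * yields 0; e.g. POWEROF2(64) is true, POWEROF2(48) is false. Note that
 * POWEROF2(0) also evaluates true.
 */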
121
122 #define INVALID_PORT_ID 0xFF
123
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
138
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
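/* e.g. with a 64-byte cache line and a 16-byte struct vring_desc this is 4. */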
141
142 /* mask of enabled ports */
143 static uint32_t enabled_port_mask = 0;
144
145 /* Promiscuous mode */
146 static uint32_t promiscuous;
147
148 /*Number of switching cores enabled*/
149 static uint32_t num_switching_cores = 0;
150
151 /* number of devices/queues to support*/
152 static uint32_t num_queues = 0;
153 static uint32_t num_devices;
154
155 /*
156  * Enable zero copy: packet buffers will DMA directly to/from the HW
157  * descriptors; disabled by default.
158  */
159 static uint32_t zero_copy;
160 static int mergeable;
161
162 /* number of RX/TX ring descriptors to use */
163 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
164 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
165
166 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
167 #define MAX_RING_DESC 4096
168
169 struct vpool {
170         struct rte_mempool *pool;
171         struct rte_ring *ring;
172         uint32_t buf_size;
173 } vpool_array[MAX_QUEUES+MAX_QUEUES];
174
175 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
176 typedef enum {
177         VM2VM_DISABLED = 0,
178         VM2VM_SOFTWARE = 1,
179         VM2VM_HARDWARE = 2,
180         VM2VM_LAST
181 } vm2vm_type;
182 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
183
184 /* The type of host physical address translated from guest physical address. */
185 typedef enum {
186         PHYS_ADDR_CONTINUOUS = 0,
187         PHYS_ADDR_CROSS_SUBREG = 1,
188         PHYS_ADDR_INVALID = 2,
189         PHYS_ADDR_LAST
190 } hpa_type;
191
192 /* Enable stats. */
193 static uint32_t enable_stats = 0;
194 /* Enable retries on RX. */
195 static uint32_t enable_retry = 1;
196 /* Specify timeout (in microseconds) between retries on RX. */
197 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
198 /* Specify the number of retries on RX. */
199 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
200
201 /* Character device basename. Can be set by user. */
202 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
203
204 /* Empty VMDQ configuration structure, filled in programmatically. */
205 static struct rte_eth_conf vmdq_conf_default = {
206         .rxmode = {
207                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
208                 .split_hdr_size = 0,
209                 .header_split   = 0, /**< Header Split disabled */
210                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
211                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
212                 /*
213                  * VLAN strip is necessary for 1G NICs such as the I350;
214                  * it fixes a bug where IPv4 forwarding in the guest could
215                  * not forward packets from one virtio dev to another.
216                  */
217                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
218                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
219                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
220         },
221
222         .txmode = {
223                 .mq_mode = ETH_MQ_TX_NONE,
224         },
225         .rx_adv_conf = {
226                 /*
227                  * should be overridden separately in code with
228                  * appropriate values
229                  */
230                 .vmdq_rx_conf = {
231                         .nb_queue_pools = ETH_8_POOLS,
232                         .enable_default_pool = 0,
233                         .default_pool = 0,
234                         .nb_pool_maps = 0,
235                         .pool_map = {{0, 0},},
236                 },
237         },
238 };
239
240 static unsigned lcore_ids[RTE_MAX_LCORE];
241 static uint8_t ports[RTE_MAX_ETHPORTS];
242 static unsigned num_ports = 0; /**< The number of ports specified in command line */
243 static uint16_t num_pf_queues, num_vmdq_queues;
244 static uint16_t vmdq_pool_base, vmdq_queue_base;
245 static uint16_t queues_per_pool;
246
247 static const uint16_t external_pkt_default_vlan_tag = 2000;
248 const uint16_t vlan_tags[] = {
249         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
250         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
251         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
252         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
253         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
254         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
255         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
256         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
257 };
258
259 /* ethernet addresses of ports */
260 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
261
262 /* heads for the main used and free linked lists for the data path. */
263 static struct virtio_net_data_ll *ll_root_used = NULL;
264 static struct virtio_net_data_ll *ll_root_free = NULL;
265
266 /* Array of data core structures containing information on individual core linked lists. */
267 static struct lcore_info lcore_info[RTE_MAX_LCORE];
268
269 /* Used for queueing bursts of TX packets. */
270 struct mbuf_table {
271         unsigned len;
272         unsigned txq_id;
273         struct rte_mbuf *m_table[MAX_PKT_BURST];
274 };
275
276 /* TX queue for each data core. */
277 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
278
279 /* TX queue for each virtio device for zero copy. */
280 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
281
282 /* Vlan header struct used to insert vlan tags on TX. */
283 struct vlan_ethhdr {
284         unsigned char   h_dest[ETH_ALEN];
285         unsigned char   h_source[ETH_ALEN];
286         __be16          h_vlan_proto;
287         __be16          h_vlan_TCI;
288         __be16          h_vlan_encapsulated_proto;
289 };
290
291 /* IPv4 Header */
292 struct ipv4_hdr {
293         uint8_t  version_ihl;           /**< version and header length */
294         uint8_t  type_of_service;       /**< type of service */
295         uint16_t total_length;          /**< length of packet */
296         uint16_t packet_id;             /**< packet ID */
297         uint16_t fragment_offset;       /**< fragmentation offset */
298         uint8_t  time_to_live;          /**< time to live */
299         uint8_t  next_proto_id;         /**< protocol ID */
300         uint16_t hdr_checksum;          /**< header checksum */
301         uint32_t src_addr;              /**< source address */
302         uint32_t dst_addr;              /**< destination address */
303 } __attribute__((__packed__));
304
305 /* Header lengths. */
306 #define VLAN_HLEN       4
307 #define VLAN_ETH_HLEN   18
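/*
 * On-wire layout matching struct vlan_ethhdr above: 6 bytes dest MAC +
 * 6 bytes source MAC + 2 bytes TPID (0x8100) + 2 bytes TCI + 2 bytes inner
 * ethertype, i.e. VLAN_ETH_HLEN = ETH_HLEN (14) + VLAN_HLEN (4) = 18.
 */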
308
309 /* Per-device statistics struct */
310 struct device_statistics {
311         uint64_t tx_total;
312         rte_atomic64_t rx_total_atomic;
313         uint64_t rx_total;
314         uint64_t tx;
315         rte_atomic64_t rx_atomic;
316         uint64_t rx;
317 } __rte_cache_aligned;
318 struct device_statistics dev_statistics[MAX_DEVICES];
319
320 /*
321  * Builds up the correct configuration for VMDQ VLAN pool map
322  * according to the pool & queue limits.
323  */
324 static inline int
325 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
326 {
327         struct rte_eth_vmdq_rx_conf conf;
328         struct rte_eth_vmdq_rx_conf *def_conf =
329                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
330         unsigned i;
331
332         memset(&conf, 0, sizeof(conf));
333         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
334         conf.nb_pool_maps = num_devices;
335         conf.enable_loop_back = def_conf->enable_loop_back;
336         conf.rx_mode = def_conf->rx_mode;
337
338         for (i = 0; i < conf.nb_pool_maps; i++) {
339                 conf.pool_map[i].vlan_id = vlan_tags[i];
340                 conf.pool_map[i].pools = (1UL << i);
341         }
342
343         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
344         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
345                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
346         return 0;
347 }
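/*
 * The loop above maps vlan_tags[i] (i.e. 1000 + i) to VMDQ pool i, so a
 * frame arriving with a given device's vlan tag is steered into that
 * device's pool.
 */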
348
349 /*
350  * Validate the device number against the max pool number obtained from
351  * dev_info. If the device number is invalid, log an error message and
352  * return -1. Each device must have its own pool.
353  */
354 static inline int
355 validate_num_devices(uint32_t max_nb_devices)
356 {
357         if (num_devices > max_nb_devices) {
358                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
359                 return -1;
360         }
361         return 0;
362 }
363
364 /*
365  * Initialises a given port using global settings, with the RX buffers
366  * coming from the vpool_array mempools.
367  */
368 static inline int
369 port_init(uint8_t port)
370 {
371         struct rte_eth_dev_info dev_info;
372         struct rte_eth_conf port_conf;
373         struct rte_eth_rxconf *rxconf;
374         struct rte_eth_txconf *txconf;
375         int16_t rx_rings, tx_rings;
376         uint16_t rx_ring_size, tx_ring_size;
377         int retval;
378         uint16_t q;
379
380         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
381         rte_eth_dev_info_get(port, &dev_info);
382
383         if (dev_info.max_rx_queues > MAX_QUEUES) {
384                 rte_exit(EXIT_FAILURE,
385                         "please define MAX_QUEUES no less than %u in %s\n",
386                         dev_info.max_rx_queues, __FILE__);
387         }
388
389         rxconf = &dev_info.default_rxconf;
390         txconf = &dev_info.default_txconf;
391         rxconf->rx_drop_en = 1;
392
393         /* Enable vlan offload */
394         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
395
396         /*
397          * Zero copy defers queue RX/TX start to the time when guest
398          * finishes its startup and packet buffers from that guest are
399          * available.
400          */
401         if (zero_copy) {
402                 rxconf->rx_deferred_start = 1;
403                 rxconf->rx_drop_en = 0;
404                 txconf->tx_deferred_start = 1;
405         }
406
407         /* Configure the number of supported virtio devices based on VMDQ limits. */
408         num_devices = dev_info.max_vmdq_pools;
409
410         if (zero_copy) {
411                 rx_ring_size = num_rx_descriptor;
412                 tx_ring_size = num_tx_descriptor;
413                 tx_rings = dev_info.max_tx_queues;
414         } else {
415                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
416                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
417                 tx_rings = (uint16_t)rte_lcore_count();
418         }
419
420         retval = validate_num_devices(MAX_DEVICES);
421         if (retval < 0)
422                 return retval;
423
424         /* Get port configuration. */
425         retval = get_eth_conf(&port_conf, num_devices);
426         if (retval < 0)
427                 return retval;
428         /* NIC queues are divided into pf queues and vmdq queues.  */
429         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
430         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
431         num_vmdq_queues = num_devices * queues_per_pool;
432         num_queues = num_pf_queues + num_vmdq_queues;
433         vmdq_queue_base = dev_info.vmdq_queue_base;
434         vmdq_pool_base  = dev_info.vmdq_pool_base;
435         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
436                 num_pf_queues, num_devices, queues_per_pool);
437
438         if (port >= rte_eth_dev_count()) return -1;
439
440         rx_rings = (uint16_t)dev_info.max_rx_queues;
441         /* Configure ethernet device. */
442         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
443         if (retval != 0)
444                 return retval;
445
446         /* Setup the queues. */
447         for (q = 0; q < rx_rings; q ++) {
448                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
449                                                 rte_eth_dev_socket_id(port),
450                                                 rxconf,
451                                                 vpool_array[q].pool);
452                 if (retval < 0)
453                         return retval;
454         }
455         for (q = 0; q < tx_rings; q ++) {
456                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
457                                                 rte_eth_dev_socket_id(port),
458                                                 txconf);
459                 if (retval < 0)
460                         return retval;
461         }
462
463         /* Start the device. */
464         retval  = rte_eth_dev_start(port);
465         if (retval < 0) {
466                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
467                 return retval;
468         }
469
470         if (promiscuous)
471                 rte_eth_promiscuous_enable(port);
472
473         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
474         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
475         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
476                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
477                         (unsigned)port,
478                         vmdq_ports_eth_addr[port].addr_bytes[0],
479                         vmdq_ports_eth_addr[port].addr_bytes[1],
480                         vmdq_ports_eth_addr[port].addr_bytes[2],
481                         vmdq_ports_eth_addr[port].addr_bytes[3],
482                         vmdq_ports_eth_addr[port].addr_bytes[4],
483                         vmdq_ports_eth_addr[port].addr_bytes[5]);
484
485         return 0;
486 }
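/*
 * A sketch of the expected call pattern (assumed, not taken from this
 * file): iterate the ports selected by the portmask and abort on failure.
 *
 *      for (i = 0; i < num_ports; i++)
 *              if (ports[i] != INVALID_PORT_ID && port_init(ports[i]) != 0)
 *                      rte_exit(EXIT_FAILURE, "Cannot initialize port %u\n",
 *                              ports[i]);
 */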
487
488 /*
489  * Set character device basename.
490  */
491 static int
492 us_vhost_parse_basename(const char *q_arg)
493 {
494         /* parse the basename string */
495
496         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
497                 return -1;
498         else
499                 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
500
501         return 0;
502 }
503
504 /*
505  * Parse the portmask provided at run time.
506  */
507 static int
508 parse_portmask(const char *portmask)
509 {
510         char *end = NULL;
511         unsigned long pm;
512
513         errno = 0;
514
515         /* parse hexadecimal string; return 0 (invalid) on error, since the
516          * caller stores the result in an unsigned mask and compares it to 0 */
517         pm = strtoul(portmask, &end, 16);
518         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
519                 return 0;
520
521         if (pm == 0)
522                 return 0;
523         return pm;
524
525 }
526
527 /*
528  * Parse num options at run time.
529  */
530 static int
531 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
532 {
533         char *end = NULL;
534         unsigned long num;
535
536         errno = 0;
537
538         /* parse unsigned int string */
539         num = strtoul(q_arg, &end, 10);
540         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
541                 return -1;
542
543         if (num > max_valid_value)
544                 return -1;
545
546         return num;
547
548 }
549
550 /*
551  * Display usage
552  */
553 static void
554 us_vhost_usage(const char *prgname)
555 {
556         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
557         "               --vm2vm [0|1|2]\n"
558         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
559         "               --dev-basename <name>\n"
560         "               --zero-copy [0|1] --rx-desc-num [0-N] --tx-desc-num [0-N]\n"
561         "               -p PORTMASK: Set mask for ports to be used by application\n"
562         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
563         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
564         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX, effective only if rx retries are enabled\n"
565         "               --rx-retry-num [0-N]: the number of retries on rx, effective only if rx retries are enabled\n"
566         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
567         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
568         "               --dev-basename: The basename to be used for the character device.\n"
569         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
570                         "zero copy\n"
571         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
572                         "used only when zero copy is enabled.\n"
573         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
574                         "used only when zero copy is enabled.\n",
575                prgname);
576 }
577
578 /*
579  * Parse the arguments given in the command line of the application.
580  */
581 static int
582 us_vhost_parse_args(int argc, char **argv)
583 {
584         int opt, ret;
585         int option_index;
586         unsigned i;
587         const char *prgname = argv[0];
588         static struct option long_option[] = {
589                 {"vm2vm", required_argument, NULL, 0},
590                 {"rx-retry", required_argument, NULL, 0},
591                 {"rx-retry-delay", required_argument, NULL, 0},
592                 {"rx-retry-num", required_argument, NULL, 0},
593                 {"mergeable", required_argument, NULL, 0},
594                 {"stats", required_argument, NULL, 0},
595                 {"dev-basename", required_argument, NULL, 0},
596                 {"zero-copy", required_argument, NULL, 0},
597                 {"rx-desc-num", required_argument, NULL, 0},
598                 {"tx-desc-num", required_argument, NULL, 0},
599                 {NULL, 0, 0, 0},
600         };
601
602         /* Parse command line */
603         while ((opt = getopt_long(argc, argv, "p:P",
604                         long_option, &option_index)) != EOF) {
605                 switch (opt) {
606                 /* Portmask */
607                 case 'p':
608                         enabled_port_mask = parse_portmask(optarg);
609                         if (enabled_port_mask == 0) {
610                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
611                                 us_vhost_usage(prgname);
612                                 return -1;
613                         }
614                         break;
615
616                 case 'P':
617                         promiscuous = 1;
618                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
619                                 ETH_VMDQ_ACCEPT_BROADCAST |
620                                 ETH_VMDQ_ACCEPT_MULTICAST;
621                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
622
623                         break;
624
625                 case 0:
626                         /* Enable/disable vm2vm comms. */
627                         if (!strncmp(long_option[option_index].name, "vm2vm",
628                                 MAX_LONG_OPT_SZ)) {
629                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
630                                 if (ret == -1) {
631                                         RTE_LOG(INFO, VHOST_CONFIG,
632                                                 "Invalid argument for "
633                                                 "vm2vm [0|1|2]\n");
634                                         us_vhost_usage(prgname);
635                                         return -1;
636                                 } else {
637                                         vm2vm_mode = (vm2vm_type)ret;
638                                 }
639                         }
640
641                         /* Enable/disable retries on RX. */
642                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
643                                 ret = parse_num_opt(optarg, 1);
644                                 if (ret == -1) {
645                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
646                                         us_vhost_usage(prgname);
647                                         return -1;
648                                 } else {
649                                         enable_retry = ret;
650                                 }
651                         }
652
653                         /* Specify the retry delay time (in microseconds) on RX. */
654                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
655                                 ret = parse_num_opt(optarg, INT32_MAX);
656                                 if (ret == -1) {
657                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
658                                         us_vhost_usage(prgname);
659                                         return -1;
660                                 } else {
661                                         burst_rx_delay_time = ret;
662                                 }
663                         }
664
665                         /* Specify the retries number on RX. */
666                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
667                                 ret = parse_num_opt(optarg, INT32_MAX);
668                                 if (ret == -1) {
669                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
670                                         us_vhost_usage(prgname);
671                                         return -1;
672                                 } else {
673                                         burst_rx_retry_num = ret;
674                                 }
675                         }
676
677                         /* Enable/disable RX mergeable buffers. */
678                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
679                                 ret = parse_num_opt(optarg, 1);
680                                 if (ret == -1) {
681                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
682                                         us_vhost_usage(prgname);
683                                         return -1;
684                                 } else {
685                                         mergeable = !!ret;
686                                         if (ret) {
687                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
688                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
689                                                         = JUMBO_FRAME_MAX_SIZE;
690                                         }
691                                 }
692                         }
693
694                         /* Enable/disable stats. */
695                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
696                                 ret = parse_num_opt(optarg, INT32_MAX);
697                                 if (ret == -1) {
698                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
699                                         us_vhost_usage(prgname);
700                                         return -1;
701                                 } else {
702                                         enable_stats = ret;
703                                 }
704                         }
705
706                         /* Set character device basename. */
707                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
708                                 if (us_vhost_parse_basename(optarg) == -1) {
709                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
710                                         us_vhost_usage(prgname);
711                                         return -1;
712                                 }
713                         }
714
715                         /* Enable/disable rx/tx zero copy. */
716                         if (!strncmp(long_option[option_index].name,
717                                 "zero-copy", MAX_LONG_OPT_SZ)) {
718                                 ret = parse_num_opt(optarg, 1);
719                                 if (ret == -1) {
720                                         RTE_LOG(INFO, VHOST_CONFIG,
721                                                 "Invalid argument"
722                                                 " for zero-copy [0|1]\n");
723                                         us_vhost_usage(prgname);
724                                         return -1;
725                                 } else
726                                         zero_copy = ret;
727
728                                 if (zero_copy) {
729 #ifdef RTE_MBUF_REFCNT
730                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
731                                         "zero copy vhost APP, please "
732                                         "disable RTE_MBUF_REFCNT\n"
733                                         "in config file and then rebuild DPDK "
734                                         "core lib!\n"
735                                         "Otherwise please disable zero copy "
736                                         "flag in command line!\n");
737                                         return -1;
738 #endif
739                                 }
740                         }
741
742                         /* Specify the descriptor number on RX. */
743                         if (!strncmp(long_option[option_index].name,
744                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
745                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
746                                 if ((ret == -1) || (!POWEROF2(ret))) {
747                                         RTE_LOG(INFO, VHOST_CONFIG,
748                                         "Invalid argument for rx-desc-num[0-N],"
749                                         "power of 2 required.\n");
750                                         us_vhost_usage(prgname);
751                                         return -1;
752                                 } else {
753                                         num_rx_descriptor = ret;
754                                 }
755                         }
756
757                         /* Specify the descriptor number on TX. */
758                         if (!strncmp(long_option[option_index].name,
759                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
760                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
761                                 if ((ret == -1) || (!POWEROF2(ret))) {
762                                         RTE_LOG(INFO, VHOST_CONFIG,
763                                         "Invalid argument for tx-desc-num [0-N],"
764                                         "power of 2 required.\n");
765                                         us_vhost_usage(prgname);
766                                         return -1;
767                                 } else {
768                                         num_tx_descriptor = ret;
769                                 }
770                         }
771
772                         break;
773
774                         /* Invalid option - print options. */
775                 default:
776                         us_vhost_usage(prgname);
777                         return -1;
778                 }
779         }
780
781         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
782                 if (enabled_port_mask & (1 << i))
783                         ports[num_ports++] = (uint8_t)i;
784         }
785
786         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
787                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
788                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
789                 return -1;
790         }
791
792         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
793                 RTE_LOG(INFO, VHOST_PORT,
794                         "Vhost zero copy doesn't support software vm2vm, "
795                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
796                 return -1;
797         }
798
799         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
800                 RTE_LOG(INFO, VHOST_PORT,
801                         "Vhost zero copy doesn't support jumbo frame, "
802                         "please specify '--mergeable 0' to disable the "
803                         "mergeable feature.\n");
804                 return -1;
805         }
806
807         return 0;
808 }
809
810 /*
811  * Update the global var num_ports and the array ports according to the
812  * number of system ports and return the number of valid ports.
813  */
814 static unsigned check_ports_num(unsigned nb_ports)
815 {
816         unsigned valid_num_ports = num_ports;
817         unsigned portid;
818
819         if (num_ports > nb_ports) {
820                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
821                         num_ports, nb_ports);
822                 num_ports = nb_ports;
823         }
824
825         for (portid = 0; portid < num_ports; portid ++) {
826                 if (ports[portid] >= nb_ports) {
827                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
828                                 ports[portid], (nb_ports - 1));
829                         ports[portid] = INVALID_PORT_ID;
830                         valid_num_ports--;
831                 }
832         }
833         return valid_num_ports;
834 }
835
836 /*
837  * Macro to print out packet contents. Wrapped in debug define so that the
838  * data path is not affected when debug is disabled.
839  */
840 #ifdef DEBUG
841 #define PRINT_PACKET(device, addr, size, header) do { \
842         char *pkt_addr = (char *)(addr); \
843         unsigned int index; \
844         char packet[MAX_PRINT_BUFF]; \
845  \
846         if ((header)) \
847                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
848         else \
849                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
850         for (index = 0; index < (size); index++) { \
851                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
852                         "%02hhx ", pkt_addr[index]); \
853         } \
854         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
855  \
856         LOG_DEBUG(VHOST_DATA, "%s", packet); \
857 } while (0)
858 #else
859 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
860 #endif
861
862 /*
863  * Function to convert guest physical addresses to vhost physical addresses.
864  * This is used to convert virtio buffer addresses.
865  */
866 static inline uint64_t __attribute__((always_inline))
867 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
868         uint32_t buf_len, hpa_type *addr_type)
869 {
870         struct virtio_memory_regions_hpa *region;
871         uint32_t regionidx;
872         uint64_t vhost_pa = 0;
873
874         *addr_type = PHYS_ADDR_INVALID;
875
876         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
877                 region = &vdev->regions_hpa[regionidx];
878                 if ((guest_pa >= region->guest_phys_address) &&
879                         (guest_pa <= region->guest_phys_address_end)) {
880                         vhost_pa = region->host_phys_addr_offset + guest_pa;
881                         if (likely((guest_pa + buf_len - 1)
882                                 <= region->guest_phys_address_end))
883                                 *addr_type = PHYS_ADDR_CONTINUOUS;
884                         else
885                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
886                         break;
887                 }
888         }
889
890         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
891                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
892                 (void *)(uintptr_t)vhost_pa);
893
894         return vhost_pa;
895 }
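/*
 * Illustrative numbers only: for a region covering guest physical
 * [0x1000, 0x1fff] with host_phys_addr_offset 0x9000, a call with
 * guest_pa 0x1800 and buf_len 0x100 returns 0xa800 and sets
 * PHYS_ADDR_CONTINUOUS; a buffer running past 0x1fff would instead set
 * PHYS_ADDR_CROSS_SUBREG.
 */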
896
897 /*
898  * Compares a packet destination MAC address to a device MAC address.
899  */
900 static inline int __attribute__((always_inline))
901 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
902 {
903         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
904 }
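/*
 * This loads 8 bytes from each 6-byte address, XORs them, and masks with
 * MAC_ADDR_CMP so (on a little-endian target) the two bytes read past each
 * address are ignored. Safe here because both addresses live inside larger
 * structs, but it assumes unaligned 64-bit loads are acceptable on the CPU.
 */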
905
906 /*
907  * This function learns the MAC address of the device and registers it,
908  * along with a vlan tag, with a VMDQ pool.
909  */
910 static int
911 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
912 {
913         struct ether_hdr *pkt_hdr;
914         struct virtio_net_data_ll *dev_ll;
915         struct virtio_net *dev = vdev->dev;
916         int i, ret;
917
918         /* Learn MAC address of guest device from packet */
919         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
920
921         dev_ll = ll_root_used;
922
923         while (dev_ll != NULL) {
924                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
925                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
926                         return -1;
927                 }
928                 dev_ll = dev_ll->next;
929         }
930
931         for (i = 0; i < ETHER_ADDR_LEN; i++)
932                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
933
934         /* vlan_tag currently uses the device_id. */
935         vdev->vlan_tag = vlan_tags[dev->device_fh];
936
937         /* Print out VMDQ registration info. */
938         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
939                 dev->device_fh,
940                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
941                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
942                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
943                 vdev->vlan_tag);
944
945         /* Register the MAC address. */
946         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
947                                 (uint32_t)dev->device_fh + vmdq_pool_base);
948         if (ret)
949                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
950                                         dev->device_fh);
951
952         /* Enable stripping of the vlan tag as we handle routing. */
953         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
954
955         /* Set device as ready for RX. */
956         vdev->ready = DEVICE_RX;
957
958         return 0;
959 }
960
961 /*
962  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
963  * queue before disabling RX on the device.
964  */
965 static inline void
966 unlink_vmdq(struct vhost_dev *vdev)
967 {
968         unsigned i = 0;
969         unsigned rx_count;
970         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
971
972         if (vdev->ready == DEVICE_RX) {
973                 /*clear MAC and VLAN settings*/
974                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
975                 for (i = 0; i < 6; i++)
976                         vdev->mac_address.addr_bytes[i] = 0;
977
978                 vdev->vlan_tag = 0;
979
980                 /*Clear out the receive buffers*/
981                 rx_count = rte_eth_rx_burst(ports[0],
982                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
983
984                 while (rx_count) {
985                         for (i = 0; i < rx_count; i++)
986                                 rte_pktmbuf_free(pkts_burst[i]);
987
988                         rx_count = rte_eth_rx_burst(ports[0],
989                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
990                 }
991
992                 vdev->ready = DEVICE_MAC_LEARNING;
993         }
994 }
995
996 /*
997  * Check if the packet destination MAC address is for a local device. If so
998  * then put the packet on that device's RX queue. If not then return.
999  */
1000 static inline int __attribute__((always_inline))
1001 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1002 {
1003         struct virtio_net_data_ll *dev_ll;
1004         struct ether_hdr *pkt_hdr;
1005         uint64_t ret = 0;
1006         struct virtio_net *dev = vdev->dev;
1007         struct virtio_net *tdev; /* destination virtio device */
1008
1009         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1010
1011         /*get the used devices list*/
1012         dev_ll = ll_root_used;
1013
1014         while (dev_ll != NULL) {
1015                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1016                                           &dev_ll->vdev->mac_address)) {
1017
1018                         /* Drop the packet if the TX packet is destined for the TX device. */
1019                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1020                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1021                                                         dev->device_fh);
1022                                 return 0;
1023                         }
1024                         tdev = dev_ll->vdev->dev;
1025
1026
1027                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1028
1029                         if (unlikely(dev_ll->vdev->remove)) {
1030                                 /*drop the packet if the device is marked for removal*/
1031                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1032                         } else {
1033                                 /*send the packet to the local virtio device*/
1034                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1035                                 if (enable_stats) {
1036                                         rte_atomic64_add(
1037                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1038                                         1);
1039                                         rte_atomic64_add(
1040                                         &dev_statistics[tdev->device_fh].rx_atomic,
1041                                         ret);
1042                                         dev_statistics[tdev->device_fh].tx_total++;
1043                                         dev_statistics[tdev->device_fh].tx += ret;
1044                                 }
1045                         }
1046
1047                         return 0;
1048                 }
1049                 dev_ll = dev_ll->next;
1050         }
1051
1052         return -1;
1053 }
1054
1055 /*
1056  * Check if the destination MAC of a packet belongs to a local VM, and if
1057  * so get its vlan tag and the length offset to restore.
1058  */
1059 static inline int __attribute__((always_inline))
1060 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1061         uint32_t *offset, uint16_t *vlan_tag)
1062 {
1063         struct virtio_net_data_ll *dev_ll = ll_root_used;
1064         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1065
1066         while (dev_ll != NULL) {
1067                 if ((dev_ll->vdev->ready == DEVICE_RX)
1068                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1069                 &dev_ll->vdev->mac_address)) {
1070                         /*
1071                          * Drop the packet if the TX packet is
1072                          * destined for the TX device.
1073                          */
1074                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1075                                 LOG_DEBUG(VHOST_DATA,
1076                                 "(%"PRIu64") TX: Source and destination"
1077                                 " MAC addresses are the same. Dropping "
1078                                 "packet.\n",
1079                                 dev_ll->vdev->dev->device_fh);
1080                                 return -1;
1081                         }
1082
1083                         /*
1084                          * HW vlan strip will have reduced the packet
1085                          * length by the size of the vlan tag, so restore
1086                          * the packet length by adding it back.
1087                          */
1088                         *offset = VLAN_HLEN;
1089                         *vlan_tag =
1090                         (uint16_t)
1091                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1092
1093                         LOG_DEBUG(VHOST_DATA,
1094                         "(%"PRIu64") TX: pkt to local VM device id:"
1095                         "(%"PRIu64") vlan tag: %d.\n",
1096                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1097                         vlan_tag);
1098                         *vlan_tag);
1099                         break;
1100                 }
1101                 dev_ll = dev_ll->next;
1102         }
1103         return 0;
1104 }
1105
1106 /*
1107  * This function routes the TX packet to the correct interface. This may be a local device
1108  * or the physical port.
1109  */
1110 static inline void __attribute__((always_inline))
1111 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1112 {
1113         struct mbuf_table *tx_q;
1114         struct rte_mbuf **m_table;
1115         unsigned len, ret, offset = 0;
1116         const uint16_t lcore_id = rte_lcore_id();
1117         struct virtio_net *dev = vdev->dev;
1118         struct ether_hdr *nh;
1119
1120         /*check if destination is local VM*/
1121         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1122                 rte_pktmbuf_free(m);
1123                 return;
1124         }
1125
1126         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1127                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1128                         rte_pktmbuf_free(m);
1129                         return;
1130                 }
1131         }
1132
1133         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1134
1135         /*Add packet to the port tx queue*/
1136         tx_q = &lcore_tx_queue[lcore_id];
1137         len = tx_q->len;
1138
1139         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1140         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1141                 /* Guest has inserted the vlan tag. */
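                /*
                 * Avoid inserting the vlan twice: skip the HW-insert path
                 * below (PKT_TX_VLAN_PKT is only set in the else branch) and,
                 * for hardware vm2vm, just rewrite the TCI in place with the
                 * destination VM's tag.
                 */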
1142                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1143                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1144                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1145                         (vh->vlan_tci != vlan_tag_be))
1146                         vh->vlan_tci = vlan_tag_be;
1147         } else {
1148                 m->ol_flags = PKT_TX_VLAN_PKT;
1149
1150                 /*
1151                  * Find the right seg to adjust the data len when offset is
1152                  * bigger than tail room size.
1153                  */
1154                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1155                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1156                                 m->data_len += offset;
1157                         else {
1158                                 struct rte_mbuf *seg = m;
1159
1160                                 while ((seg->next != NULL) &&
1161                                         (offset > rte_pktmbuf_tailroom(seg)))
1162                                         seg = seg->next;
1163
1164                                 seg->data_len += offset;
1165                         }
1166                         m->pkt_len += offset;
1167                 }
1168
1169                 m->vlan_tci = vlan_tag;
1170         }
1171
1172         tx_q->m_table[len] = m;
1173         len++;
1174         if (enable_stats) {
1175                 dev_statistics[dev->device_fh].tx_total++;
1176                 dev_statistics[dev->device_fh].tx++;
1177         }
1178
1179         if (unlikely(len == MAX_PKT_BURST)) {
1180                 m_table = (struct rte_mbuf **)tx_q->m_table;
1181                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1182                 /* Free any buffers not handled by TX and update the port stats. */
1183                 if (unlikely(ret < len)) {
1184                         do {
1185                                 rte_pktmbuf_free(m_table[ret]);
1186                         } while (++ret < len);
1187                 }
1188
1189                 len = 0;
1190         }
1191
1192         tx_q->len = len;
1193         return;
1194 }
1195 /*
1196  * This function is called by each data core. It handles all RX/TX registered with the
1197  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1198  * with all devices in the main linked list.
1199  */
1200 static int
1201 switch_worker(void *arg)
1202 {
1203         struct rte_mempool *mbuf_pool = arg;
1204         struct virtio_net *dev = NULL;
1205         struct vhost_dev *vdev = NULL;
1206         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1207         struct virtio_net_data_ll *dev_ll;
1208         struct mbuf_table *tx_q;
1209         volatile struct lcore_ll_info *lcore_ll;
1210         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1211         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1212         unsigned ret, i;
1213         const uint16_t lcore_id = rte_lcore_id();
1214         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1215         uint16_t rx_count = 0;
1216         uint16_t tx_count;
1217         uint32_t retry = 0;
1218
1219         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1220         lcore_ll = lcore_info[lcore_id].lcore_ll;
1221         prev_tsc = 0;
1222
1223         tx_q = &lcore_tx_queue[lcore_id];
1224         for (i = 0; i < num_cores; i ++) {
1225                 if (lcore_ids[i] == lcore_id) {
1226                         tx_q->txq_id = i;
1227                         break;
1228                 }
1229         }
1230
1231         while(1) {
1232                 cur_tsc = rte_rdtsc();
1233                 /*
1234                  * TX burst queue drain
1235                  */
1236                 diff_tsc = cur_tsc - prev_tsc;
1237                 if (unlikely(diff_tsc > drain_tsc)) {
1238
1239                         if (tx_q->len) {
1240                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1241
1242                                 /*Tx any packets in the queue*/
1243                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1244                                                                            (struct rte_mbuf **)tx_q->m_table,
1245                                                                            (uint16_t)tx_q->len);
1246                                 if (unlikely(ret < tx_q->len)) {
1247                                         do {
1248                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1249                                         } while (++ret < tx_q->len);
1250                                 }
1251
1252                                 tx_q->len = 0;
1253                         }
1254
1255                         prev_tsc = cur_tsc;
1256
1257                 }
1258
1259                 rte_prefetch0(lcore_ll->ll_root_used);
1260                 /*
1261                  * If requested, inform the configuration core that we have exited the
1262                  * linked list and that no devices are in use.
1263                  */
1264                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1265                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1266
1267                 /*
1268                  * Process devices
1269                  */
1270                 dev_ll = lcore_ll->ll_root_used;
1271
1272                 while (dev_ll != NULL) {
1273                         /*get virtio device ID*/
1274                         vdev = dev_ll->vdev;
1275                         dev = vdev->dev;
1276
1277                         if (unlikely(vdev->remove)) {
1278                                 dev_ll = dev_ll->next;
1279                                 unlink_vmdq(vdev);
1280                                 vdev->ready = DEVICE_SAFE_REMOVE;
1281                                 continue;
1282                         }
1283                         if (likely(vdev->ready == DEVICE_RX)) {
1284                                 /*Handle guest RX*/
1285                                 rx_count = rte_eth_rx_burst(ports[0],
1286                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1287
1288                                 if (rx_count) {
1289                                         /*
1290                                         * If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1291                                         * Note that MAX_PKT_BURST must be less than the virtio queue size.
1292                                         */
1293                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1294                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1295                                                         rte_delay_us(burst_rx_delay_time);
1296                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1297                                                                 break;
1298                                                 }
1299                                         }
1300                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1301                                         if (enable_stats) {
1302                                                 rte_atomic64_add(
1303                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1304                                                 rx_count);
1305                                                 rte_atomic64_add(
1306                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1307                                         }
1308                                         while (likely(rx_count)) {
1309                                                 rx_count--;
1310                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1311                                         }
1312
1313                                 }
1314                         }
1315
1316                         if (likely(!vdev->remove)) {
1317                                 /* Handle guest TX*/
1318                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1319                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1320                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1321                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1322                                                 while (tx_count)
1323                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1324                                         }
1325                                 }
1326                                 while (tx_count)
1327                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1328                         }
1329
1330                         /*move to the next device in the list*/
1331                         dev_ll = dev_ll->next;
1332                 }
1333         }
1334
1335         return 0;
1336 }
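
/*
 * A hedged sketch of the TSC-based drain timer used in switch_worker():
 * drain_tsc converts BURST_TX_DRAIN_US into CPU cycles, and the TX queue is
 * flushed once that many cycles have elapsed. The helper name is
 * hypothetical; it is not part of the example proper.
 */
static int __attribute__((unused))
drain_interval_elapsed_sketch(uint64_t *prev_tsc)
{
        const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1)
                / US_PER_S * BURST_TX_DRAIN_US;
        uint64_t cur_tsc = rte_rdtsc();

        if (cur_tsc - *prev_tsc > drain_tsc) {
                *prev_tsc = cur_tsc;
                return 1; /* time to drain the TX queue */
        }
        return 0;
}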
1337
1338 /*
1339  * This function gets the number of available ring entries for zero copy rx.
1340  * Only one thread will call this function for a particular virtio device,
1341  * so it is designed as a non-thread-safe function.
1342  */
1343 static inline uint32_t __attribute__((always_inline))
1344 get_available_ring_num_zcp(struct virtio_net *dev)
1345 {
1346         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1347         uint16_t avail_idx;
1348
1349         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1350         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1351 }
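
/*
 * Note on the arithmetic above: avail->idx and last_used_idx_res are free
 * running uint16_t counters, so their difference is computed modulo 65536
 * and stays correct across wrap-around. A hedged, hypothetical sketch:
 */
static uint16_t __attribute__((unused))
ring_entries_sketch(uint16_t avail_idx, uint16_t used_idx)
{
        /* e.g. avail_idx == 2, used_idx == 65534 yields 4 pending entries */
        return (uint16_t)(avail_idx - used_idx);
}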
1352
1353 /*
1354  * This function gets available ring indexes for zero copy rx;
1355  * it retries 'burst_rx_retry_num' times until it gets enough ring entries.
1356  * Only one thread will call this function for a particular virtio device,
1357  * so it is designed as a non-thread-safe function.
1358  */
1359 static inline uint32_t __attribute__((always_inline))
1360 get_available_ring_index_zcp(struct virtio_net *dev,
1361         uint16_t *res_base_idx, uint32_t count)
1362 {
1363         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1364         uint16_t avail_idx;
1365         uint32_t retry = 0;
1366         uint16_t free_entries;
1367
1368         *res_base_idx = vq->last_used_idx_res;
1369         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1370         free_entries = (avail_idx - *res_base_idx);
1371
1372         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1373                         "avail idx: %d, "
1374                         "res base idx:%d, free entries:%d\n",
1375                         dev->device_fh, avail_idx, *res_base_idx,
1376                         free_entries);
1377
1378         /*
1379          * If retry is enabled and the queue is full then we wait
1380          * and retry to avoid packet loss.
1381          */
1382         if (enable_retry && unlikely(count > free_entries)) {
1383                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1384                         rte_delay_us(burst_rx_delay_time);
1385                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1386                         free_entries = (avail_idx - *res_base_idx);
1387                         if (count <= free_entries)
1388                                 break;
1389                 }
1390         }
1391
1392         /*check that we have enough buffers*/
1393         if (unlikely(count > free_entries))
1394                 count = free_entries;
1395
1396         if (unlikely(count == 0)) {
1397                 LOG_DEBUG(VHOST_DATA,
1398                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1399                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1400                         dev->device_fh, avail_idx,
1401                         *res_base_idx, free_entries);
1402                 return 0;
1403         }
1404
1405         vq->last_used_idx_res = *res_base_idx + count;
1406
1407         return count;
1408 }
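
/*
 * An illustrative, hypothetical caller of get_available_ring_index_zcp():
 * reserve up to 'want' entries, then walk the available ring from the
 * returned base index, masking with (size - 1) since the ring size is a
 * power of two.
 */
static void __attribute__((unused))
reserve_and_walk_sketch(struct virtio_net *dev, uint32_t want)
{
        struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
        uint16_t base;
        uint32_t got = get_available_ring_index_zcp(dev, &base, want);
        uint32_t i;

        for (i = 0; i < got; i++) {
                uint16_t desc_idx =
                        vq->avail->ring[(base + i) & (vq->size - 1)];
                (void)desc_idx; /* a real caller would process this descriptor */
        }
}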
1409
1410 /*
1411  * This function puts a descriptor back on the used list.
1412  */
1413 static inline void __attribute__((always_inline))
1414 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1415 {
1416         uint16_t res_cur_idx = vq->last_used_idx;
1417         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1418         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1419         rte_compiler_barrier();
1420         *(volatile uint16_t *)&vq->used->idx += 1;
1421         vq->last_used_idx += 1;
1422
1423         /* Kick the guest if necessary. */
1424         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1425                 eventfd_write((int)vq->kickfd, 1);
1426 }
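
/*
 * The kick above is suppressed when the guest sets VRING_AVAIL_F_NO_INTERRUPT,
 * saving one eventfd write per update. A hedged sketch of the same publish
 * pattern for a batch of 'n' already-filled used entries:
 */
static void __attribute__((unused))
publish_used_and_kick_sketch(struct vhost_virtqueue *vq, uint16_t n)
{
        /* Make the used ring entries visible before exposing the index. */
        rte_compiler_barrier();
        *(volatile uint16_t *)&vq->used->idx += n;

        if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
                eventfd_write((int)vq->kickfd, 1);
}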
1427
1428 /*
1429  * This function gets an available descriptor from the virtio vring and an
1430  * un-attached mbuf from vpool->ring, then attaches them together. It must
1431  * adjust the offsets of buff_addr and phys_addr according to the PMD
1432  * implementation, otherwise the frame data may land at the wrong mbuf offset.
1433  */
1434 static inline void __attribute__((always_inline))
1435 attach_rxmbuf_zcp(struct virtio_net *dev)
1436 {
1437         uint16_t res_base_idx, desc_idx;
1438         uint64_t buff_addr, phys_addr;
1439         struct vhost_virtqueue *vq;
1440         struct vring_desc *desc;
1441         struct rte_mbuf *mbuf = NULL;
1442         struct vpool *vpool;
1443         hpa_type addr_type;
1444         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1445
1446         vpool = &vpool_array[vdev->vmdq_rx_q];
1447         vq = dev->virtqueue[VIRTIO_RXQ];
1448
1449         do {
1450                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1451                                 1) != 1))
1452                         return;
1453                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1454
1455                 desc = &vq->desc[desc_idx];
1456                 if (desc->flags & VRING_DESC_F_NEXT) {
1457                         desc = &vq->desc[desc->next];
1458                         buff_addr = gpa_to_vva(dev, desc->addr);
1459                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1460                                         &addr_type);
1461                 } else {
1462                         buff_addr = gpa_to_vva(dev,
1463                                         desc->addr + vq->vhost_hlen);
1464                         phys_addr = gpa_to_hpa(vdev,
1465                                         desc->addr + vq->vhost_hlen,
1466                                         desc->len, &addr_type);
1467                 }
1468
1469                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1470                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1471                                 " address found when attaching RX frame buffer"
1472                                 " address!\n", dev->device_fh);
1473                         put_desc_to_used_list_zcp(vq, desc_idx);
1474                         continue;
1475                 }
1476
1477                 /*
1478                  * Check if the frame buffer address from guest crosses
1479                  * sub-region or not.
1480                  */
1481                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1482                         RTE_LOG(ERR, VHOST_DATA,
1483                                 "(%"PRIu64") Frame buffer address crossing "
1484                                 "sub-region found when attaching RX frame "
1485                                 "buffer address!\n",
1486                                 dev->device_fh);
1487                         put_desc_to_used_list_zcp(vq, desc_idx);
1488                         continue;
1489                 }
1490         } while (unlikely(phys_addr == 0));
1491
1492         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1493         if (unlikely(mbuf == NULL)) {
1494                 LOG_DEBUG(VHOST_DATA,
1495                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1496                         "ring_sc_dequeue fail.\n",
1497                         dev->device_fh);
1498                 put_desc_to_used_list_zcp(vq, desc_idx);
1499                 return;
1500         }
1501
1502         if (unlikely(vpool->buf_size > desc->len)) {
1503                 LOG_DEBUG(VHOST_DATA,
1504                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1505                         "length(%d) of descriptor idx: %d less than room "
1506                         "size required: %d\n",
1507                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1508                 put_desc_to_used_list_zcp(vq, desc_idx);
1509                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1510                 return;
1511         }
1512
1513         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1514         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1515         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1516         mbuf->data_len = desc->len;
1517         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1518
1519         LOG_DEBUG(VHOST_DATA,
1520                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1521                 "descriptor idx:%d\n",
1522                 dev->device_fh, res_base_idx, desc_idx);
1523
1524         __rte_mbuf_raw_free(mbuf);
1525
1526         return;
1527 }
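
/*
 * The attach step above, condensed into a hedged sketch. It assumes
 * buff_addr/phys_addr already point at a validated guest buffer: the mbuf's
 * buffer pointers are redirected at guest memory and the descriptor index is
 * stashed in the headroom so it can later be returned to the used ring.
 */
static void __attribute__((unused))
attach_guest_buf_sketch(struct rte_mbuf *mbuf, uint64_t buff_addr,
        uint64_t phys_addr, uint16_t len, uint16_t desc_idx)
{
        mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
        mbuf->data_off = RTE_PKTMBUF_HEADROOM;
        mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
        mbuf->data_len = len;
        MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
}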
1528
1529 /*
1530  * Detach an attached packet mbuf -
1531  *  - restore original mbuf address and length values.
1532  *  - reset pktmbuf data and data_len to their default values.
1533  *  All other fields of the given packet mbuf will be left intact.
1534  *
1535  * @param m
1536  *   The attached packet mbuf.
1537  */
1538 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1539 {
1540         const struct rte_mempool *mp = m->pool;
1541         void *buf = RTE_MBUF_TO_BADDR(m);
1542         uint32_t buf_ofs;
1543         uint32_t buf_len = mp->elt_size - sizeof(*m);
1544         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1545
1546         m->buf_addr = buf;
1547         m->buf_len = (uint16_t)buf_len;
1548
1549         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1550                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1551         m->data_off = buf_ofs;
1552
1553         m->data_len = 0;
1554 }
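
/*
 * Typical recycle path for the detach helper above (a hedged sketch; the
 * helper name is hypothetical): once a zero copy mbuf has been transmitted,
 * detach it from the guest buffer and return it to the per-queue ring.
 */
static void __attribute__((unused))
recycle_zcp_mbuf_sketch(struct vpool *vpool, struct rte_mbuf *m)
{
        if (RTE_MBUF_INDIRECT(m))
                pktmbuf_detach_zcp(m); /* restore the mbuf's own buffer */
        rte_ring_sp_enqueue(vpool->ring, (void *)m);
}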
1555
1556 /*
1557  * This function is called after packets have been transmitted. It fetches
1558  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1559  * also updates the used index and kicks the guest if necessary.
1560  */
1561 static inline uint32_t __attribute__((always_inline))
1562 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1563 {
1564         struct rte_mbuf *mbuf;
1565         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1566         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1567         uint32_t index = 0;
1568         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1569
1570         LOG_DEBUG(VHOST_DATA,
1571                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1572                 "clean is: %d\n",
1573                 dev->device_fh, mbuf_count);
1574         LOG_DEBUG(VHOST_DATA,
1575                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1576                 "clean  is : %d\n",
1577                 dev->device_fh, rte_ring_count(vpool->ring));
1578
1579         for (index = 0; index < mbuf_count; index++) {
1580                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1581                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1582                         pktmbuf_detach_zcp(mbuf);
1583                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1584
1585                 /* Update used index buffer information. */
1586                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1587                 vq->used->ring[used_idx].len = 0;
1588
1589                 used_idx = (used_idx + 1) & (vq->size - 1);
1590         }
1591
1592         LOG_DEBUG(VHOST_DATA,
1593                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1594                 "clean is: %d\n",
1595                 dev->device_fh, rte_mempool_count(vpool->pool));
1596         LOG_DEBUG(VHOST_DATA,
1597                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1598                 "clean  is : %d\n",
1599                 dev->device_fh, rte_ring_count(vpool->ring));
1600         LOG_DEBUG(VHOST_DATA,
1601                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1602                 "vq->last_used_idx:%d\n",
1603                 dev->device_fh, vq->last_used_idx);
1604
1605         vq->last_used_idx += mbuf_count;
1606
1607         LOG_DEBUG(VHOST_DATA,
1608                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1609                 "vq->last_used_idx:%d\n",
1610                 dev->device_fh, vq->last_used_idx);
1611
1612         rte_compiler_barrier();
1613
1614         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1615
1616         /* Kick guest if required. */
1617         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1618                 eventfd_write((int)vq->kickfd, 1);
1619
1620         return 0;
1621 }
1622
1623 /*
1624  * This function is called when a virtio device is destroyed.
1625  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1626  */
1627 static void mbuf_destroy_zcp(struct vpool *vpool)
1628 {
1629         struct rte_mbuf *mbuf = NULL;
1630         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1631
1632         LOG_DEBUG(VHOST_CONFIG,
1633                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1634                 "mbuf_destroy_zcp is: %d\n",
1635                 mbuf_count);
1636         LOG_DEBUG(VHOST_CONFIG,
1637                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1638                 "mbuf_destroy_zcp  is : %d\n",
1639                 rte_ring_count(vpool->ring));
1640
1641         for (index = 0; index < mbuf_count; index++) {
1642                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1643                 if (likely(mbuf != NULL)) {
1644                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1645                                 pktmbuf_detach_zcp(mbuf);
1646                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1647                 }
1648         }
1649
1650         LOG_DEBUG(VHOST_CONFIG,
1651                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1652                 "mbuf_destroy_zcp is: %d\n",
1653                 rte_mempool_count(vpool->pool));
1654         LOG_DEBUG(VHOST_CONFIG,
1655                 "in mbuf_destroy_zcp: mbuf count in ring after "
1656                 "mbuf_destroy_zcp is : %d\n",
1657                 rte_ring_count(vpool->ring));
1658 }
1659
1660 /*
1661  * This function updates the used ring and writes the virtio header for each received packet (zero copy RX).
1662  */
1663 static inline uint32_t __attribute__((always_inline))
1664 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1665         uint32_t count)
1666 {
1667         struct vhost_virtqueue *vq;
1668         struct vring_desc *desc;
1669         struct rte_mbuf *buff;
1670         /* The virtio_hdr is initialised to 0. */
1671         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1672                 = {{0, 0, 0, 0, 0, 0}, 0};
1673         uint64_t buff_hdr_addr = 0;
1674         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1675         uint32_t head_idx, packet_success = 0;
1676         uint16_t res_cur_idx;
1677
1678         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1679
1680         if (count == 0)
1681                 return 0;
1682
1683         vq = dev->virtqueue[VIRTIO_RXQ];
1684         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1685
1686         res_cur_idx = vq->last_used_idx;
1687         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1688                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1689
1690         /* Retrieve all of the head indexes first to avoid caching issues. */
1691         for (head_idx = 0; head_idx < count; head_idx++)
1692                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1693
1694         /*Prefetch descriptor index. */
1695         rte_prefetch0(&vq->desc[head[packet_success]]);
1696
1697         while (packet_success != count) {
1698                 /* Get descriptor from available ring */
1699                 desc = &vq->desc[head[packet_success]];
1700
1701                 buff = pkts[packet_success];
1702                 LOG_DEBUG(VHOST_DATA,
1703                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1704                         "pkt[%d] descriptor idx: %d\n",
1705                         dev->device_fh, packet_success,
1706                         MBUF_HEADROOM_UINT32(buff));
1707
1708                 PRINT_PACKET(dev,
1709                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1710                         + RTE_PKTMBUF_HEADROOM),
1711                         rte_pktmbuf_data_len(buff), 0);
1712
1713                 /* Buffer address translation for virtio header. */
1714                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1715                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1716
1717                 /*
1718                  * If the descriptors are chained the header and data are
1719                  * placed in separate buffers.
1720                  */
1721                 if (desc->flags & VRING_DESC_F_NEXT) {
1722                         desc->len = vq->vhost_hlen;
1723                         desc = &vq->desc[desc->next];
1724                         desc->len = rte_pktmbuf_data_len(buff);
1725                 } else {
1726                         desc->len = packet_len;
1727                 }
1728
1729                 /* Update used ring with desc information */
1730                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1731                         = head[packet_success];
1732                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1733                         = packet_len;
1734                 res_cur_idx++;
1735                 packet_success++;
1736
1737                 /* A header is required per buffer. */
1738                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1739                         (const void *)&virtio_hdr, vq->vhost_hlen);
1740
1741                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1742
1743                 if (likely(packet_success < count)) {
1744                         /* Prefetch descriptor index. */
1745                         rte_prefetch0(&vq->desc[head[packet_success]]);
1746                 }
1747         }
1748
1749         rte_compiler_barrier();
1750
1751         LOG_DEBUG(VHOST_DATA,
1752                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1753                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1754                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1755
1756         *(volatile uint16_t *)&vq->used->idx += count;
1757         vq->last_used_idx += count;
1758
1759         LOG_DEBUG(VHOST_DATA,
1760                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1761                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1762                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1763
1764         /* Kick the guest if necessary. */
1765         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1766                 eventfd_write((int)vq->kickfd, 1);
1767
1768         return count;
1769 }
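
/*
 * The used ring updates above rely on vq->size being a power of two, so
 * "idx & (vq->size - 1)" is equivalent to "idx % vq->size" but cheaper.
 * A hedged one-liner illustrating the equivalence:
 */
static uint32_t __attribute__((unused))
ring_slot_sketch(uint32_t idx, uint32_t size)
{
        /* valid only when size is a power of two, as virtio requires */
        return idx & (size - 1);
}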
1770
1771 /*
1772  * This function routes the TX packet to the correct interface.
1773  * This may be a local device or the physical port.
1774  */
1775 static inline void __attribute__((always_inline))
1776 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1777         uint32_t desc_idx, uint8_t need_copy)
1778 {
1779         struct mbuf_table *tx_q;
1780         struct rte_mbuf **m_table;
1781         struct rte_mbuf *mbuf = NULL;
1782         unsigned len, ret, offset = 0;
1783         struct vpool *vpool;
1784         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1785         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1786
1787         /*Add packet to the port tx queue*/
1788         tx_q = &tx_queue_zcp[vmdq_rx_q];
1789         len = tx_q->len;
1790
1791         /* Allocate an mbuf and populate the structure. */
1792         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1793         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1794         if (unlikely(mbuf == NULL)) {
1795                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1796                 RTE_LOG(ERR, VHOST_DATA,
1797                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1798                         dev->device_fh);
1799                 put_desc_to_used_list_zcp(vq, desc_idx);
1800                 return;
1801         }
1802
1803         if (vm2vm_mode == VM2VM_HARDWARE) {
1804                 /* Avoid using a vlan tag from any vm for an external pkt,
1805                  * such as vlan_tags[dev->device_fh]; otherwise it conflicts
1806                  * during pool selection: the MAC address marks it as an
1807                  * external pkt that should go to the network, while the vlan
1808                  * tag marks it as a vm2vm pkt to be forwarded to another vm.
1809                  * The hardware cannot resolve this ambiguity, so the pkt is lost.
1810                  */
1811                 vlan_tag = external_pkt_default_vlan_tag;
1812                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1813                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1814                         __rte_mbuf_raw_free(mbuf);
1815                         return;
1816                 }
1817         }
1818
1819         mbuf->nb_segs = m->nb_segs;
1820         mbuf->next = m->next;
1821         mbuf->data_len = m->data_len + offset;
1822         mbuf->pkt_len = mbuf->data_len;
1823         if (unlikely(need_copy)) {
1824                 /* Copy the packet contents to the mbuf. */
1825                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1826                         rte_pktmbuf_mtod(m, void *),
1827                         m->data_len);
1828         } else {
1829                 mbuf->data_off = m->data_off;
1830                 mbuf->buf_physaddr = m->buf_physaddr;
1831                 mbuf->buf_addr = m->buf_addr;
1832         }
1833         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1834         mbuf->vlan_tci = vlan_tag;
1835         mbuf->l2_len = sizeof(struct ether_hdr);
1836         mbuf->l3_len = sizeof(struct ipv4_hdr);
1837         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1838
1839         tx_q->m_table[len] = mbuf;
1840         len++;
1841
1842         LOG_DEBUG(VHOST_DATA,
1843                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1844                 dev->device_fh,
1845                 mbuf->nb_segs,
1846                 (mbuf->next == NULL) ? "null" : "non-null");
1847
1848         if (enable_stats) {
1849                 dev_statistics[dev->device_fh].tx_total++;
1850                 dev_statistics[dev->device_fh].tx++;
1851         }
1852
1853         if (unlikely(len == MAX_PKT_BURST)) {
1854                 m_table = (struct rte_mbuf **)tx_q->m_table;
1855                 ret = rte_eth_tx_burst(ports[0],
1856                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1857
1858                 /*
1859                  * Free any buffers not handled by TX and update
1860                  * the port stats.
1861                  */
1862                 if (unlikely(ret < len)) {
1863                         do {
1864                                 rte_pktmbuf_free(m_table[ret]);
1865                         } while (++ret < len);
1866                 }
1867
1868                 len = 0;
1869                 txmbuf_clean_zcp(dev, vpool);
1870         }
1871
1872         tx_q->len = len;
1873
1874         return;
1875 }
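
/*
 * The routing above requests hardware VLAN insertion rather than rewriting
 * the frame: setting PKT_TX_VLAN_PKT in ol_flags together with vlan_tci asks
 * the NIC to insert the 802.1Q tag on transmit. A hedged sketch:
 */
static void __attribute__((unused))
request_vlan_insert_sketch(struct rte_mbuf *m, uint16_t vlan_tag)
{
        m->ol_flags = PKT_TX_VLAN_PKT; /* NIC inserts the tag on TX */
        m->vlan_tci = vlan_tag;
}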
1876
1877 /*
1878  * This function transmits all available packets in the virtio TX queue for
1879  * one virtio-net device. If it is the first packet, it learns the MAC
1880  * address and sets up VMDQ.
1881  */
1882 static inline void __attribute__((always_inline))
1883 virtio_dev_tx_zcp(struct virtio_net *dev)
1884 {
1885         struct rte_mbuf m;
1886         struct vhost_virtqueue *vq;
1887         struct vring_desc *desc;
1888         uint64_t buff_addr = 0, phys_addr;
1889         uint32_t head[MAX_PKT_BURST];
1890         uint32_t i;
1891         uint16_t free_entries, packet_success = 0;
1892         uint16_t avail_idx;
1893         uint8_t need_copy = 0;
1894         hpa_type addr_type;
1895         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1896
1897         vq = dev->virtqueue[VIRTIO_TXQ];
1898         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1899
1900         /* If there are no available buffers then return. */
1901         if (vq->last_used_idx_res == avail_idx)
1902                 return;
1903
1904         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1905
1906         /* Prefetch available ring to retrieve head indexes. */
1907         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1908
1909         /* Get the number of free entries in the ring */
1910         free_entries = (avail_idx - vq->last_used_idx_res);
1911
1912         /* Limit to MAX_PKT_BURST. */
1913         free_entries
1914                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1915
1916         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1917                 dev->device_fh, free_entries);
1918
1919         /* Retrieve all of the head indexes first to avoid caching issues. */
1920         for (i = 0; i < free_entries; i++)
1921                 head[i]
1922                         = vq->avail->ring[(vq->last_used_idx_res + i)
1923                         & (vq->size - 1)];
1924
1925         vq->last_used_idx_res += free_entries;
1926
1927         /* Prefetch descriptor index. */
1928         rte_prefetch0(&vq->desc[head[packet_success]]);
1929         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1930
1931         while (packet_success < free_entries) {
1932                 desc = &vq->desc[head[packet_success]];
1933
1934                 /* Discard first buffer as it is the virtio header */
1935                 desc = &vq->desc[desc->next];
1936
1937                 /* Buffer address translation. */
1938                 buff_addr = gpa_to_vva(dev, desc->addr);
1939                 /* Need check extra VLAN_HLEN size for inserting VLAN tag */
1940                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1941                         &addr_type);
1942
1943                 if (likely(packet_success < (free_entries - 1)))
1944                         /* Prefetch descriptor index. */
1945                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1946
1947                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1948                         RTE_LOG(ERR, VHOST_DATA,
1949                                 "(%"PRIu64") Invalid frame buffer address found "
1950                                 "when transmitting packets!\n",
1951                                 dev->device_fh);
1952                         packet_success++;
1953                         continue;
1954                 }
1955
1956                 /* Prefetch buffer address. */
1957                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1958
1959                 /*
1960                  * Setup dummy mbuf. This is copied to a real mbuf if
1961                  * transmitted out the physical port.
1962                  */
1963                 m.data_len = desc->len;
1964                 m.nb_segs = 1;
1965                 m.next = NULL;
1966                 m.data_off = 0;
1967                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1968                 m.buf_physaddr = phys_addr;
1969
1970                 /*
1971                  * Check if the frame buffer address from guest crosses
1972                  * sub-region or not.
1973                  */
1974                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1975                         RTE_LOG(ERR, VHOST_DATA,
1976                                 "(%"PRIu64") Frame buffer address crossing "
1977                                 "sub-region found when attaching TX frame "
1978                                 "buffer address!\n",
1979                                 dev->device_fh);
1980                         need_copy = 1;
1981                 } else
1982                         need_copy = 0;
1983
1984                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1985
1986                 /*
1987                  * If this is the first received packet we need to learn
1988                  * the MAC and setup VMDQ
1989                  */
1990                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1991                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1992                                 /*
1993                                  * Discard frame if device is scheduled for
1994                                  * removal or a duplicate MAC address is found.
1995                                  */
1996                                 packet_success += free_entries;
1997                                 vq->last_used_idx += packet_success;
1998                                 break;
1999                         }
2000                 }
2001
2002                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2003                 packet_success++;
2004         }
2005 }
2006
2007 /*
2008  * This function is called by each data core. It handles all RX/TX registered
2009  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2010  * addresses are compared with all devices in the main linked list.
2011  */
2012 static int
2013 switch_worker_zcp(__attribute__((unused)) void *arg)
2014 {
2015         struct virtio_net *dev = NULL;
2016         struct vhost_dev  *vdev = NULL;
2017         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2018         struct virtio_net_data_ll *dev_ll;
2019         struct mbuf_table *tx_q;
2020         volatile struct lcore_ll_info *lcore_ll;
2021         const uint64_t drain_tsc
2022                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2023                 * BURST_TX_DRAIN_US;
2024         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2025         unsigned ret;
2026         const uint16_t lcore_id = rte_lcore_id();
2027         uint16_t count_in_ring, rx_count = 0;
2028
2029         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2030
2031         lcore_ll = lcore_info[lcore_id].lcore_ll;
2032         prev_tsc = 0;
2033
2034         while (1) {
2035                 cur_tsc = rte_rdtsc();
2036
2037                 /* TX burst queue drain */
2038                 diff_tsc = cur_tsc - prev_tsc;
2039                 if (unlikely(diff_tsc > drain_tsc)) {
2040                         /*
2041                          * Get mbuf from vpool.pool and detach mbuf and
2042                          * put back into vpool.ring.
2043                          */
2044                         dev_ll = lcore_ll->ll_root_used;
2045                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2046                                 /* Get virtio device ID */
2047                                 vdev = dev_ll->vdev;
2048                                 dev = vdev->dev;
2049
2050                                 if (likely(!vdev->remove)) {
2051                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2052                                         if (tx_q->len) {
2053                                                 LOG_DEBUG(VHOST_DATA,
2054                                                 "TX queue drained after timeout"
2055                                                 " with burst size %u\n",
2056                                                 tx_q->len);
2057
2058                                                 /*
2059                                                  * Tx any packets in the queue
2060                                                  */
2061                                                 ret = rte_eth_tx_burst(
2062                                                         ports[0],
2063                                                         (uint16_t)tx_q->txq_id,
2064                                                         (struct rte_mbuf **)
2065                                                         tx_q->m_table,
2066                                                         (uint16_t)tx_q->len);
2067                                                 if (unlikely(ret < tx_q->len)) {
2068                                                         do {
2069                                                                 rte_pktmbuf_free(
2070                                                                         tx_q->m_table[ret]);
2071                                                         } while (++ret < tx_q->len);
2072                                                 }
2073                                                 tx_q->len = 0;
2074
2075                                                 txmbuf_clean_zcp(dev,
2076                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2077                                         }
2078                                 }
2079                                 dev_ll = dev_ll->next;
2080                         }
2081                         prev_tsc = cur_tsc;
2082                 }
2083
2084                 rte_prefetch0(lcore_ll->ll_root_used);
2085
2086                 /*
2087                  * If requested, inform the configuration core that we have exited the
2088                  * linked list and that no devices are in use.
2089                  */
2090                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2091                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2092
2093                 /* Process devices */
2094                 dev_ll = lcore_ll->ll_root_used;
2095
2096                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2097                         vdev = dev_ll->vdev;
2098                         dev  = vdev->dev;
2099                         if (unlikely(vdev->remove)) {
2100                                 dev_ll = dev_ll->next;
2101                                 unlink_vmdq(vdev);
2102                                 vdev->ready = DEVICE_SAFE_REMOVE;
2103                                 continue;
2104                         }
2105
2106                         if (likely(vdev->ready == DEVICE_RX)) {
2107                                 uint32_t index = vdev->vmdq_rx_q;
2108                                 uint16_t i;
2109                                 count_in_ring =
2110                                         rte_ring_count(vpool_array[index].ring);
2111                                 uint16_t free_entries = (uint16_t)
2112                                         get_available_ring_num_zcp(dev);
2113
2114                                 /*
2115                                  * Attach all mbufs in vpool.ring and put back
2116                                  * into vpool.pool.
2117                                  */
2118                                 for (i = 0;
2119                                         i < RTE_MIN(free_entries,
2120                                         RTE_MIN(count_in_ring, MAX_PKT_BURST));
2121                                         i++)
2122                                         attach_rxmbuf_zcp(dev);
2123
2124                                 /* Handle guest RX */
2125                                 rx_count = rte_eth_rx_burst(ports[0],
2126                                         vdev->vmdq_rx_q, pkts_burst,
2127                                         MAX_PKT_BURST);
2128
2129                                 if (rx_count) {
2130                                         ret_count = virtio_dev_rx_zcp(dev,
2131                                                         pkts_burst, rx_count);
2132                                         if (enable_stats) {
2133                                                 dev_statistics[dev->device_fh].rx_total
2134                                                         += rx_count;
2135                                                 dev_statistics[dev->device_fh].rx
2136                                                         += ret_count;
2137                                         }
2138                                         while (likely(rx_count)) {
2139                                                 rx_count--;
2140                                                 pktmbuf_detach_zcp(
2141                                                         pkts_burst[rx_count]);
2142                                                 rte_ring_sp_enqueue(
2143                                                         vpool_array[index].ring,
2144                                                         (void *)pkts_burst[rx_count]);
2145                                         }
2146                                 }
2147                         }
2148
2149                         if (likely(!vdev->remove))
2150                                 /* Handle guest TX */
2151                                 virtio_dev_tx_zcp(dev);
2152
2153                         /* Move to the next device in the list */
2154                         dev_ll = dev_ll->next;
2155                 }
2156         }
2157
2158         return 0;
2159 }
2160
2161
2162 /*
2163  * Add an entry to a used linked list. A free entry must first be found
2164  * in the free linked list using get_data_ll_free_entry();
2165  */
2166 static void
2167 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2168         struct virtio_net_data_ll *ll_dev)
2169 {
2170         struct virtio_net_data_ll *ll = *ll_root_addr;
2171
2172         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2173         ll_dev->next = NULL;
2174         rte_compiler_barrier();
2175
2176         /* If ll == NULL then this is the first device. */
2177         if (ll) {
2178                 /* Increment to the tail of the linked list. */
2179                 while (ll->next != NULL)
2180                         ll = ll->next;
2181
2182                 ll->next = ll_dev;
2183         } else {
2184                 *ll_root_addr = ll_dev;
2185         }
2186 }
2187
2188 /*
2189  * Remove an entry from a used linked list. The entry must then be added to
2190  * the free linked list using put_data_ll_free_entry().
2191  */
2192 static void
2193 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2194         struct virtio_net_data_ll *ll_dev,
2195         struct virtio_net_data_ll *ll_dev_last)
2196 {
2197         struct virtio_net_data_ll *ll = *ll_root_addr;
2198
2199         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2200                 return;
2201
2202         if (ll_dev == ll)
2203                 *ll_root_addr = ll_dev->next;
2204         else
2205                 if (likely(ll_dev_last != NULL))
2206                         ll_dev_last->next = ll_dev->next;
2207                 else
2208                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2209 }
2210
2211 /*
2212  * Find and return an entry from the free linked list.
2213  */
2214 static struct virtio_net_data_ll *
2215 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2216 {
2217         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2218         struct virtio_net_data_ll *ll_dev;
2219
2220         if (ll_free == NULL)
2221                 return NULL;
2222
2223         ll_dev = ll_free;
2224         *ll_root_addr = ll_free->next;
2225
2226         return ll_dev;
2227 }
2228
2229 /*
2230  * Place an entry back on to the free linked list.
2231  */
2232 static void
2233 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2234         struct virtio_net_data_ll *ll_dev)
2235 {
2236         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2237
2238         if (ll_dev == NULL)
2239                 return;
2240
2241         ll_dev->next = ll_free;
2242         *ll_root_addr = ll_dev;
2243 }
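
/*
 * How the four list helpers above compose (a hedged sketch; names here are
 * hypothetical): take an entry from a free list, fill it, and publish it on
 * a used list. Removal reverses the steps via rm_data_ll_entry() and
 * put_data_ll_free_entry().
 */
static void __attribute__((unused))
ll_move_entry_sketch(struct virtio_net_data_ll **free_root,
        struct virtio_net_data_ll **used_root, struct vhost_dev *vdev)
{
        struct virtio_net_data_ll *entry = get_data_ll_free_entry(free_root);

        if (entry == NULL)
                return; /* no free entries left */

        entry->vdev = vdev;
        add_data_ll_entry(used_root, entry);
}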
2244
2245 /*
2246  * Creates a linked list of a given size.
2247  */
2248 static struct virtio_net_data_ll *
2249 alloc_data_ll(uint32_t size)
2250 {
2251         struct virtio_net_data_ll *ll_new;
2252         uint32_t i;
2253
2254         /* Malloc and then chain the linked list. */
2255         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2256         if (ll_new == NULL) {
2257                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2258                 return NULL;
2259         }
2260
2261         for (i = 0; i < size - 1; i++) {
2262                 ll_new[i].vdev = NULL;
2263                 ll_new[i].next = &ll_new[i+1];
2264         }
2265         ll_new[i].next = NULL;
2266
2267         return (ll_new);
2268 }
2269
2270 /*
2271  * Create the main linked list along with each individual core's linked list. A used and a free list
2272  * are created to manage entries.
2273  */
2274 static int
2275 init_data_ll (void)
2276 {
2277         int lcore;
2278
2279         RTE_LCORE_FOREACH_SLAVE(lcore) {
2280                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2281                 if (lcore_info[lcore].lcore_ll == NULL) {
2282                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2283                         return -1;
2284                 }
2285
2286                 lcore_info[lcore].lcore_ll->device_num = 0;
2287                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2288                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2289                 if (num_devices % num_switching_cores)
2290                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2291                 else
2292                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2293         }
2294
2295         /* Allocate devices up to a maximum of MAX_DEVICES. */
2296         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2297
2298         return 0;
2299 }
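
/*
 * The per-core free list sizing above is a ceiling division: each core gets
 * ceil(num_devices / num_switching_cores) entries. A hedged sketch of the
 * same computation as a single expression:
 */
static uint32_t __attribute__((unused))
per_core_ll_size_sketch(uint32_t devices, uint32_t cores)
{
        /* e.g. 10 devices over 4 cores -> 3 entries per core */
        return (devices + cores - 1) / cores;
}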
2300
2301 /*
2302  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2303  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2304  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2305  */
2306 static void
2307 destroy_device (volatile struct virtio_net *dev)
2308 {
2309         struct virtio_net_data_ll *ll_lcore_dev_cur;
2310         struct virtio_net_data_ll *ll_main_dev_cur;
2311         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2312         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2313         struct vhost_dev *vdev;
2314         int lcore;
2315
2316         dev->flags &= ~VIRTIO_DEV_RUNNING;
2317
2318         vdev = (struct vhost_dev *)dev->priv;
2319         /*set the remove flag. */
2320         vdev->remove = 1;
2321         while (vdev->ready != DEVICE_SAFE_REMOVE) {
2322                 rte_pause();
2323         }
2324
2325         /* Search for entry to be removed from lcore ll */
2326         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2327         while (ll_lcore_dev_cur != NULL) {
2328                 if (ll_lcore_dev_cur->vdev == vdev) {
2329                         break;
2330                 } else {
2331                         ll_lcore_dev_last = ll_lcore_dev_cur;
2332                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2333                 }
2334         }
2335
2336         if (ll_lcore_dev_cur == NULL) {
2337                 RTE_LOG(ERR, VHOST_CONFIG,
2338                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2339                         dev->device_fh);
2340                 return;
2341         }
2342
2343         /* Search for entry to be removed from main ll */
2344         ll_main_dev_cur = ll_root_used;
2345         ll_main_dev_last = NULL;
2346         while (ll_main_dev_cur != NULL) {
2347                 if (ll_main_dev_cur->vdev == vdev) {
2348                         break;
2349                 } else {
2350                         ll_main_dev_last = ll_main_dev_cur;
2351                         ll_main_dev_cur = ll_main_dev_cur->next;
2352                 }
2353         }
2354
2355         /* Remove entries from the lcore and main ll. */
2356         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2357         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2358
2359         /* Set the dev_removal_flag on each lcore. */
2360         RTE_LCORE_FOREACH_SLAVE(lcore) {
2361                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2362         }
2363
2364         /*
2365          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2366          * they can no longer access the device removed from the linked lists and that the devices
2367          * are no longer in use.
2368          */
2369         RTE_LCORE_FOREACH_SLAVE(lcore) {
2370                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2371                         rte_pause();
2372                 }
2373         }
2374
2375         /* Add the entries back to the lcore and main free ll.*/
2376         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2377         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2378
2379         /* Decrement number of device on the lcore. */
2380         lcore_info[vdev->coreid].lcore_ll->device_num--;
2381
2382         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2383
2384         if (zero_copy) {
2385                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2386
2387                 /* Stop the RX queue. */
2388                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2389                         LOG_DEBUG(VHOST_CONFIG,
2390                                 "(%"PRIu64") In destroy_device: Failed to stop "
2391                                 "rx queue:%d\n",
2392                                 dev->device_fh,
2393                                 vdev->vmdq_rx_q);
2394                 }
2395
2396                 LOG_DEBUG(VHOST_CONFIG,
2397                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2398                         "mempool back to ring for RX queue: %d\n",
2399                         dev->device_fh, vdev->vmdq_rx_q);
2400
2401                 mbuf_destroy_zcp(vpool);
2402
2403                 /* Stop the TX queue. */
2404                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2405                         LOG_DEBUG(VHOST_CONFIG,
2406                                 "(%"PRIu64") In destroy_device: Failed to "
2407                                 "stop tx queue:%d\n",
2408                                 dev->device_fh, vdev->vmdq_rx_q);
2409                 }
2410
2411                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2412
2413                 LOG_DEBUG(VHOST_CONFIG,
2414                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2415                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2416                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2417                         dev->device_fh);
2418
2419                 mbuf_destroy_zcp(vpool);
2420                 rte_free(vdev->regions_hpa);
2421         }
2422         rte_free(vdev);
2423
2424 }
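
/*
 * The removal handshake used by destroy_device(), condensed into a hedged
 * sketch: raise REQUEST_DEV_REMOVAL on every worker, then spin until each
 * worker acknowledges, guaranteeing that no data core still references the
 * unlinked device.
 */
static void __attribute__((unused))
wait_for_removal_ack_sketch(void)
{
        int lcore;

        RTE_LCORE_FOREACH_SLAVE(lcore) {
                lcore_info[lcore].lcore_ll->dev_removal_flag =
                        REQUEST_DEV_REMOVAL;
        }
        RTE_LCORE_FOREACH_SLAVE(lcore) {
                while (lcore_info[lcore].lcore_ll->dev_removal_flag !=
                        ACK_DEV_REMOVAL)
                        rte_pause();
        }
}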
2425
2426 /*
2427  * Calculate the count of physically contiguous regions inside one particular
2428  * region whose vhost virtual address is contiguous. The particular region
2429  * starts from vva_start, with a size of 'size' given in the argument.
2430  */
2431 static uint32_t
2432 check_hpa_regions(uint64_t vva_start, uint64_t size)
2433 {
2434         uint32_t i, nregions = 0, page_size = getpagesize();
2435         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2436         if (vva_start % page_size) {
2437                 LOG_DEBUG(VHOST_CONFIG,
2438                         "in check_continuous: vva start(%p) mod page_size(%d) "
2439                         "has remainder\n",
2440                         (void *)(uintptr_t)vva_start, page_size);
2441                 return 0;
2442         }
2443         if (size % page_size) {
2444                 LOG_DEBUG(VHOST_CONFIG,
2445                         "in check_continuous: "
2446                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2447                         size, page_size);
2448                 return 0;
2449         }
2450         for (i = 0; i < size - page_size; i = i + page_size) {
2451                 cur_phys_addr
2452                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2453                 next_phys_addr = rte_mem_virt2phy(
2454                         (void *)(uintptr_t)(vva_start + i + page_size));
2455                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2456                         ++nregions;
2457                         LOG_DEBUG(VHOST_CONFIG,
2458                                 "in check_continuous: hva addr:(%p) is not "
2459                                 "continuous with hva addr:(%p), diff:%d\n",
2460                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2461                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2462                                 + page_size), page_size);
2463                         LOG_DEBUG(VHOST_CONFIG,
2464                                 "in check_continuous: hpa addr:(%p) is not "
2465                                 "continuous with hpa addr:(%p), "
2466                                 "diff:(%"PRIu64")\n",
2467                                 (void *)(uintptr_t)cur_phys_addr,
2468                                 (void *)(uintptr_t)next_phys_addr,
2469                                 (next_phys_addr-cur_phys_addr));
2470                 }
2471         }
2472         return nregions;
2473 }
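
/*
 * The core test in check_hpa_regions() distilled into a hedged sketch: two
 * adjacent virtual pages are physically contiguous iff the physical address
 * of the first page plus the page size equals that of the second.
 */
static int __attribute__((unused))
pages_contiguous_sketch(uint64_t vva, uint32_t page_size)
{
        phys_addr_t cur = rte_mem_virt2phy((void *)(uintptr_t)vva);
        phys_addr_t next = rte_mem_virt2phy(
                        (void *)(uintptr_t)(vva + page_size));

        return (cur + page_size) == next;
}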
2474
2475 /*
2476  * Divide each region whose vhost virtual address is contiguous into a few
2477  * sub-regions, making sure the physical addresses within each sub-region
2478  * are contiguous, and fill the offset (to GPA), size, and other information of each
2479  * sub-region into regions_hpa.
2480  */
static uint32_t
fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa,
	struct virtio_memory *virtio_memory)
{
	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;

	if (mem_region_hpa == NULL)
		return 0;

	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
			virtio_memory->regions[regionidx].address_offset;
		mem_region_hpa[regionidx_hpa].guest_phys_address
			= virtio_memory->regions[regionidx].guest_phys_address;
		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
			mem_region_hpa[regionidx_hpa].guest_phys_address;
		LOG_DEBUG(VHOST_CONFIG,
			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
			regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].guest_phys_address));
		LOG_DEBUG(VHOST_CONFIG,
			"in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
			regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
		for (i = 0, k = 0;
			i < virtio_memory->regions[regionidx].memory_size -
				page_size;
			i += page_size) {
			cur_phys_addr = rte_mem_virt2phy(
					(void *)(uintptr_t)(vva_start + i));
			next_phys_addr = rte_mem_virt2phy(
					(void *)(uintptr_t)(vva_start +
					i + page_size));
			if ((cur_phys_addr + page_size) != next_phys_addr) {
				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
					mem_region_hpa[regionidx_hpa].guest_phys_address +
					k + page_size;
				mem_region_hpa[regionidx_hpa].memory_size
					= k + page_size;
				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
					"phys addr end  [%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
				LOG_DEBUG(VHOST_CONFIG,
					"in fill_hpa_regions: guest phys addr "
					"size [%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].memory_size));
				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
				++regionidx_hpa;
				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
					next_phys_addr -
					mem_region_hpa[regionidx_hpa].guest_phys_address;
				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
					" phys addr start[%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].guest_phys_address));
				LOG_DEBUG(VHOST_CONFIG,
					"in fill_hpa_regions: host  phys addr "
					"start[%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
				k = 0;
			} else {
				k += page_size;
			}
		}
		mem_region_hpa[regionidx_hpa].guest_phys_address_end
			= mem_region_hpa[regionidx_hpa].guest_phys_address
			+ k + page_size;
		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
			"[%d]:(%p)\n", regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
			"[%d]:(%p)\n", regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].memory_size));
		++regionidx_hpa;
	}
	return regionidx_hpa;
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(struct virtio_net *dev)
{
	struct virtio_net_data_ll *ll_dev;
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	uint32_t regionidx;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
			dev->device_fh);
		return -1;
	}
	vdev->dev = dev;
	dev->priv = vdev;

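	/*
	 * Zero copy needs one HPA table entry per physically continuous
	 * chunk: start with one entry per guest region and add one more
	 * for each discontinuity reported by check_hpa_regions().
	 */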
	if (zero_copy) {
		vdev->nregions_hpa = dev->mem->nregions;
		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
			vdev->nregions_hpa
				+= check_hpa_regions(
					dev->mem->regions[regionidx].guest_phys_address
					+ dev->mem->regions[regionidx].address_offset,
					dev->mem->regions[regionidx].memory_size);
		}

		vdev->regions_hpa = rte_zmalloc("vhost hpa region",
			sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
			RTE_CACHE_LINE_SIZE);
		if (vdev->regions_hpa == NULL) {
			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
			rte_free(vdev);
			return -1;
		}

		if (fill_hpa_memory_regions(vdev->regions_hpa, dev->mem) !=
			vdev->nregions_hpa) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"hpa memory regions number mismatch: "
				"[%d]\n", vdev->nregions_hpa);
			rte_free(vdev->regions_hpa);
			rte_free(vdev);
			return -1;
		}
	}

	/* Add device to main ll */
	ll_dev = get_data_ll_free_entry(&ll_root_free);
	if (ll_dev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
			"of %d devices per core has been reached\n",
			dev->device_fh, num_devices);
		if (vdev->regions_hpa)
			rte_free(vdev->regions_hpa);
		rte_free(vdev);
		return -1;
	}
	ll_dev->vdev = vdev;
	add_data_ll_entry(&ll_root_used, ll_dev);
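	/*
	 * Pick the device's dedicated VMDq RX queue: device_fh indexes the
	 * VMDq pool, queues_per_pool scales it, and vmdq_queue_base skips
	 * the queues that precede the VMDq region.
	 */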
	vdev->vmdq_rx_q
		= dev->device_fh * queues_per_pool + vmdq_queue_base;

	if (zero_copy) {
		uint32_t index = vdev->vmdq_rx_q;
		uint32_t count_in_ring, i;
		struct mbuf_table *tx_q;

		count_in_ring = rte_ring_count(vpool_array[index].ring);

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") in new_device: mbuf count in mempool "
			"before attach is: %d\n",
			dev->device_fh,
			rte_mempool_count(vpool_array[index].pool));
		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") in new_device: mbuf count in ring "
			"before attach is: %d\n",
			dev->device_fh, count_in_ring);

		/*
		 * Attach all mbufs in vpool.ring and put them back into
		 * vpool.pool.
		 */
		for (i = 0; i < count_in_ring; i++)
			attach_rxmbuf_zcp(dev);

		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
			"mempool after attach is: %d\n",
			dev->device_fh,
			rte_mempool_count(vpool_array[index].pool));
		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
			"ring after attach is: %d\n",
			dev->device_fh,
			rte_ring_count(vpool_array[index].ring));

		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
		tx_q->txq_id = vdev->vmdq_rx_q;

		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];

			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In new_device: Failed to start "
				"tx queue:%d\n",
				dev->device_fh, vdev->vmdq_rx_q);

			mbuf_destroy_zcp(vpool);
			rte_free(vdev->regions_hpa);
			rte_free(vdev);
			return -1;
		}

		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];

			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In new_device: Failed to start "
				"rx queue:%d\n",
				dev->device_fh, vdev->vmdq_rx_q);

			/* Stop the TX queue. */
			if (rte_eth_dev_tx_queue_stop(ports[0],
				vdev->vmdq_rx_q) != 0) {
				LOG_DEBUG(VHOST_CONFIG,
					"(%"PRIu64") In new_device: Failed to "
					"stop tx queue:%d\n",
					dev->device_fh, vdev->vmdq_rx_q);
			}

			mbuf_destroy_zcp(vpool);
			rte_free(vdev->regions_hpa);
			rte_free(vdev);
			return -1;
		}
	}

	/* Reset ready flag. */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find the least-busy lcore (fewest attached devices) for the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
			device_num_min = lcore_info[lcore].lcore_ll->device_num;
			core_add = lcore;
		}
	}
	/* Add device to lcore ll */
	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
	if (ll_dev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
		vdev->ready = DEVICE_SAFE_REMOVE;
		destroy_device(dev);
		if (vdev->regions_hpa)
			rte_free(vdev->regions_hpa);
		rte_free(vdev);
		return -1;
	}
	ll_dev->vdev = vdev;
	vdev->coreid = core_add;

	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);

	/* Initialize device stats */
	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
	lcore_info[vdev->coreid].lcore_ll->device_num++;
	dev->flags |= VIRTIO_DEV_RUNNING;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops = {
	.new_device = new_device,
	.destroy_device = destroy_device,
};

/*
 * This thread wakes up periodically (every enable_stats seconds) to print
 * statistics if the user has enabled them.
 */
static void
print_stats(void)
{
	struct virtio_net_data_ll *dev_ll;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	uint32_t device_fh;
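	/*
	 * ANSI escape sequences: ESC[2J clears the screen and ESC[1;1H
	 * moves the cursor to the top-left corner.
	 */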
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s", clr, top_left);

		printf("\nDevice statistics ====================================");

		dev_ll = ll_root_used;
		while (dev_ll != NULL) {
			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
			tx_total = dev_statistics[device_fh].tx_total;
			tx = dev_statistics[device_fh].tx;
			tx_dropped = tx_total - tx;
			if (zero_copy == 0) {
				rx_total = rte_atomic64_read(
					&dev_statistics[device_fh].rx_total_atomic);
				rx = rte_atomic64_read(
					&dev_statistics[device_fh].rx_atomic);
			} else {
				rx_total = dev_statistics[device_fh].rx_total;
				rx = dev_statistics[device_fh].rx;
			}
			rx_dropped = rx_total - rx;

			printf("\nStatistics for device %"PRIu32" ------------------------------"
					"\nTX total:        %"PRIu64""
					"\nTX dropped:      %"PRIu64""
					"\nTX successful:   %"PRIu64""
					"\nRX total:        %"PRIu64""
					"\nRX dropped:      %"PRIu64""
					"\nRX successful:   %"PRIu64"",
					device_fh,
					tx_total,
					tx_dropped,
					tx,
					rx_total,
					rx_dropped,
					rx);

			dev_ll = dev_ll->next;
		}
		printf("\n======================================================\n");
	}
}

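/*
 * Create the mempool and companion ring for one zero-copy queue. The pool
 * holds descriptor-sized mbufs; the ring, sized to the next power of two
 * above nb_mbuf, tracks the mbufs that are not currently attached to
 * guest buffers.
 */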
static void
setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
	char *ring_name, uint32_t nb_mbuf)
{
	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;

	vpool_array[index].pool
		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
		rte_pktmbuf_init, NULL, socket, 0);
	if (vpool_array[index].pool != NULL) {
		vpool_array[index].ring
			= rte_ring_create(ring_name,
				rte_align32pow2(nb_mbuf + 1),
				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
		if (likely(vpool_array[index].ring != NULL)) {
			LOG_DEBUG(VHOST_CONFIG,
				"in setup_mempool_tbl: mbuf count in "
				"mempool is: %d\n",
				rte_mempool_count(vpool_array[index].pool));
			LOG_DEBUG(VHOST_CONFIG,
				"in setup_mempool_tbl: mbuf count in "
				"ring   is: %d\n",
				rte_ring_count(vpool_array[index].ring));
		} else {
			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
				ring_name);
		}

		/* Need to take the headroom into account. */
		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
	} else {
		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
	}
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 * The CUSE device is also registered here to handle the IOCTLs.
 */
int
main(int argc, char *argv[])
{
	struct rte_mempool *mbuf_pool = NULL;
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret;
	uint8_t portid;
	uint16_t queue_id;
	static pthread_t tid;

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Set the number of switching cores available. */
	num_switching_cores = rte_lcore_count() - 1;
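	/*
	 * The master lcore is excluded: it never runs switch_worker and
	 * instead ends up in rte_vhost_driver_session_start() below.
	 */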

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();
	if (nb_ports > RTE_MAX_ETHPORTS)
		nb_ports = RTE_MAX_ETHPORTS;

	/*
	 * Update the global variable num_ports and the global array ports[],
	 * and compute valid_num_ports from the number of ports in the system.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if (zero_copy == 0) {
		/* Create the mbuf pool. */
		mbuf_pool = rte_mempool_create(
				"MBUF_POOL",
				NUM_MBUFS_PER_PORT * valid_num_ports,
				MBUF_SIZE, MBUF_CACHE_SIZE,
				sizeof(struct rte_pktmbuf_pool_private),
				rte_pktmbuf_pool_init, NULL,
				rte_pktmbuf_init, NULL,
				rte_socket_id(), 0);
		if (mbuf_pool == NULL)
			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

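		/*
		 * In copy mode every queue shares this single mempool; the
		 * per-queue pool/ring pairs are only built for zero copy.
		 */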
		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
			vpool_array[queue_id].pool = mbuf_pool;

		if (vm2vm_mode == VM2VM_HARDWARE) {
			/* Enable VT loopback to let the L2 switch handle it. */
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
			LOG_DEBUG(VHOST_CONFIG,
				"Enable loop back for L2 switch in vmdq.\n");
		}
	} else {
		uint32_t nb_mbuf;
		char pool_name[RTE_MEMPOOL_NAMESIZE];
		char ring_name[RTE_MEMPOOL_NAMESIZE];

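		/*
		 * Size each RX pool for a full descriptor ring plus a mbuf
		 * cache and one burst of packets per switching core.
		 */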
		nb_mbuf = num_rx_descriptor
			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
			+ num_switching_cores * MAX_PKT_BURST;

		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
			snprintf(pool_name, sizeof(pool_name),
				"rxmbuf_pool_%u", queue_id);
			snprintf(ring_name, sizeof(ring_name),
				"rxmbuf_ring_%u", queue_id);
			setup_mempool_tbl(rte_socket_id(), queue_id,
				pool_name, ring_name, nb_mbuf);
		}

		nb_mbuf = num_tx_descriptor
				+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
				+ num_switching_cores * MAX_PKT_BURST;

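		/*
		 * TX pools are placed at vpool_array indices MAX_QUEUES and
		 * above, so a queue's RX and TX pools never collide.
		 */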
		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
			snprintf(pool_name, sizeof(pool_name),
				"txmbuf_pool_%u", queue_id);
			snprintf(ring_name, sizeof(ring_name),
				"txmbuf_ring_%u", queue_id);
			setup_mempool_tbl(rte_socket_id(),
				(queue_id + MAX_QUEUES),
				pool_name, ring_name, nb_mbuf);
		}

		if (vm2vm_mode == VM2VM_HARDWARE) {
			/* Enable VT loopback to let the L2 switch handle it. */
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
			LOG_DEBUG(VHOST_CONFIG,
				"Enable loop back for L2 switch in vmdq.\n");
		}
	}
	/* Set log level. */
	rte_set_log_level(LOG_LEVEL);

	/* Initialize all ports. */
	for (portid = 0; portid < nb_ports; portid++) {
		/* Skip ports that are not enabled. */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Initialise all linked lists. */
	if (init_data_ll() == -1)
		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");

	/* Initialize device stats */
	memset(&dev_statistics, 0, sizeof(dev_statistics));

	/* Enable stats if the user option is set. */
	if (enable_stats)
		pthread_create(&tid, NULL, (void *)print_stats, NULL);

	/* Launch all data cores. */
	if (zero_copy == 0) {
		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
			rte_eal_remote_launch(switch_worker,
				mbuf_pool, lcore_id);
		}
	} else {
		uint32_t count_in_mempool, index, i;
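
		/*
		 * Prime every zero-copy ring: move all un-attached mbufs
		 * from each pool into its companion ring so that
		 * attach_rxmbuf_zcp() can pull them later.
		 */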
		for (index = 0; index < 2 * MAX_QUEUES; index++) {
			/* For all RX and TX queues. */
			count_in_mempool
				= rte_mempool_count(vpool_array[index].pool);

			/*
			 * Transfer all un-attached mbufs from vpool.pool
			 * to vpool.ring.
			 */
			for (i = 0; i < count_in_mempool; i++) {
				struct rte_mbuf *mbuf
					= __rte_mbuf_raw_alloc(
						vpool_array[index].pool);
				rte_ring_sp_enqueue(vpool_array[index].ring,
						(void *)mbuf);
			}

			LOG_DEBUG(VHOST_CONFIG,
				"in main: mbuf count in mempool at initial "
				"is: %d\n", count_in_mempool);
			LOG_DEBUG(VHOST_CONFIG,
				"in main: mbuf count in ring at initial is: "
				"%d\n",
				rte_ring_count(vpool_array[index].ring));
		}

		RTE_LCORE_FOREACH_SLAVE(lcore_id)
			rte_eal_remote_launch(switch_worker_zcp, NULL,
				lcore_id);
	}

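	/*
	 * When mergeable RX buffers are disabled on the command line, mask
	 * out VIRTIO_NET_F_MRG_RXBUF so guests never negotiate the feature.
	 */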
	if (mergeable == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);

	/* Register CUSE device to handle IOCTLs. */
	ret = rte_vhost_driver_register((char *)&dev_basename);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");

	rte_vhost_driver_callback_register(&virtio_net_device_ops);

	/* Start CUSE session. */
	rte_vhost_driver_session_start();
	return 0;
}
