examples/vhost: use factorized default Rx/Tx configuration
dpdk.git: examples/vhost/main.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>

#include "main.h"

#define MAX_QUEUES 256

/* The maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +	\
			(num_switching_cores * MAX_PKT_BURST) +		\
			(num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
			(num_switching_cores * MBUF_CACHE_SIZE))
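/*
 * Illustrative sizing (not from the original source): with MAX_QUEUES = 256,
 * RTE_TEST_RX_DESC_DEFAULT = 1024 and, say, num_switching_cores = 4, this
 * budgets 256*1024 + 4*32 + 4*512 + 4*128 = 264832 mbufs per port: enough to
 * fill every RX ring plus per-core TX burst, TX ring and mempool cache space.
 */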

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers, and
 * vhost uses them directly.
 */
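/* 1518 bytes = 1500-byte payload + 14-byte Ethernet header + 4-byte FCS. */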
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK-based front ends:
 * max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
 * then rounded down to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for virtio headers, the other half for mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
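/*
 * Worked reading of the RX default above (an interpretation, not from the
 * original source): a legacy front end exposes 128 vring entries, half of
 * which carry virtio headers, leaving 64 for frame data; 64 - MAX_PKT_BURST
 * (32) = 32, which is already a power of 2.
 */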

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))
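/*
 * The headroom starts immediately after the rte_mbuf struct, so this aliases
 * its first 4 bytes as a uint32_t; the zero-copy paths presumably stash a
 * per-buffer index there (an assumption based on the macro's name, since
 * the uses fall outside this excerpt).
 */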

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)
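/*
 * E.g. POWEROF2(64) -> (63 & 64) == 0 is true, POWEROF2(48) -> (47 & 48)
 * == 32 is false; note the expression also evaluates true for x == 0.
 */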

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
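/*
 * With the common 64-byte cache line and the 16-byte struct vring_desc
 * (u64 addr, u32 len, u16 flags, u16 next), this evaluates to 4.
 */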

/* Mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from hardware.
 * Disabled by default.
 */
static uint32_t zero_copy;
static int mergeable;

/* Number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * Necessary for 1G NICs such as the I350: fixes a bug where
		 * IPv4 forwarding in the guest could not forward packets
		 * from one virtio device to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * Should be overridden separately in code with
		 * appropriate values.
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* Ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
	unsigned char   h_dest[ETH_ALEN];
	unsigned char   h_source[ETH_ALEN];
	__be16          h_vlan_proto;
	__be16          h_vlan_TCI;
	__be16          h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN	4
#define VLAN_ETH_HLEN	18

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
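/*
 * Note: the RX counters are atomic because, with VM2VM software switching,
 * any core may deliver packets into another device's virtio RX queue and
 * update that device's RX statistics concurrently (see virtio_tx_local()
 * below).
 */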
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
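 *
 * For example (illustrative): with num_devices = 8, pool i is mapped to
 * vlan_tags[i], i.e. pools 0..7 take VLAN tags 1000..1007, each with a
 * one-hot pool mask of (1UL << i).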
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, log an error and return -1.
 * Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as parameter.
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/*
	 * Zero copy defers queue RX/TX start to the time when guest
	 * finishes its startup and packet buffers from that guest are
	 * available.
	 */
	if (zero_copy) {
		rxconf->rx_deferred_start = 1;
		rxconf->rx_drop_en = 0;
		txconf->tx_deferred_start = 1;
	}

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base  = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* Reject names that would not fit, including the NUL terminator
	 * (strnlen() is bounded by MAX_BASENAME_SZ, so compare with >=). */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"               --vm2vm [0|1|2]\n"
	"               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"               --dev-basename <name>\n"
	"               --nb-devices ND\n"
	"               -p PORTMASK: Set mask for ports to be used by application\n"
	"               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
	"               --rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
	"               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"               --dev-basename: The basename to be used for the character device.\n"
	"               --zero-copy [0|1]: disable(default)/enable RX/TX "
			"zero copy\n"
	"               --rx-desc-num [0-N]: the number of descriptors on RX, "
			"used only when zero copy is enabled.\n"
	"               --tx-desc-num [0-N]: the number of descriptors on TX, "
			"used only when zero copy is enabled.\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"zero-copy", required_argument, NULL, 0},
		{"rx-desc-num", required_argument, NULL, 0},
		{"tx-desc-num", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Enable/disable RX/TX zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
					"zero copy vhost APP, please "
					"disable RTE_MBUF_REFCNT\n"
					"in config file and then rebuild DPDK "
					"core lib!\n"
					"Otherwise please disable zero copy "
					"flag in command line!\n");
					return -1;
#endif
				}
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for rx-desc-num [0-N], "
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_rx_descriptor = ret;
				}
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for tx-desc-num [0-N], "
					"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					num_tx_descriptor = ret;
				}
			}

			break;

			/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to system ports number
 * and return valid ports number
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, \
			"(%"PRIu64") Header size %d: ", \
			(device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, \
			"(%"PRIu64") Packet size %d: ", \
			(device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), \
			MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), \
		MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
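 *
 * Illustrative example (hypothetical region layout): a region covering GPAs
 * 0x1000..0x1fff with host_phys_addr_offset 0x7f0000000000 translates
 * guest_pa 0x1234 to 0x7f0000001234; if guest_pa + buf_len - 1 runs past
 * guest_phys_address_end, the result is flagged PHYS_ADDR_CROSS_SUBREG.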
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
	uint32_t buf_len, hpa_type *addr_type)
{
	struct virtio_memory_regions_hpa *region;
	uint32_t regionidx;
	uint64_t vhost_pa = 0;

	*addr_type = PHYS_ADDR_INVALID;

	for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
		region = &vdev->regions_hpa[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_pa = region->host_phys_addr_offset + guest_pa;
			if (likely((guest_pa + buf_len - 1)
				<= region->guest_phys_address_end))
				*addr_type = PHYS_ADDR_CONTINUOUS;
			else
				*addr_type = PHYS_ADDR_CROSS_SUBREG;
			break;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
		vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
		(void *)(uintptr_t)vhost_pa);

	return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
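 *
 * Both addresses are read with 8-byte loads and XORed; MAC_ADDR_CMP masks
 * the result down to the low 48 bits so only the six MAC bytes matter. This
 * assumes the surrounding structures provide at least 8 readable bytes at
 * each address (true for an ether_hdr, where more header bytes follow).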
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)dev->device_fh + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
					dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* Clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct virtio_net *dev = vdev->dev;
	struct virtio_net *tdev; /* destination virtio device */

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* Get the used devices list */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
					  &dev_ll->vdev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
							dev->device_fh);
				return 0;
			}
			tdev = dev_ll->vdev->dev;

			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

			if (unlikely(dev_ll->vdev->remove)) {
				/* Drop the packet if the device is marked for removal */
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
			} else {
				/* Send the packet to the local virtio device */
				ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
				if (enable_stats) {
					rte_atomic64_add(
					&dev_statistics[tdev->device_fh].rx_total_atomic,
					1);
					rte_atomic64_add(
					&dev_statistics[tdev->device_fh].rx_atomic,
					ret);
					dev_statistics[tdev->device_fh].tx_total++;
					dev_statistics[tdev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM,
 * and if so get its vlan tag and the length offset to apply.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	while (dev_ll != NULL) {
		if ((dev_ll->vdev->ready == DEVICE_RX)
			&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->vdev->mac_address)) {
			/*
			 * Drop the packet if the TX packet is
			 * destined for the TX device.
			 */
			if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA,
				"(%"PRIu64") TX: Source and destination"
				" MAC addresses are the same. Dropping "
				"packet.\n",
				dev_ll->vdev->dev->device_fh);
				return -1;
			}

			/*
			 * HW vlan strip reduces the packet length by the
			 * length of the vlan tag, so restore the packet
			 * length by adding it back.
			 */
			*offset = VLAN_HLEN;
			*vlan_tag =
			(uint16_t)
			vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

			LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") TX: pkt to local VM device id:"
			"(%"PRIu64") vlan tag: %d.\n",
			dev->device_fh, dev_ll->vdev->dev->device_fh,
			*vlan_tag);

			break;
		}
		dev_ll = dev_ll->next;
	}
	return 0;
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;

	/* Check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
			offset > rte_pktmbuf_tailroom(m)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

	/* Add packet to the port TX queue */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	m->ol_flags = PKT_TX_VLAN_PKT;

	m->data_len += offset;
	m->pkt_len += offset;

	m->vlan_tci = vlan_tag;

	tx_q->m_table[len] = m;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}

/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
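	/*
	 * Drain interval in TSC cycles; e.g. with a 2 GHz TSC this is
	 * ceil(2e9 / 1e6) * 100 = 200000 cycles, i.e. ~100us between
	 * forced TX drains (illustrative figure).
	 */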
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();

		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* TX any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
						       (struct rte_mbuf **)tx_q->m_table,
						       (uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;
		}

		rte_prefetch0(lcore_ll->ll_root_used);

		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* Get virtio device ID */
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then we
					 * wait and retry to avoid packet loss. Note that
					 * MAX_PKT_BURST must be less than the virtio queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
						rx_count);
						rte_atomic64_add(
						&dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}
				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count--)
							rte_pktmbuf_free(pkts_burst[tx_count]);
					}
				}
				while (tx_count)
					virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
			}

			/* Move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}
1302
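/*
 * Illustrative note (not from the original source): the drain interval used in
 * the loop above is derived from the TSC frequency. A minimal sketch of the
 * same calculation, assuming a BURST_TX_DRAIN_US of 100 us:
 *
 *   const uint64_t drain_tsc =
 *       (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
 *
 * e.g. with a 2.4 GHz TSC this gives 2400 cycles/us * 100 us = 240000 cycles
 * between forced TX queue flushes.
 */
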
1303 /*
1304  * This function gets the number of available ring entries for zero copy rx.
1305  * Only one thread will call this function for a particular virtio device,
1306  * so it is designed as a non-thread-safe function.
1307  */
1308 static inline uint32_t __attribute__((always_inline))
1309 get_available_ring_num_zcp(struct virtio_net *dev)
1310 {
1311         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1312         uint16_t avail_idx;
1313
1314         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1315         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1316 }
1317
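/*
 * Illustrative note (not from the original source): the subtraction above
 * relies on uint16_t wrap-around, so it stays correct when avail->idx
 * overflows. For example:
 *
 *   avail_idx = 3, last_used_idx_res = 65533
 *   (uint16_t)(3 - 65533) = 6 entries available
 */
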
1318 /*
1319  * This function gets the available ring index for zero copy rx;
1320  * it will retry 'burst_rx_retry_num' times till it gets enough ring entries.
1321  * Only one thread will call this function for a particular virtio device,
1322  * so it is designed as a non-thread-safe function.
1323  */
1324 static inline uint32_t __attribute__((always_inline))
1325 get_available_ring_index_zcp(struct virtio_net *dev,
1326         uint16_t *res_base_idx, uint32_t count)
1327 {
1328         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1329         uint16_t avail_idx;
1330         uint32_t retry = 0;
1331         uint16_t free_entries;
1332
1333         *res_base_idx = vq->last_used_idx_res;
1334         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1335         free_entries = (avail_idx - *res_base_idx);
1336
1337         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1338                         "avail idx: %d, "
1339                         "res base idx:%d, free entries:%d\n",
1340                         dev->device_fh, avail_idx, *res_base_idx,
1341                         free_entries);
1342
1343         /*
1344          * If retry is enabled and the queue is full then we wait
1345          * and retry to avoid packet loss.
1346          */
1347         if (enable_retry && unlikely(count > free_entries)) {
1348                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1349                         rte_delay_us(burst_rx_delay_time);
1350                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1351                         free_entries = (avail_idx - *res_base_idx);
1352                         if (count <= free_entries)
1353                                 break;
1354                 }
1355         }
1356
1357         /*check that we have enough buffers*/
1358         if (unlikely(count > free_entries))
1359                 count = free_entries;
1360
1361         if (unlikely(count == 0)) {
1362                 LOG_DEBUG(VHOST_DATA,
1363                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1364                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1365                         dev->device_fh, avail_idx,
1366                         *res_base_idx, free_entries);
1367                 return 0;
1368         }
1369
1370         vq->last_used_idx_res = *res_base_idx + count;
1371
1372         return count;
1373 }
1374
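/*
 * Illustrative usage sketch (not part of the build): a caller reserves ring
 * entries first and only then reads the descriptor heads, e.g.
 *
 *   uint16_t base;
 *   if (get_available_ring_index_zcp(dev, &base, 1) == 1)
 *           desc_idx = vq->avail->ring[base & (vq->size - 1)];
 *
 * The reservation through last_used_idx_res is what keeps this safe without
 * locks, given the single-caller assumption documented above.
 */
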
1375 /*
1376  * This function puts a descriptor back on the used list.
1377  */
1378 static inline void __attribute__((always_inline))
1379 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1380 {
1381         uint16_t res_cur_idx = vq->last_used_idx;
1382         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1383         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1384         rte_compiler_barrier();
1385         *(volatile uint16_t *)&vq->used->idx += 1;
1386         vq->last_used_idx += 1;
1387
1388         /* Kick the guest if necessary. */
1389         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1390                 eventfd_write((int)vq->kickfd, 1);
1391 }
1392
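/*
 * Illustrative note (not from the original source): the update ordering in
 * put_desc_to_used_list_zcp() matters. The used ring entry must be fully
 * written before used->idx is incremented, hence the compiler barrier:
 *
 *   used->ring[idx].id = desc_idx;   (1) publish the entry
 *   rte_compiler_barrier();          (2) forbid reordering
 *   used->idx += 1;                  (3) make it visible to the guest
 *
 * If (3) were reordered before (1), the guest could read a stale entry.
 */
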
1393 /*
1394  * This function gets an available descriptor from the virtio vring and an
1395  * un-attached mbuf from vpool->ring, and then attaches them together. It needs
1396  * to adjust the offset for buff_addr and phys_addr according to the PMD
1397  * implementation, otherwise the frame data may be put at the wrong location in the mbuf.
1398  */
1399 static inline void __attribute__((always_inline))
1400 attach_rxmbuf_zcp(struct virtio_net *dev)
1401 {
1402         uint16_t res_base_idx, desc_idx;
1403         uint64_t buff_addr, phys_addr;
1404         struct vhost_virtqueue *vq;
1405         struct vring_desc *desc;
1406         struct rte_mbuf *mbuf = NULL;
1407         struct vpool *vpool;
1408         hpa_type addr_type;
1409         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1410
1411         vpool = &vpool_array[vdev->vmdq_rx_q];
1412         vq = dev->virtqueue[VIRTIO_RXQ];
1413
1414         do {
1415                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1416                                 1) != 1))
1417                         return;
1418                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1419
1420                 desc = &vq->desc[desc_idx];
1421                 if (desc->flags & VRING_DESC_F_NEXT) {
1422                         desc = &vq->desc[desc->next];
1423                         buff_addr = gpa_to_vva(dev, desc->addr);
1424                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1425                                         &addr_type);
1426                 } else {
1427                         buff_addr = gpa_to_vva(dev,
1428                                         desc->addr + vq->vhost_hlen);
1429                         phys_addr = gpa_to_hpa(vdev,
1430                                         desc->addr + vq->vhost_hlen,
1431                                         desc->len, &addr_type);
1432                 }
1433
1434                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1435                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1436                                 " address found when attaching RX frame buffer"
1437                                 " address!\n", dev->device_fh);
1438                         put_desc_to_used_list_zcp(vq, desc_idx);
1439                         continue;
1440                 }
1441
1442                 /*
1443                  * Check if the frame buffer address from guest crosses
1444                  * sub-region or not.
1445                  */
1446                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1447                         RTE_LOG(ERR, VHOST_DATA,
1448                                 "(%"PRIu64") Frame buffer address cross "
1449                                 "sub-region found when attaching RX frame "
1450                                 "buffer address!\n",
1451                                 dev->device_fh);
1452                         put_desc_to_used_list_zcp(vq, desc_idx);
1453                         continue;
1454                 }
1455         } while (unlikely(phys_addr == 0));
1456
1457         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1458         if (unlikely(mbuf == NULL)) {
1459                 LOG_DEBUG(VHOST_DATA,
1460                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1461                         "ring_sc_dequeue fail.\n",
1462                         dev->device_fh);
1463                 put_desc_to_used_list_zcp(vq, desc_idx);
1464                 return;
1465         }
1466
1467         if (unlikely(vpool->buf_size > desc->len)) {
1468                 LOG_DEBUG(VHOST_DATA,
1469                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1470                         "length(%d) of descriptor idx: %d less than room "
1471                         "size required: %d\n",
1472                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1473                 put_desc_to_used_list_zcp(vq, desc_idx);
1474                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1475                 return;
1476         }
1477
1478         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1479         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1480         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1481         mbuf->data_len = desc->len;
1482         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1483
1484         LOG_DEBUG(VHOST_DATA,
1485                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1486                 "descriptor idx:%d\n",
1487                 dev->device_fh, res_base_idx, desc_idx);
1488
1489         __rte_mbuf_raw_free(mbuf);
1490
1491         return;
1492 }
1493
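/*
 * Illustrative note (not from the original source): after attach_rxmbuf_zcp()
 * the mbuf's buffer pointers alias the guest buffer, offset so that
 * buf_addr + data_off lands exactly on the guest frame data:
 *
 *   buf_addr     = guest_vva - RTE_PKTMBUF_HEADROOM
 *   data_off     = RTE_PKTMBUF_HEADROOM
 *   buf_physaddr = guest_hpa - RTE_PKTMBUF_HEADROOM
 *
 * so the PMD can DMA received frames straight into guest memory.
 */
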
1494 /*
1495  * Detach an attached packet mbuf -
1496  *  - restore original mbuf address and length values.
1497  *  - reset pktmbuf data and data_len to their default values.
1498  *  All other fields of the given packet mbuf will be left intact.
1499  *
1500  * @param m
1501  *   The attached packet mbuf.
1502  */
1503 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1504 {
1505         const struct rte_mempool *mp = m->pool;
1506         void *buf = RTE_MBUF_TO_BADDR(m);
1507         uint32_t buf_ofs;
1508         uint32_t buf_len = mp->elt_size - sizeof(*m);
1509         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1510
1511         m->buf_addr = buf;
1512         m->buf_len = (uint16_t)buf_len;
1513
1514         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1515                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1516         m->data_off = buf_ofs;
1517
1518         m->data_len = 0;
1519 }
1520
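/*
 * Illustrative note (not from the original source): pktmbuf_detach_zcp() is
 * the inverse of attach_rxmbuf_zcp(): it points buf_addr/buf_physaddr back at
 * the mbuf's own buffer, which sits directly after the struct rte_mbuf header
 * in the mempool element (hence the sizeof(*m) offsets), so the mbuf can be
 * safely recycled through vpool->ring.
 */
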
1521 /*
1522  * This function is called after packets have been transmitted. It fetches
1523  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1524  * also updates the used index and kicks the guest if necessary.
1525  */
1526 static inline uint32_t __attribute__((always_inline))
1527 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1528 {
1529         struct rte_mbuf *mbuf;
1530         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1531         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1532         uint32_t index = 0;
1533         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1534
1535         LOG_DEBUG(VHOST_DATA,
1536                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1537                 "clean is: %d\n",
1538                 dev->device_fh, mbuf_count);
1539         LOG_DEBUG(VHOST_DATA,
1540                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1541                 "clean is: %d\n",
1542                 dev->device_fh, rte_ring_count(vpool->ring));
1543
1544         for (index = 0; index < mbuf_count; index++) {
1545                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1546                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1547                         pktmbuf_detach_zcp(mbuf);
1548                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1549
1550                 /* Update used index buffer information. */
1551                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1552                 vq->used->ring[used_idx].len = 0;
1553
1554                 used_idx = (used_idx + 1) & (vq->size - 1);
1555         }
1556
1557         LOG_DEBUG(VHOST_DATA,
1558                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1559                 "clean is: %d\n",
1560                 dev->device_fh, rte_mempool_count(vpool->pool));
1561         LOG_DEBUG(VHOST_DATA,
1562                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1563                 "clean is: %d\n",
1564                 dev->device_fh, rte_ring_count(vpool->ring));
1565         LOG_DEBUG(VHOST_DATA,
1566                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1567                 "vq->last_used_idx:%d\n",
1568                 dev->device_fh, vq->last_used_idx);
1569
1570         vq->last_used_idx += mbuf_count;
1571
1572         LOG_DEBUG(VHOST_DATA,
1573                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1574                 "vq->last_used_idx:%d\n",
1575                 dev->device_fh, vq->last_used_idx);
1576
1577         rte_compiler_barrier();
1578
1579         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1580
1581         /* Kick guest if required. */
1582         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1583                 eventfd_write((int)vq->kickfd, 1);
1584
1585         return 0;
1586 }
1587
1588 /*
1589  * This function is called when a virtio device is destroyed.
1590  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1591  */
1592 static void mbuf_destroy_zcp(struct vpool *vpool)
1593 {
1594         struct rte_mbuf *mbuf = NULL;
1595         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1596
1597         LOG_DEBUG(VHOST_CONFIG,
1598                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1599                 "mbuf_destroy_zcp is: %d\n",
1600                 mbuf_count);
1601         LOG_DEBUG(VHOST_CONFIG,
1602                 "in mbuf_destroy_zcp: mbuf count in ring before "
1603                 "mbuf_destroy_zcp is: %d\n",
1604                 rte_ring_count(vpool->ring));
1605
1606         for (index = 0; index < mbuf_count; index++) {
1607                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1608                 if (likely(mbuf != NULL)) {
1609                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1610                                 pktmbuf_detach_zcp(mbuf);
1611                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1612                 }
1613         }
1614
1615         LOG_DEBUG(VHOST_CONFIG,
1616                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1617                 "mbuf_destroy_zcp is: %d\n",
1618                 rte_mempool_count(vpool->pool));
1619         LOG_DEBUG(VHOST_CONFIG,
1620                 "in mbuf_destroy_zcp: mbuf count in ring after "
1621                 "mbuf_destroy_zcp is : %d\n",
1622                 rte_ring_count(vpool->ring));
1623 }
1624
1625 /*
1626  * This function updates the used ring for guest RX in the zero copy path.
1627  */
1628 static inline uint32_t __attribute__((always_inline))
1629 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1630         uint32_t count)
1631 {
1632         struct vhost_virtqueue *vq;
1633         struct vring_desc *desc;
1634         struct rte_mbuf *buff;
1635         /* The virtio_hdr is initialised to 0. */
1636         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1637                 = {{0, 0, 0, 0, 0, 0}, 0};
1638         uint64_t buff_hdr_addr = 0;
1639         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1640         uint32_t head_idx, packet_success = 0;
1641         uint16_t res_cur_idx;
1642
1643         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1644
1645         if (count == 0)
1646                 return 0;
1647
1648         vq = dev->virtqueue[VIRTIO_RXQ];
1649         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1650
1651         res_cur_idx = vq->last_used_idx;
1652         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1653                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1654
1655         /* Retrieve all of the head indexes first to avoid caching issues. */
1656         for (head_idx = 0; head_idx < count; head_idx++)
1657                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1658
1659         /* Prefetch descriptor index. */
1660         rte_prefetch0(&vq->desc[head[packet_success]]);
1661
1662         while (packet_success != count) {
1663                 /* Get descriptor from available ring */
1664                 desc = &vq->desc[head[packet_success]];
1665
1666                 buff = pkts[packet_success];
1667                 LOG_DEBUG(VHOST_DATA,
1668                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1669                         "pkt[%d] descriptor idx: %d\n",
1670                         dev->device_fh, packet_success,
1671                         MBUF_HEADROOM_UINT32(buff));
1672
1673                 PRINT_PACKET(dev,
1674                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1675                         + RTE_PKTMBUF_HEADROOM),
1676                         rte_pktmbuf_data_len(buff), 0);
1677
1678                 /* Buffer address translation for virtio header. */
1679                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1680                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1681
1682                 /*
1683                  * If the descriptors are chained the header and data are
1684                  * placed in separate buffers.
1685                  */
1686                 if (desc->flags & VRING_DESC_F_NEXT) {
1687                         desc->len = vq->vhost_hlen;
1688                         desc = &vq->desc[desc->next];
1689                         desc->len = rte_pktmbuf_data_len(buff);
1690                 } else {
1691                         desc->len = packet_len;
1692                 }
1693
1694                 /* Update used ring with desc information */
1695                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1696                         = head[packet_success];
1697                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1698                         = packet_len;
1699                 res_cur_idx++;
1700                 packet_success++;
1701
1702                 /* A header is required per buffer. */
1703                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1704                         (const void *)&virtio_hdr, vq->vhost_hlen);
1705
1706                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1707
1708                 if (likely(packet_success < count)) {
1709                         /* Prefetch descriptor index. */
1710                         rte_prefetch0(&vq->desc[head[packet_success]]);
1711                 }
1712         }
1713
1714         rte_compiler_barrier();
1715
1716         LOG_DEBUG(VHOST_DATA,
1717                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1718                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1719                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1720
1721         *(volatile uint16_t *)&vq->used->idx += count;
1722         vq->last_used_idx += count;
1723
1724         LOG_DEBUG(VHOST_DATA,
1725                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1726                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1727                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1728
1729         /* Kick the guest if necessary. */
1730         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1731                 eventfd_write((int)vq->kickfd, 1);
1732
1733         return count;
1734 }
1735
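/*
 * Illustrative note (not from the original source): the two descriptor
 * layouts handled in virtio_dev_rx_zcp() are, schematically:
 *
 *   chained: desc[head] (len = vhost_hlen, virtio header)
 *            -> desc[desc[head].next] (len = frame data)
 *   single:  desc[head] (len = vhost_hlen + frame data)
 *
 * In both cases one virtio_net_hdr_mrg_rxbuf is written per packet at the
 * head descriptor's address before the used ring index is published.
 */
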
1736 /*
1737  * This function routes the TX packet to the correct interface.
1738  * This may be a local device or the physical port.
1739  */
1740 static inline void __attribute__((always_inline))
1741 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1742         uint32_t desc_idx, uint8_t need_copy)
1743 {
1744         struct mbuf_table *tx_q;
1745         struct rte_mbuf **m_table;
1746         struct rte_mbuf *mbuf = NULL;
1747         unsigned len, ret, offset = 0;
1748         struct vpool *vpool;
1749         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1750         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1751
1752         /*Add packet to the port tx queue*/
1753         tx_q = &tx_queue_zcp[vmdq_rx_q];
1754         len = tx_q->len;
1755
1756         /* Allocate an mbuf and populate the structure. */
1757         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1758         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1759         if (unlikely(mbuf == NULL)) {
1760                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1761                 RTE_LOG(ERR, VHOST_DATA,
1762                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1763                         dev->device_fh);
1764                 put_desc_to_used_list_zcp(vq, desc_idx);
1765                 return;
1766         }
1767
1768         if (vm2vm_mode == VM2VM_HARDWARE) {
1769                 /* Avoid using a VLAN tag from any VM for an external packet,
1770                  * such as vlan_tags[dev->device_fh]; otherwise it conflicts in
1771                  * pool selection: the MAC address identifies it as an external
1772                  * packet that should go to the network, while the VLAN tag marks
1773                  * it as a VM2VM packet to forward to another VM. The hardware
1774                  * cannot resolve such ambiguity, so the packet would be lost.
1775                  */
1776                 vlan_tag = external_pkt_default_vlan_tag;
1777                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1778                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1779                         __rte_mbuf_raw_free(mbuf);
1780                         return;
1781                 }
1782         }
1783
1784         mbuf->nb_segs = m->nb_segs;
1785         mbuf->next = m->next;
1786         mbuf->data_len = m->data_len + offset;
1787         mbuf->pkt_len = mbuf->data_len;
1788         if (unlikely(need_copy)) {
1789                 /* Copy the packet contents to the mbuf. */
1790                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1791                         rte_pktmbuf_mtod(m, void *),
1792                         m->data_len);
1793         } else {
1794                 mbuf->data_off = m->data_off;
1795                 mbuf->buf_physaddr = m->buf_physaddr;
1796                 mbuf->buf_addr = m->buf_addr;
1797         }
1798         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1799         mbuf->vlan_tci = vlan_tag;
1800         mbuf->l2_len = sizeof(struct ether_hdr);
1801         mbuf->l3_len = sizeof(struct ipv4_hdr);
1802         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1803
1804         tx_q->m_table[len] = mbuf;
1805         len++;
1806
1807         LOG_DEBUG(VHOST_DATA,
1808                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1809                 dev->device_fh,
1810                 mbuf->nb_segs,
1811                 (mbuf->next == NULL) ? "null" : "non-null");
1812
1813         if (enable_stats) {
1814                 dev_statistics[dev->device_fh].tx_total++;
1815                 dev_statistics[dev->device_fh].tx++;
1816         }
1817
1818         if (unlikely(len == MAX_PKT_BURST)) {
1819                 m_table = (struct rte_mbuf **)tx_q->m_table;
1820                 ret = rte_eth_tx_burst(ports[0],
1821                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1822
1823                 /*
1824                  * Free any buffers not handled by TX and update
1825                  * the port stats.
1826                  */
1827                 if (unlikely(ret < len)) {
1828                         do {
1829                                 rte_pktmbuf_free(m_table[ret]);
1830                         } while (++ret < len);
1831                 }
1832
1833                 len = 0;
1834                 txmbuf_clean_zcp(dev, vpool);
1835         }
1836
1837         tx_q->len = len;
1838
1839         return;
1840 }
1841
1842 /*
1843  * This function TXes all available packets in the virtio TX queue for one
1844  * virtio-net device. If it is the first packet, it learns the MAC address
1845  * and sets up VMDQ.
1846  */
1847 static inline void __attribute__((always_inline))
1848 virtio_dev_tx_zcp(struct virtio_net *dev)
1849 {
1850         struct rte_mbuf m;
1851         struct vhost_virtqueue *vq;
1852         struct vring_desc *desc;
1853         uint64_t buff_addr = 0, phys_addr;
1854         uint32_t head[MAX_PKT_BURST];
1855         uint32_t i;
1856         uint16_t free_entries, packet_success = 0;
1857         uint16_t avail_idx;
1858         uint8_t need_copy = 0;
1859         hpa_type addr_type;
1860         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1861
1862         vq = dev->virtqueue[VIRTIO_TXQ];
1863         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1864
1865         /* If there are no available buffers then return. */
1866         if (vq->last_used_idx_res == avail_idx)
1867                 return;
1868
1869         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1870
1871         /* Prefetch available ring to retrieve head indexes. */
1872         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1873
1874         /* Get the number of free entries in the ring */
1875         free_entries = (avail_idx - vq->last_used_idx_res);
1876
1877         /* Limit to MAX_PKT_BURST. */
1878         free_entries
1879                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1880
1881         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1882                 dev->device_fh, free_entries);
1883
1884         /* Retrieve all of the head indexes first to avoid caching issues. */
1885         for (i = 0; i < free_entries; i++)
1886                 head[i]
1887                         = vq->avail->ring[(vq->last_used_idx_res + i)
1888                         & (vq->size - 1)];
1889
1890         vq->last_used_idx_res += free_entries;
1891
1892         /* Prefetch descriptor index. */
1893         rte_prefetch0(&vq->desc[head[packet_success]]);
1894         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1895
1896         while (packet_success < free_entries) {
1897                 desc = &vq->desc[head[packet_success]];
1898
1899                 /* Discard first buffer as it is the virtio header */
1900                 desc = &vq->desc[desc->next];
1901
1902                 /* Buffer address translation. */
1903                 buff_addr = gpa_to_vva(dev, desc->addr);
1904                 /* Need to check extra VLAN_HLEN size for inserting the VLAN tag */
1905                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1906                         &addr_type);
1907
1908                 if (likely(packet_success < (free_entries - 1)))
1909                         /* Prefetch descriptor index. */
1910                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1911
1912                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1913                         RTE_LOG(ERR, VHOST_DATA,
1914                                 "(%"PRIu64") Invalid frame buffer address found "
1915                                 "when TX packets!\n",
1916                                 dev->device_fh);
1917                         packet_success++;
1918                         continue;
1919                 }
1920
1921                 /* Prefetch buffer address. */
1922                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1923
1924                 /*
1925                  * Setup dummy mbuf. This is copied to a real mbuf if
1926                  * transmitted out the physical port.
1927                  */
1928                 m.data_len = desc->len;
1929                 m.nb_segs = 1;
1930                 m.next = NULL;
1931                 m.data_off = 0;
1932                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1933                 m.buf_physaddr = phys_addr;
1934
1935                 /*
1936                  * Check if the frame buffer address from guest crosses
1937                  * sub-region or not.
1938                  */
1939                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1940                         RTE_LOG(ERR, VHOST_DATA,
1941                                 "(%"PRIu64") Frame buffer address cross "
1942                                 "sub-region found when attaching TX frame "
1943                                 "buffer address!\n",
1944                                 dev->device_fh);
1945                         need_copy = 1;
1946                 } else
1947                         need_copy = 0;
1948
1949                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1950
1951                 /*
1952                  * If this is the first received packet we need to learn
1953                  * the MAC and setup VMDQ
1954                  */
1955                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1956                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1957                                 /*
1958                                  * Discard frame if device is scheduled for
1959                                  * removal or a duplicate MAC address is found.
1960                                  */
1961                                 packet_success += free_entries;
1962                                 vq->last_used_idx += packet_success;
1963                                 break;
1964                         }
1965                 }
1966
1967                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1968                 packet_success++;
1969         }
1970 }
1971
1972 /*
1973  * This function is called by each data core. It handles all RX/TX registered
1974  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1975  * addresses are compared with all devices in the main linked list.
1976  */
1977 static int
1978 switch_worker_zcp(__attribute__((unused)) void *arg)
1979 {
1980         struct virtio_net *dev = NULL;
1981         struct vhost_dev  *vdev = NULL;
1982         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1983         struct virtio_net_data_ll *dev_ll;
1984         struct mbuf_table *tx_q;
1985         volatile struct lcore_ll_info *lcore_ll;
1986         const uint64_t drain_tsc
1987                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
1988                 * BURST_TX_DRAIN_US;
1989         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1990         unsigned ret;
1991         const uint16_t lcore_id = rte_lcore_id();
1992         uint16_t count_in_ring, rx_count = 0;
1993
1994         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1995
1996         lcore_ll = lcore_info[lcore_id].lcore_ll;
1997         prev_tsc = 0;
1998
1999         while (1) {
2000                 cur_tsc = rte_rdtsc();
2001
2002                 /* TX burst queue drain */
2003                 diff_tsc = cur_tsc - prev_tsc;
2004                 if (unlikely(diff_tsc > drain_tsc)) {
2005                         /*
2006                          * Get mbuf from vpool.pool and detach mbuf and
2007                          * put back into vpool.ring.
2008                          */
2009                         dev_ll = lcore_ll->ll_root_used;
2010                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2011                                 /* Get virtio device ID */
2012                                 vdev = dev_ll->vdev;
2013                                 dev = vdev->dev;
2014
2015                                 if (likely(!vdev->remove)) {
2016                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2017                                         if (tx_q->len) {
2018                                                 LOG_DEBUG(VHOST_DATA,
2019                                                 "TX queue drained after timeout"
2020                                                 " with burst size %u\n",
2021                                                 tx_q->len);
2022
2023                                                 /*
2024                                                  * Tx any packets in the queue
2025                                                  */
2026                                                 ret = rte_eth_tx_burst(
2027                                                         ports[0],
2028                                                         (uint16_t)tx_q->txq_id,
2029                                                         (struct rte_mbuf **)
2030                                                         tx_q->m_table,
2031                                                         (uint16_t)tx_q->len);
2032                                                 if (unlikely(ret < tx_q->len)) {
2033                                                         do {
2034                                                                 rte_pktmbuf_free(
2035                                                                         tx_q->m_table[ret]);
2036                                                         } while (++ret < tx_q->len);
2037                                                 }
2038                                                 tx_q->len = 0;
2039
2040                                                 txmbuf_clean_zcp(dev,
2041                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2042                                         }
2043                                 }
2044                                 dev_ll = dev_ll->next;
2045                         }
2046                         prev_tsc = cur_tsc;
2047                 }
2048
2049                 rte_prefetch0(lcore_ll->ll_root_used);
2050
2051                 /*
2052                  * Inform the configuration core that we have exited the linked
2053                  * list and that no devices are in use if requested.
2054                  */
2055                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2056                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2057
2058                 /* Process devices */
2059                 dev_ll = lcore_ll->ll_root_used;
2060
2061                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2062                         vdev = dev_ll->vdev;
2063                         dev  = vdev->dev;
2064                         if (unlikely(vdev->remove)) {
2065                                 dev_ll = dev_ll->next;
2066                                 unlink_vmdq(vdev);
2067                                 vdev->ready = DEVICE_SAFE_REMOVE;
2068                                 continue;
2069                         }
2070
2071                         if (likely(vdev->ready == DEVICE_RX)) {
2072                                 uint32_t index = vdev->vmdq_rx_q;
2073                                 uint16_t i;
2074                                 count_in_ring
2075                                 = rte_ring_count(vpool_array[index].ring);
2076                                 uint16_t free_entries
2077                                 = (uint16_t)get_available_ring_num_zcp(dev);
2078
2079                                 /*
2080                                  * Attach all mbufs in vpool.ring and put back
2081                                  * into vpool.pool.
2082                                  */
2083                                 for (i = 0;
2084                                 i < RTE_MIN(free_entries,
2085                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2086                                 i++)
2087                                         attach_rxmbuf_zcp(dev);
2088
2089                                 /* Handle guest RX */
2090                                 rx_count = rte_eth_rx_burst(ports[0],
2091                                         vdev->vmdq_rx_q, pkts_burst,
2092                                         MAX_PKT_BURST);
2093
2094                                 if (rx_count) {
2095                                         ret_count = virtio_dev_rx_zcp(dev,
2096                                                         pkts_burst, rx_count);
2097                                         if (enable_stats) {
2098                                                 dev_statistics[dev->device_fh].rx_total
2099                                                         += rx_count;
2100                                                 dev_statistics[dev->device_fh].rx
2101                                                         += ret_count;
2102                                         }
2103                                         while (likely(rx_count)) {
2104                                                 rx_count--;
2105                                                 pktmbuf_detach_zcp(
2106                                                         pkts_burst[rx_count]);
2107                                                 rte_ring_sp_enqueue(
2108                                                         vpool_array[index].ring,
2109                                                         (void *)pkts_burst[rx_count]);
2110                                         }
2111                                 }
2112                         }
2113
2114                         if (likely(!vdev->remove))
2115                                 /* Handle guest TX */
2116                                 virtio_dev_tx_zcp(dev);
2117
2118                         /* Move to the next device in the list */
2119                         dev_ll = dev_ll->next;
2120                 }
2121         }
2122
2123         return 0;
2124 }
2125
2126
2127 /*
2128  * Add an entry to a used linked list. A free entry must first be found
2129  * in the free linked list using get_data_ll_free_entry();
2130  */
2131 static void
2132 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2133         struct virtio_net_data_ll *ll_dev)
2134 {
2135         struct virtio_net_data_ll *ll = *ll_root_addr;
2136
2137         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2138         ll_dev->next = NULL;
2139         rte_compiler_barrier();
2140
2141         /* If ll == NULL then this is the first device. */
2142         if (ll) {
2143                 /* Increment to the tail of the linked list. */
2144                 while (ll->next != NULL)
2145                         ll = ll->next;
2146
2147                 ll->next = ll_dev;
2148         } else {
2149                 *ll_root_addr = ll_dev;
2150         }
2151 }
2152
2153 /*
2154  * Remove an entry from a used linked list. The entry must then be added to
2155  * the free linked list using put_data_ll_free_entry().
2156  */
2157 static void
2158 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2159         struct virtio_net_data_ll *ll_dev,
2160         struct virtio_net_data_ll *ll_dev_last)
2161 {
2162         struct virtio_net_data_ll *ll = *ll_root_addr;
2163
2164         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2165                 return;
2166
2167         if (ll_dev == ll)
2168                 *ll_root_addr = ll_dev->next;
2169         else
2170                 if (likely(ll_dev_last != NULL))
2171                         ll_dev_last->next = ll_dev->next;
2172                 else
2173                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2174 }
2175
2176 /*
2177  * Find and return an entry from the free linked list.
2178  */
2179 static struct virtio_net_data_ll *
2180 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2181 {
2182         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2183         struct virtio_net_data_ll *ll_dev;
2184
2185         if (ll_free == NULL)
2186                 return NULL;
2187
2188         ll_dev = ll_free;
2189         *ll_root_addr = ll_free->next;
2190
2191         return ll_dev;
2192 }
2193
2194 /*
2195  * Place an entry back on to the free linked list.
2196  */
2197 static void
2198 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2199         struct virtio_net_data_ll *ll_dev)
2200 {
2201         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2202
2203         if (ll_dev == NULL)
2204                 return;
2205
2206         ll_dev->next = ll_free;
2207         *ll_root_addr = ll_dev;
2208 }
2209
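/*
 * Illustrative lifecycle sketch (not part of the build) for the linked list
 * helpers above, as used when a device joins and later leaves a data core;
 * 'free_ll' and 'used_ll' stand in for the real list roots:
 *
 *   struct virtio_net_data_ll *e = get_data_ll_free_entry(&free_ll);
 *   if (e != NULL) {
 *           e->vdev = vdev;
 *           add_data_ll_entry(&used_ll, e);      // device goes live
 *           ...
 *           rm_data_ll_entry(&used_ll, e, prev); // device removed
 *           put_data_ll_free_entry(&free_ll, e); // entry recycled
 *   }
 */
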
2210 /*
2211  * Creates a linked list of a given size.
2212  */
2213 static struct virtio_net_data_ll *
2214 alloc_data_ll(uint32_t size)
2215 {
2216         struct virtio_net_data_ll *ll_new;
2217         uint32_t i;
2218
2219         /* Malloc and then chain the linked list. */
2220         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2221         if (ll_new == NULL) {
2222                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2223                 return NULL;
2224         }
2225
2226         for (i = 0; i < size - 1; i++) {
2227                 ll_new[i].vdev = NULL;
2228                 ll_new[i].next = &ll_new[i+1];
2229         }
2230         ll_new[i].next = NULL;
2231
2232         return ll_new;
2233 }
2234
2235 /*
2236  * Create the main linked list along with each individual core's linked list. A used and a free list
2237  * are created to manage entries.
2238  */
2239 static int
2240 init_data_ll (void)
2241 {
2242         int lcore;
2243
2244         RTE_LCORE_FOREACH_SLAVE(lcore) {
2245                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2246                 if (lcore_info[lcore].lcore_ll == NULL) {
2247                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2248                         return -1;
2249                 }
2250
2251                 lcore_info[lcore].lcore_ll->device_num = 0;
2252                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2253                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2254                 if (num_devices % num_switching_cores)
2255                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2256                 else
2257                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2258         }
2259
2260         /* Allocate devices up to a maximum of MAX_DEVICES. */
2261         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2262
2263         return 0;
2264 }
2265
2266 /*
2267  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2268  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2269  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2270  */
2271 static void
2272 destroy_device (volatile struct virtio_net *dev)
2273 {
2274         struct virtio_net_data_ll *ll_lcore_dev_cur;
2275         struct virtio_net_data_ll *ll_main_dev_cur;
2276         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2277         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2278         struct vhost_dev *vdev;
2279         int lcore;
2280
2281         dev->flags &= ~VIRTIO_DEV_RUNNING;
2282
2283         vdev = (struct vhost_dev *)dev->priv;
2284         /* Set the remove flag. */
2285         vdev->remove = 1;
2286         while (vdev->ready != DEVICE_SAFE_REMOVE) {
2287                 rte_pause();
2288         }
2289
2290         /* Search for entry to be removed from lcore ll */
2291         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2292         while (ll_lcore_dev_cur != NULL) {
2293                 if (ll_lcore_dev_cur->vdev == vdev) {
2294                         break;
2295                 } else {
2296                         ll_lcore_dev_last = ll_lcore_dev_cur;
2297                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2298                 }
2299         }
2300
2301         if (ll_lcore_dev_cur == NULL) {
2302                 RTE_LOG(ERR, VHOST_CONFIG,
2303                 "(%"PRIu64") Failed to find the device to be destroyed.\n",
2304                         dev->device_fh);
2305                 return;
2306         }
2307
2308         /* Search for entry to be removed from main ll */
2309         ll_main_dev_cur = ll_root_used;
2310         ll_main_dev_last = NULL;
2311         while (ll_main_dev_cur != NULL) {
2312                 if (ll_main_dev_cur->vdev == vdev) {
2313                         break;
2314                 } else {
2315                         ll_main_dev_last = ll_main_dev_cur;
2316                         ll_main_dev_cur = ll_main_dev_cur->next;
2317                 }
2318         }
2319
2320         /* Remove entries from the lcore and main ll. */
2321         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2322         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2323
2324         /* Set the dev_removal_flag on each lcore. */
2325         RTE_LCORE_FOREACH_SLAVE(lcore) {
2326                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2327         }
2328
2329         /*
2330          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2331          * they can no longer access the device removed from the linked lists and that the devices
2332          * are no longer in use.
2333          */
2334         RTE_LCORE_FOREACH_SLAVE(lcore) {
2335                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2336                         rte_pause();
2337                 }
2338         }
2339
2340         /* Add the entries back to the lcore and main free ll.*/
2341         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2342         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2343
2344         /* Decrement number of device on the lcore. */
2345         lcore_info[vdev->coreid].lcore_ll->device_num--;
2346
2347         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2348
2349         if (zero_copy) {
2350                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2351
2352                 /* Stop the RX queue. */
2353                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2354                         LOG_DEBUG(VHOST_CONFIG,
2355                                 "(%"PRIu64") In destroy_device: Failed to stop "
2356                                 "rx queue:%d\n",
2357                                 dev->device_fh,
2358                                 vdev->vmdq_rx_q);
2359                 }
2360
2361                 LOG_DEBUG(VHOST_CONFIG,
2362                         "(%"PRIu64") in destroy_device: Start putting mbufs in "
2363                         "mempool back to ring for RX queue: %d\n",
2364                         dev->device_fh, vdev->vmdq_rx_q);
2365
2366                 mbuf_destroy_zcp(vpool);
2367
2368                 /* Stop the TX queue. */
2369                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2370                         LOG_DEBUG(VHOST_CONFIG,
2371                                 "(%"PRIu64") In destroy_device: Failed to "
2372                                 "stop tx queue:%d\n",
2373                                 dev->device_fh, vdev->vmdq_rx_q);
2374                 }
2375
2376                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2377
2378                 LOG_DEBUG(VHOST_CONFIG,
2379                         "(%"PRIu64") destroy_device: Start putting mbufs in mempool "
2380                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2381                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2382                         dev->device_fh);
2383
2384                 mbuf_destroy_zcp(vpool);
2385                 rte_free(vdev->regions_hpa);
2386         }
2387         rte_free(vdev);
2388
2389 }
2390
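/*
 * Illustrative note (not from the original source): destroy_device() above
 * uses a request/acknowledge handshake with each data core instead of a lock:
 *
 *   config core: sets dev_removal_flag = REQUEST_DEV_REMOVAL on every lcore,
 *                then spins until each flag reads ACK_DEV_REMOVAL
 *   data core:   notices REQUEST at the top of its loop, after it has
 *                finished walking the list, and writes ACK_DEV_REMOVAL
 *
 * Once every core has acknowledged, no data core can still hold a pointer to
 * the removed entry, so it is safe to recycle it.
 */
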
2391 /*
2392  * Calculate the number of physically contiguous regions within one particular
2393  * region whose vhost virtual address range is contiguous. The particular
2394  * region starts at vva_start, with a size of 'size' bytes.
2395  */
2396 static uint32_t
2397 check_hpa_regions(uint64_t vva_start, uint64_t size)
2398 {
2399         uint32_t i, nregions = 0, page_size = getpagesize();
2400         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2401         if (vva_start % page_size) {
2402                 LOG_DEBUG(VHOST_CONFIG,
2403                 "in check_continuous: vva start(%p) mod page_size(%d) "
2404                         "has remainder\n",
2405                         (void *)(uintptr_t)vva_start, page_size);
2406                 return 0;
2407         }
2408         if (size % page_size) {
2409                 LOG_DEBUG(VHOST_CONFIG,
2410                 "in check_continuous: "
2411                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2412                         size, page_size);
2413                 return 0;
2414         }
2415         for (i = 0; i < size - page_size; i = i + page_size) {
2416                 cur_phys_addr
2417                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2418                 next_phys_addr = rte_mem_virt2phy(
2419                         (void *)(uintptr_t)(vva_start + i + page_size));
2420                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2421                         ++nregions;
2422                         LOG_DEBUG(VHOST_CONFIG,
2423                                 "in check_continuous: hva addr:(%p) is not "
2424                                 "continuous with hva addr:(%p), diff:%d\n",
2425                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2426                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2427                                 + page_size), page_size);
2428                         LOG_DEBUG(VHOST_CONFIG,
2429                                 "in check_continuous: hpa addr:(%p) is not "
2430                                 "continuous with hpa addr:(%p), "
2431                                 "diff:(%"PRIu64")\n",
2432                                 (void *)(uintptr_t)cur_phys_addr,
2433                                 (void *)(uintptr_t)next_phys_addr,
2434                                 (next_phys_addr-cur_phys_addr));
2435                 }
2436         }
2437         return nregions;
2438 }
2439
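/*
 * Illustrative worked example (not from the original source), assuming 4 KiB
 * pages: if rte_mem_virt2phy() maps successive pages of a region to
 *
 *   vva + 0x0000 -> 0x10000, vva + 0x1000 -> 0x11000, vva + 0x2000 -> 0x50000
 *
 * then 0x10000 + 0x1000 == 0x11000 (contiguous), but 0x11000 + 0x1000 !=
 * 0x50000, so check_hpa_regions() counts one break: the region splits into
 * two physically contiguous sub-regions.
 */
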
2440 /*
2441  * Divide each region whose vhost virtual address range is contiguous into a
2442  * few sub-regions, making sure the physical addresses within each sub-region
2443  * are contiguous, and fill the offset (to GPA), size and other information of
2444  * each sub-region into regions_hpa.
2445  */
2446 static uint32_t
2447 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2448 {
2449         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2450         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2451
2452         if (mem_region_hpa == NULL)
2453                 return 0;
2454
2455         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2456                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2457                         virtio_memory->regions[regionidx].address_offset;
2458                 mem_region_hpa[regionidx_hpa].guest_phys_address
2459                         = virtio_memory->regions[regionidx].guest_phys_address;
2460                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2461                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2462                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2463                 LOG_DEBUG(VHOST_CONFIG,
2464                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2465                         regionidx_hpa,
2466                         (void *)(uintptr_t)
2467                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2468                 LOG_DEBUG(VHOST_CONFIG,
2469                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2470                         regionidx_hpa,
2471                         (void *)(uintptr_t)
2472                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2473                 for (i = 0, k = 0;
2474                         i < virtio_memory->regions[regionidx].memory_size -
2475                                 page_size;
2476                         i += page_size) {
2477                         cur_phys_addr = rte_mem_virt2phy(
2478                                         (void *)(uintptr_t)(vva_start + i));
2479                         next_phys_addr = rte_mem_virt2phy(
2480                                         (void *)(uintptr_t)(vva_start +
2481                                         i + page_size));
2482                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2483                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2484                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2485                                         k + page_size;
2486                                 mem_region_hpa[regionidx_hpa].memory_size
2487                                         = k + page_size;
2488                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2489                                         "phys addr end  [%d]:(%p)\n",
2490                                         regionidx_hpa,
2491                                         (void *)(uintptr_t)
2492                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2493                                 LOG_DEBUG(VHOST_CONFIG,
2494                                         "in fill_hpa_regions: guest phys addr "
2495                                         "size [%d]:(%p)\n",
2496                                         regionidx_hpa,
2497                                         (void *)(uintptr_t)
2498                                         (mem_region_hpa[regionidx_hpa].memory_size));
2499                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2500                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2501                                 ++regionidx_hpa;
2502                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2503                                         next_phys_addr -
2504                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2505                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2506                                         " phys addr start[%d]:(%p)\n",
2507                                         regionidx_hpa,
2508                                         (void *)(uintptr_t)
2509                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2510                                 LOG_DEBUG(VHOST_CONFIG,
2511                                         "in fill_hpa_regions: host  phys addr "
2512                                         "start[%d]:(%p)\n",
2513                                         regionidx_hpa,
2514                                         (void *)(uintptr_t)
2515                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2516                                 k = 0;
2517                         } else {
2518                                 k += page_size;
2519                         }
2520                 }
2521                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2522                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2523                         + k + page_size;
2524                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2525                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2526                         "[%d]:(%p)\n", regionidx_hpa,
2527                         (void *)(uintptr_t)
2528                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2529                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2530                         "[%d]:(%p)\n", regionidx_hpa,
2531                         (void *)(uintptr_t)
2532                         (mem_region_hpa[regionidx_hpa].memory_size));
2533                 ++regionidx_hpa;
2534         }
2535         return regionidx_hpa;
2536 }
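
/*
 * A minimal sketch, not used by this example, of how a consumer of the
 * filled regions_hpa table would translate a guest physical address into
 * a host physical address: find the sub-region containing the GPA and
 * apply its offset. The helper name is hypothetical; it only assumes the
 * fields populated by fill_hpa_memory_regions() above.
 */
static inline uint64_t
gpa_to_hpa_sketch(struct virtio_memory_regions_hpa *regions,
	uint32_t nregions, uint64_t guest_pa)
{
	uint32_t i;

	for (i = 0; i < nregions; i++) {
		if (guest_pa >= regions[i].guest_phys_address &&
			guest_pa < regions[i].guest_phys_address_end)
			return guest_pa + regions[i].host_phys_addr_offset;
	}
	return 0; /* GPA not covered by any sub-region */
}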
2537
2538 /*
 * A new device is added to a data core. The device is first added to the
 * main linked list and then allocated to a specific data core.
2541  */
2542 static int
new_device(struct virtio_net *dev)
2544 {
2545         struct virtio_net_data_ll *ll_dev;
2546         int lcore, core_add = 0;
2547         uint32_t device_num_min = num_devices;
2548         struct vhost_dev *vdev;
2549         uint32_t regionidx;
2550
2551         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2552         if (vdev == NULL) {
2553                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2554                         dev->device_fh);
2555                 return -1;
2556         }
2557         vdev->dev = dev;
2558         dev->priv = vdev;
2559
2560         if (zero_copy) {
2561                 vdev->nregions_hpa = dev->mem->nregions;
2562                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2563                         vdev->nregions_hpa
2564                                 += check_hpa_regions(
2565                                         dev->mem->regions[regionidx].guest_phys_address
2566                                         + dev->mem->regions[regionidx].address_offset,
2567                                         dev->mem->regions[regionidx].memory_size);
2568
2569                 }
2570
2571                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2572                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2573                         RTE_CACHE_LINE_SIZE);
2574                 if (vdev->regions_hpa == NULL) {
2575                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2576                         rte_free(vdev);
2577                         return -1;
2578                 }
2579
2580
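		/*
		 * nregions_hpa was sized above via check_hpa_regions(); the
		 * fill step must produce exactly that many sub-regions or
		 * the host-physical mapping is inconsistent.
		 */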
2581                 if (fill_hpa_memory_regions(
2582                         vdev->regions_hpa, dev->mem
2583                         ) != vdev->nregions_hpa) {
2584
2585                         RTE_LOG(ERR, VHOST_CONFIG,
2586                                 "hpa memory regions number mismatch: "
2587                                 "[%d]\n", vdev->nregions_hpa);
2588                         rte_free(vdev->regions_hpa);
2589                         rte_free(vdev);
2590                         return -1;
2591                 }
2592         }
2593
2594
2595         /* Add device to main ll */
2596         ll_dev = get_data_ll_free_entry(&ll_root_free);
2597         if (ll_dev == NULL) {
2598                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2599                         "of %d devices per core has been reached\n",
2600                         dev->device_fh, num_devices);
2601                 if (vdev->regions_hpa)
2602                         rte_free(vdev->regions_hpa);
2603                 rte_free(vdev);
2604                 return -1;
2605         }
2606         ll_dev->vdev = vdev;
2607         add_data_ll_entry(&ll_root_used, ll_dev);
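	/*
	 * Each device owns one VMDq RX queue: device_fh selects the pool
	 * (pools are queues_per_pool queues apart) and vmdq_queue_base skips
	 * any queues that precede the VMDq range.
	 */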
2608         vdev->vmdq_rx_q
2609                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2610
2611         if (zero_copy) {
2612                 uint32_t index = vdev->vmdq_rx_q;
2613                 uint32_t count_in_ring, i;
2614                 struct mbuf_table *tx_q;
2615
2616                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2617
2618                 LOG_DEBUG(VHOST_CONFIG,
2619                         "(%"PRIu64") in new_device: mbuf count in mempool "
2620                         "before attach is: %d\n",
2621                         dev->device_fh,
2622                         rte_mempool_count(vpool_array[index].pool));
		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") in new_device: mbuf count in ring "
			"before attach is: %d\n",
			dev->device_fh, count_in_ring);
2627
2628                 /*
		 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2630                  */
2631                 for (i = 0; i < count_in_ring; i++)
2632                         attach_rxmbuf_zcp(dev);
2633
2634                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2635                         "mempool after attach is: %d\n",
2636                         dev->device_fh,
2637                         rte_mempool_count(vpool_array[index].pool));
2638                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
			"ring after attach is: %d\n",
2640                         dev->device_fh,
2641                         rte_ring_count(vpool_array[index].ring));
2642
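		/*
		 * The zero-copy TX queue is paired one-to-one with the
		 * device's VMDq RX queue, so the queue number is reused
		 * as txq_id.
		 */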
2643                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2644                 tx_q->txq_id = vdev->vmdq_rx_q;
2645
2646                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2647                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2648
2649                         LOG_DEBUG(VHOST_CONFIG,
2650                                 "(%"PRIu64") In new_device: Failed to start "
2651                                 "tx queue:%d\n",
2652                                 dev->device_fh, vdev->vmdq_rx_q);
2653
2654                         mbuf_destroy_zcp(vpool);
2655                         rte_free(vdev->regions_hpa);
2656                         rte_free(vdev);
2657                         return -1;
2658                 }
2659
2660                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2661                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2662
2663                         LOG_DEBUG(VHOST_CONFIG,
2664                                 "(%"PRIu64") In new_device: Failed to start "
2665                                 "rx queue:%d\n",
2666                                 dev->device_fh, vdev->vmdq_rx_q);
2667
2668                         /* Stop the TX queue. */
2669                         if (rte_eth_dev_tx_queue_stop(ports[0],
2670                                 vdev->vmdq_rx_q) != 0) {
2671                                 LOG_DEBUG(VHOST_CONFIG,
2672                                         "(%"PRIu64") In new_device: Failed to "
2673                                         "stop tx queue:%d\n",
2674                                         dev->device_fh, vdev->vmdq_rx_q);
2675                         }
2676
2677                         mbuf_destroy_zcp(vpool);
2678                         rte_free(vdev->regions_hpa);
2679                         rte_free(vdev);
2680                         return -1;
2681                 }
2682
2683         }
2684
	/* Reset the ready flag. */
2686         vdev->ready = DEVICE_MAC_LEARNING;
2687         vdev->remove = 0;
2688
	/* Find the data core with the fewest devices to host the new device. */
2690         RTE_LCORE_FOREACH_SLAVE(lcore) {
2691                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2692                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2693                         core_add = lcore;
2694                 }
2695         }
2696         /* Add device to lcore ll */
2697         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2698         if (ll_dev == NULL) {
2699                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2700                 vdev->ready = DEVICE_SAFE_REMOVE;
2701                 destroy_device(dev);
2702                 if (vdev->regions_hpa)
2703                         rte_free(vdev->regions_hpa);
2704                 rte_free(vdev);
2705                 return -1;
2706         }
2707         ll_dev->vdev = vdev;
2708         vdev->coreid = core_add;
2709
2710         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2711
2712         /* Initialize device stats */
2713         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2714
2715         /* Disable notifications. */
2716         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2717         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2718         lcore_info[vdev->coreid].lcore_ll->device_num++;
2719         dev->flags |= VIRTIO_DEV_RUNNING;
2720
2721         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2722
2723         return 0;
2724 }
2725
/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops = {
	.new_device = new_device,
	.destroy_device = destroy_device,
};
2735
/*
 * This thread wakes up periodically to print statistics if the user has
 * enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
2743         struct virtio_net_data_ll *dev_ll;
2744         uint64_t tx_dropped, rx_dropped;
2745         uint64_t tx, tx_total, rx, rx_total;
2746         uint32_t device_fh;
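	/* VT100/ANSI escape sequences: ESC [2J clears the screen and
	 * ESC [1;1H moves the cursor to the top-left corner. */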
2747         const char clr[] = { 27, '[', '2', 'J', '\0' };
2748         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2749
	while (1) {
2751                 sleep(enable_stats);
2752
2753                 /* Clear screen and move to top left */
2754                 printf("%s%s", clr, top_left);
2755
2756                 printf("\nDevice statistics ====================================");
2757
2758                 dev_ll = ll_root_used;
2759                 while (dev_ll != NULL) {
2760                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2761                         tx_total = dev_statistics[device_fh].tx_total;
2762                         tx = dev_statistics[device_fh].tx;
2763                         tx_dropped = tx_total - tx;
2764                         if (zero_copy == 0) {
2765                                 rx_total = rte_atomic64_read(
2766                                         &dev_statistics[device_fh].rx_total_atomic);
2767                                 rx = rte_atomic64_read(
2768                                         &dev_statistics[device_fh].rx_atomic);
2769                         } else {
2770                                 rx_total = dev_statistics[device_fh].rx_total;
2771                                 rx = dev_statistics[device_fh].rx;
2772                         }
2773                         rx_dropped = rx_total - rx;
2774
2775                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2776                                         "\nTX total:            %"PRIu64""
2777                                         "\nTX dropped:          %"PRIu64""
2778                                         "\nTX successful:               %"PRIu64""
2779                                         "\nRX total:            %"PRIu64""
2780                                         "\nRX dropped:          %"PRIu64""
2781                                         "\nRX successful:               %"PRIu64"",
2782                                         device_fh,
2783                                         tx_total,
2784                                         tx_dropped,
2785                                         tx,
2786                                         rx_total,
2787                                         rx_dropped,
2788                                         rx);
2789
2790                         dev_ll = dev_ll->next;
2791                 }
2792                 printf("\n======================================================\n");
2793         }
2794 }
2795
2796 static void
2797 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2798         char *ring_name, uint32_t nb_mbuf)
2799 {
2800         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2801         vpool_array[index].pool
2802                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2803                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2804                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2805                 rte_pktmbuf_init, NULL, socket, 0);
2806         if (vpool_array[index].pool != NULL) {
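		/*
		 * rte_ring sizes must be a power of two and one slot stays
		 * unused internally, hence rte_align32pow2(nb_mbuf + 1).
		 */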
2807                 vpool_array[index].ring
2808                         = rte_ring_create(ring_name,
2809                                 rte_align32pow2(nb_mbuf + 1),
2810                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2811                 if (likely(vpool_array[index].ring != NULL)) {
2812                         LOG_DEBUG(VHOST_CONFIG,
2813                                 "in setup_mempool_tbl: mbuf count in "
2814                                 "mempool is: %d\n",
2815                                 rte_mempool_count(vpool_array[index].pool));
2816                         LOG_DEBUG(VHOST_CONFIG,
2817                                 "in setup_mempool_tbl: mbuf count in "
2818                                 "ring   is: %d\n",
2819                                 rte_ring_count(vpool_array[index].ring));
2820                 } else {
2821                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2822                                 ring_name);
2823                 }
2824
		/* The usable buffer size excludes the mbuf headroom. */
2826                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2827         } else {
2828                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2829         }
2830 }
2831
2832
/*
 * Main function: performs initialisation and launches the per-lcore
 * functions. The CUSE device is also registered here to handle IOCTLs.
 */
2837 int
2838 main(int argc, char *argv[])
2839 {
2840         struct rte_mempool *mbuf_pool = NULL;
2841         unsigned lcore_id, core_id = 0;
2842         unsigned nb_ports, valid_num_ports;
2843         int ret;
2844         uint8_t portid;
2845         uint16_t queue_id;
2846         static pthread_t tid;
2847
2848         /* init EAL */
2849         ret = rte_eal_init(argc, argv);
2850         if (ret < 0)
2851                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
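	/* rte_eal_init() consumed `ret` leading arguments; skip past them. */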
2852         argc -= ret;
2853         argv += ret;
2854
2855         /* parse app arguments */
2856         ret = us_vhost_parse_args(argc, argv);
2857         if (ret < 0)
2858                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2859
	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
2863
	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Set the number of switching cores available: the master lcore runs
	 * the vhost session while every other enabled lcore switches traffic. */
	num_switching_cores = rte_lcore_count() - 1;
2869
2870         /* Get the number of physical ports. */
2871         nb_ports = rte_eth_dev_count();
2872         if (nb_ports > RTE_MAX_ETHPORTS)
2873                 nb_ports = RTE_MAX_ETHPORTS;
2874
	/*
	 * Update the global variable num_ports and the global array ports[],
	 * and derive valid_num_ports from the number of ports in the system.
	 */
2879         valid_num_ports = check_ports_num(nb_ports);
2880
	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2884                 return -1;
2885         }
2886
2887         if (zero_copy == 0) {
2888                 /* Create the mbuf pool. */
2889                 mbuf_pool = rte_mempool_create(
2890                                 "MBUF_POOL",
2891                                 NUM_MBUFS_PER_PORT
2892                                 * valid_num_ports,
2893                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2894                                 sizeof(struct rte_pktmbuf_pool_private),
2895                                 rte_pktmbuf_pool_init, NULL,
2896                                 rte_pktmbuf_init, NULL,
2897                                 rte_socket_id(), 0);
2898                 if (mbuf_pool == NULL)
2899                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2900
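		/*
		 * Non-zero-copy mode: every vpool entry aliases the single
		 * shared mbuf pool created above.
		 */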
2901                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2902                         vpool_array[queue_id].pool = mbuf_pool;
2903
2904                 if (vm2vm_mode == VM2VM_HARDWARE) {
2905                         /* Enable VT loop back to let L2 switch to do it. */
2906                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2907                         LOG_DEBUG(VHOST_CONFIG,
2908                                 "Enable loop back for L2 switch in vmdq.\n");
2909                 }
2910         } else {
2911                 uint32_t nb_mbuf;
2912                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2913                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2914
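		/*
		 * Size each per-queue RX pool for one mbuf per RX descriptor
		 * plus, per switching core, the mempool cache and one burst
		 * of slack.
		 */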
2915                 nb_mbuf = num_rx_descriptor
2916                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2917                         + num_switching_cores * MAX_PKT_BURST;
2918
2919                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2920                         snprintf(pool_name, sizeof(pool_name),
2921                                 "rxmbuf_pool_%u", queue_id);
2922                         snprintf(ring_name, sizeof(ring_name),
2923                                 "rxmbuf_ring_%u", queue_id);
2924                         setup_mempool_tbl(rte_socket_id(), queue_id,
2925                                 pool_name, ring_name, nb_mbuf);
2926                 }
2927
2928                 nb_mbuf = num_tx_descriptor
2929                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2930                                 + num_switching_cores * MAX_PKT_BURST;
2931
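		/*
		 * TX pools occupy the upper half of vpool_array (index
		 * queue_id + MAX_QUEUES); the RX pools above use the
		 * lower half.
		 */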
2932                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2933                         snprintf(pool_name, sizeof(pool_name),
2934                                 "txmbuf_pool_%u", queue_id);
2935                         snprintf(ring_name, sizeof(ring_name),
2936                                 "txmbuf_ring_%u", queue_id);
2937                         setup_mempool_tbl(rte_socket_id(),
2938                                 (queue_id + MAX_QUEUES),
2939                                 pool_name, ring_name, nb_mbuf);
2940                 }
2941
2942                 if (vm2vm_mode == VM2VM_HARDWARE) {
2943                         /* Enable VT loop back to let L2 switch to do it. */
2944                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2945                         LOG_DEBUG(VHOST_CONFIG,
2946                                 "Enable loop back for L2 switch in vmdq.\n");
2947                 }
2948         }
2949         /* Set log level. */
2950         rte_set_log_level(LOG_LEVEL);
2951
2952         /* initialize all ports */
2953         for (portid = 0; portid < nb_ports; portid++) {
2954                 /* skip ports that are not enabled */
2955                 if ((enabled_port_mask & (1 << portid)) == 0) {
2956                         RTE_LOG(INFO, VHOST_PORT,
2957                                 "Skipping disabled port %d\n", portid);
2958                         continue;
2959                 }
2960                 if (port_init(portid) != 0)
2961                         rte_exit(EXIT_FAILURE,
2962                                 "Cannot initialize network ports\n");
2963         }
2964
2965         /* Initialise all linked lists. */
2966         if (init_data_ll() == -1)
2967                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2968
2969         /* Initialize device stats */
2970         memset(&dev_statistics, 0, sizeof(dev_statistics));
2971
	/* Enable stats if the user option is set. */
	if (enable_stats)
		pthread_create(&tid, NULL, print_stats, NULL);
2975
2976         /* Launch all data cores. */
2977         if (zero_copy == 0) {
2978                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2979                         rte_eal_remote_launch(switch_worker,
2980                                 mbuf_pool, lcore_id);
2981                 }
2982         } else {
2983                 uint32_t count_in_mempool, index, i;
2984                 for (index = 0; index < 2*MAX_QUEUES; index++) {
2985                         /* For all RX and TX queues. */
2986                         count_in_mempool
2987                                 = rte_mempool_count(vpool_array[index].pool);
2988
2989                         /*
			 * Transfer all unattached mbufs from vpool.pool
			 * to vpool.ring.
2992                          */
2993                         for (i = 0; i < count_in_mempool; i++) {
2994                                 struct rte_mbuf *mbuf
2995                                         = __rte_mbuf_raw_alloc(
2996                                                 vpool_array[index].pool);
2997                                 rte_ring_sp_enqueue(vpool_array[index].ring,
2998                                                 (void *)mbuf);
2999                         }
3000
3001                         LOG_DEBUG(VHOST_CONFIG,
3002                                 "in main: mbuf count in mempool at initial "
3003                                 "is: %d\n", count_in_mempool);
3004                         LOG_DEBUG(VHOST_CONFIG,
				"in main: mbuf count in ring at initial is: "
				"%d\n",
3007                                 rte_ring_count(vpool_array[index].ring));
3008                 }
3009
3010                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3011                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3012                                 lcore_id);
3013         }
3014
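	/*
	 * Without mergeable RX buffers every packet must fit into a single
	 * receive buffer, so mask the feature bit before guests negotiate it.
	 */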
3015         if (mergeable == 0)
3016                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3017
3018         /* Register CUSE device to handle IOCTLs. */
3019         ret = rte_vhost_driver_register((char *)&dev_basename);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3022
3023         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3024
3025         /* Start CUSE session. */
3026         rte_vhost_driver_session_start();
	return 0;
}
3030