examples/vhost: fix strict aliasing
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 512
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +       \
65                             (num_switching_cores * MAX_PKT_BURST) +          \
66                             (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
67                             (num_switching_cores * MBUF_CACHE_SIZE))
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
71
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and
75  * vhost uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP       RTE_MBUF_DEFAULT_DATAROOM
78 #define MBUF_DATA_SIZE_ZCP              RTE_MBUF_DEFAULT_BUF_SIZE
79 #define MBUF_CACHE_SIZE_ZCP 0
80
81 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
82 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
83
84 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
85 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
86
87 #define JUMBO_FRAME_MAX_SIZE    0x2600
88
89 /* State of virtio device. */
90 #define DEVICE_MAC_LEARNING 0
91 #define DEVICE_RX                       1
92 #define DEVICE_SAFE_REMOVE      2
93
94 /* Config_core_flag status definitions. */
95 #define REQUEST_DEV_REMOVAL 1
96 #define ACK_DEV_REMOVAL 0
97
98 /* Configurable number of RX/TX ring descriptors */
99 #define RTE_TEST_RX_DESC_DEFAULT 1024
100 #define RTE_TEST_TX_DESC_DEFAULT 512
101
102 /*
103  * These two macros need refining for the legacy and DPDK-based front ends:
104  * take the max vring avail descriptors/entries from the guest minus
105  * MAX_PKT_BURST, then round up to a power of 2.
106  */
107 /*
108  * For the legacy front end: 128 descriptors,
109  * half for the virtio header, the other half for the mbuf.
110  */
111 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
112 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
113
114 /* Get the first 4 bytes in the mbuf headroom. */
115 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
116                 + sizeof(struct rte_mbuf)))
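/*
 * Dereferencing the headroom through a cast uint32_t pointer, as
 * MBUF_HEADROOM_UINT32 does above, violates C strict-aliasing rules and
 * can misbehave at higher optimization levels. A minimal aliasing-safe
 * sketch (assumes <string.h>; the helper name is illustrative, not part
 * of this file):
 */
static inline uint32_t
mbuf_headroom_read_u32(const struct rte_mbuf *mbuf)
{
        uint32_t v;

        /* memcpy is the portable way to reinterpret raw bytes. */
        memcpy(&v, (const uint8_t *)mbuf + sizeof(struct rte_mbuf), sizeof(v));
        return v;
}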
117
118 /* true if x is a power of 2 */
119 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
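/*
 * POWEROF2 only tests the property; the descriptor counts mentioned
 * above still need rounding up. A sketch of that adjustment
 * (hypothetical helper, not used elsewhere in this file):
 */
static inline uint32_t
roundup_pow2(uint32_t x)
{
        uint32_t p = 1;

        /* Double until we reach or exceed x; x == 0 yields 1. */
        while (p < x)
                p <<= 1;
        return p;
}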
120
121 #define INVALID_PORT_ID 0xFF
122
123 /* Max number of devices. Limited by vmdq. */
124 #define MAX_DEVICES 64
125
126 /* Size of buffers used for snprintfs. */
127 #define MAX_PRINT_BUFF 6072
128
129 /* Maximum character device basename size. */
130 #define MAX_BASENAME_SZ 10
131
132 /* Maximum long option length for option parsing. */
133 #define MAX_LONG_OPT_SZ 64
134
135 /* Used to compare MAC addresses. */
136 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
137
138 /* Number of descriptors per cacheline. */
139 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
140
141 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
142
143 /* mask of enabled ports */
144 static uint32_t enabled_port_mask = 0;
145
146 /* Promiscuous mode */
147 static uint32_t promiscuous;
148
149 /*Number of switching cores enabled*/
150 static uint32_t num_switching_cores = 0;
151
152 /* Number of devices/queues to support */
153 static uint32_t num_queues = 0;
154 static uint32_t num_devices;
155
156 /*
157  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
158  * descriptors. Disabled by default.
159  */
160 static uint32_t zero_copy;
161 static int mergeable;
162
163 /* Do VLAN strip on the host, enabled by default */
164 static uint32_t vlan_strip = 1;
165
166 /* Number of RX/TX descriptors to use */
167 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
168 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
169
170 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
171 #define MAX_RING_DESC 4096
172
173 struct vpool {
174         struct rte_mempool *pool;
175         struct rte_ring *ring;
176         uint32_t buf_size;
177 } vpool_array[MAX_QUEUES+MAX_QUEUES];
178
179 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
180 typedef enum {
181         VM2VM_DISABLED = 0,
182         VM2VM_SOFTWARE = 1,
183         VM2VM_HARDWARE = 2,
184         VM2VM_LAST
185 } vm2vm_type;
186 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
187
188 /* The type of host physical address translated from guest physical address. */
189 typedef enum {
190         PHYS_ADDR_CONTINUOUS = 0,
191         PHYS_ADDR_CROSS_SUBREG = 1,
192         PHYS_ADDR_INVALID = 2,
193         PHYS_ADDR_LAST
194 } hpa_type;
195
196 /* Enable stats. */
197 static uint32_t enable_stats = 0;
198 /* Enable retries on RX. */
199 static uint32_t enable_retry = 1;
200 /* Specify the timeout (in microseconds) between retries on RX. */
201 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
202 /* Specify the number of retries on RX. */
203 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
204
205 /* Character device basename. Can be set by user. */
206 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
207
208 /* Empty VMDQ configuration structure. Filled in programmatically. */
209 static struct rte_eth_conf vmdq_conf_default = {
210         .rxmode = {
211                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
212                 .split_hdr_size = 0,
213                 .header_split   = 0, /**< Header Split disabled */
214                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
215                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
216                 /*
217                  * This is necessary for 1G NICs such as the I350; it
218                  * fixes a bug where IPv4 forwarding in the guest could not
219                  * forward packets from one virtio dev to another virtio dev.
220                  */
221                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
222                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
223                 .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
224         },
225
226         .txmode = {
227                 .mq_mode = ETH_MQ_TX_NONE,
228         },
229         .rx_adv_conf = {
230                 /*
231                  * should be overridden separately in code with
232                  * appropriate values
233                  */
234                 .vmdq_rx_conf = {
235                         .nb_queue_pools = ETH_8_POOLS,
236                         .enable_default_pool = 0,
237                         .default_pool = 0,
238                         .nb_pool_maps = 0,
239                         .pool_map = {{0, 0},},
240                 },
241         },
242 };
243
244 static unsigned lcore_ids[RTE_MAX_LCORE];
245 static uint8_t ports[RTE_MAX_ETHPORTS];
246 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
247 static uint16_t num_pf_queues, num_vmdq_queues;
248 static uint16_t vmdq_pool_base, vmdq_queue_base;
249 static uint16_t queues_per_pool;
250
251 static const uint16_t external_pkt_default_vlan_tag = 2000;
252 const uint16_t vlan_tags[] = {
253         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
254         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
255         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
256         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
257         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
258         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
259         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
260         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
261 };
262
263 /* ethernet addresses of ports */
264 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
265
266 /* heads for the main used and free linked lists for the data path. */
267 static struct virtio_net_data_ll *ll_root_used = NULL;
268 static struct virtio_net_data_ll *ll_root_free = NULL;
269
270 /* Array of data core structures containing information on individual core linked lists. */
271 static struct lcore_info lcore_info[RTE_MAX_LCORE];
272
273 /* Used for queueing bursts of TX packets. */
274 struct mbuf_table {
275         unsigned len;
276         unsigned txq_id;
277         struct rte_mbuf *m_table[MAX_PKT_BURST];
278 };
279
280 /* TX queue for each data core. */
281 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
282
283 /* TX queue for each virtio device for zero copy. */
284 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
285
286 /* Vlan header struct used to insert vlan tags on TX. */
287 struct vlan_ethhdr {
288         unsigned char   h_dest[ETH_ALEN];
289         unsigned char   h_source[ETH_ALEN];
290         __be16          h_vlan_proto;
291         __be16          h_vlan_TCI;
292         __be16          h_vlan_encapsulated_proto;
293 };
294
295 /* IPv4 Header */
296 struct ipv4_hdr {
297         uint8_t  version_ihl;           /**< version and header length */
298         uint8_t  type_of_service;       /**< type of service */
299         uint16_t total_length;          /**< length of packet */
300         uint16_t packet_id;             /**< packet ID */
301         uint16_t fragment_offset;       /**< fragmentation offset */
302         uint8_t  time_to_live;          /**< time to live */
303         uint8_t  next_proto_id;         /**< protocol ID */
304         uint16_t hdr_checksum;          /**< header checksum */
305         uint32_t src_addr;              /**< source address */
306         uint32_t dst_addr;              /**< destination address */
307 } __attribute__((__packed__));
308
309 /* Header lengths. */
310 #define VLAN_HLEN       4
311 #define VLAN_ETH_HLEN   18
312
313 /* Per-device statistics struct */
314 struct device_statistics {
315         uint64_t tx_total;
316         rte_atomic64_t rx_total_atomic;
317         uint64_t rx_total;
318         uint64_t tx;
319         rte_atomic64_t rx_atomic;
320         uint64_t rx;
321 } __rte_cache_aligned;
322 struct device_statistics dev_statistics[MAX_DEVICES];
323
324 /*
325  * Builds up the correct configuration for VMDQ VLAN pool map
326  * according to the pool & queue limits.
327  */
328 static inline int
329 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
330 {
331         struct rte_eth_vmdq_rx_conf conf;
332         struct rte_eth_vmdq_rx_conf *def_conf =
333                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
334         unsigned i;
335
336         memset(&conf, 0, sizeof(conf));
337         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
338         conf.nb_pool_maps = num_devices;
339         conf.enable_loop_back = def_conf->enable_loop_back;
340         conf.rx_mode = def_conf->rx_mode;
341
342         for (i = 0; i < conf.nb_pool_maps; i++) {
343                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
344                 conf.pool_map[i].pools = (1UL << i);
345         }
346
347         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
348         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
349                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
350         return 0;
351 }
352
353 /*
354  * Validate the device number against the max pool number obtained from
355  * dev_info. If the device number is invalid, print an error message and
356  * return -1. Each device must have its own pool.
357  */
358 static inline int
359 validate_num_devices(uint32_t max_nb_devices)
360 {
361         if (num_devices > max_nb_devices) {
362                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
363                 return -1;
364         }
365         return 0;
366 }
367
368 /*
369  * Initialises a given port using global settings and with the rx buffers
370  * coming from the mbuf_pool passed as a parameter.
371  */
372 static inline int
373 port_init(uint8_t port)
374 {
375         struct rte_eth_dev_info dev_info;
376         struct rte_eth_conf port_conf;
377         struct rte_eth_rxconf *rxconf;
378         struct rte_eth_txconf *txconf;
379         int16_t rx_rings, tx_rings;
380         uint16_t rx_ring_size, tx_ring_size;
381         int retval;
382         uint16_t q;
383
384         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
385         rte_eth_dev_info_get(port, &dev_info);
386
387         if (dev_info.max_rx_queues > MAX_QUEUES) {
388                 rte_exit(EXIT_FAILURE,
389                         "please define MAX_QUEUES to be no less than %u in %s\n",
390                         dev_info.max_rx_queues, __FILE__);
391         }
392
393         rxconf = &dev_info.default_rxconf;
394         txconf = &dev_info.default_txconf;
395         rxconf->rx_drop_en = 1;
396
397         /* Enable vlan offload */
398         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
399
400         /*
401          * Zero copy defers queue RX/TX start to the time when guest
402          * finishes its startup and packet buffers from that guest are
403          * available.
404          */
405         if (zero_copy) {
406                 rxconf->rx_deferred_start = 1;
407                 rxconf->rx_drop_en = 0;
408                 txconf->tx_deferred_start = 1;
409         }
410
411         /* Configure the number of supported virtio devices based on VMDQ limits. */
412         num_devices = dev_info.max_vmdq_pools;
413
414         if (zero_copy) {
415                 rx_ring_size = num_rx_descriptor;
416                 tx_ring_size = num_tx_descriptor;
417                 tx_rings = dev_info.max_tx_queues;
418         } else {
419                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
420                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
421                 tx_rings = (uint16_t)rte_lcore_count();
422         }
423
424         retval = validate_num_devices(MAX_DEVICES);
425         if (retval < 0)
426                 return retval;
427
428         /* Get port configuration. */
429         retval = get_eth_conf(&port_conf, num_devices);
430         if (retval < 0)
431                 return retval;
432         /* NIC queues are divided into pf queues and vmdq queues.  */
433         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
434         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
435         num_vmdq_queues = num_devices * queues_per_pool;
436         num_queues = num_pf_queues + num_vmdq_queues;
437         vmdq_queue_base = dev_info.vmdq_queue_base;
438         vmdq_pool_base  = dev_info.vmdq_pool_base;
439         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
440                 num_pf_queues, num_devices, queues_per_pool);
441
442         if (port >= rte_eth_dev_count()) return -1;
443
444         rx_rings = (uint16_t)dev_info.max_rx_queues;
445         /* Configure ethernet device. */
446         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
447         if (retval != 0)
448                 return retval;
449
450         /* Setup the queues. */
451         for (q = 0; q < rx_rings; q ++) {
452                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
453                                                 rte_eth_dev_socket_id(port),
454                                                 rxconf,
455                                                 vpool_array[q].pool);
456                 if (retval < 0)
457                         return retval;
458         }
459         for (q = 0; q < tx_rings; q ++) {
460                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
461                                                 rte_eth_dev_socket_id(port),
462                                                 txconf);
463                 if (retval < 0)
464                         return retval;
465         }
466
467         /* Start the device. */
468         retval  = rte_eth_dev_start(port);
469         if (retval < 0) {
470                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
471                 return retval;
472         }
473
474         if (promiscuous)
475                 rte_eth_promiscuous_enable(port);
476
477         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481                         (unsigned)port,
482                         vmdq_ports_eth_addr[port].addr_bytes[0],
483                         vmdq_ports_eth_addr[port].addr_bytes[1],
484                         vmdq_ports_eth_addr[port].addr_bytes[2],
485                         vmdq_ports_eth_addr[port].addr_bytes[3],
486                         vmdq_ports_eth_addr[port].addr_bytes[4],
487                         vmdq_ports_eth_addr[port].addr_bytes[5]);
488
489         return 0;
490 }
491
492 /*
493  * Set character device basename.
494  */
495 static int
496 us_vhost_parse_basename(const char *q_arg)
497 {
498         /* Parse the basename string. */
499
500         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
501                 return -1;
502         else
503                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
504
505         return 0;
506 }
507
508 /*
509  * Parse the portmask provided at run time.
510  */
511 static int
512 parse_portmask(const char *portmask)
513 {
514         char *end = NULL;
515         unsigned long pm;
516
517         errno = 0;
518
519         /* parse hexadecimal string */
520         pm = strtoul(portmask, &end, 16);
521         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
522                 return -1;
523
524         if (pm == 0)
525                 return -1;
526
527         return pm;
528
529 }
530
531 /*
532  * Parse num options at run time.
533  */
534 static int
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536 {
537         char *end = NULL;
538         unsigned long num;
539
540         errno = 0;
541
542         /* parse unsigned int string */
543         num = strtoul(q_arg, &end, 10);
544         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545                 return -1;
546
547         if (num > max_valid_value)
548                 return -1;
549
550         return num;
551
552 }
553
554 /*
555  * Display usage
556  */
557 static void
558 us_vhost_usage(const char *prgname)
559 {
560         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
561         "               --vm2vm [0|1|2]\n"
562         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563         "               --dev-basename <name>\n"
564         "               --nb-devices ND\n"
565         "               -p PORTMASK: Set mask for ports to be used by application\n"
566         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
568         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
569         "               --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
570         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
572         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
573         "               --dev-basename: The basename to be used for the character device.\n"
574         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
575                         "zero copy\n"
576         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
577                         "used only when zero copy is enabled.\n"
578         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
579                         "used only when zero copy is enabled.\n",
580                prgname);
581 }
582
583 /*
584  * Parse the arguments given in the command line of the application.
585  */
586 static int
587 us_vhost_parse_args(int argc, char **argv)
588 {
589         int opt, ret;
590         int option_index;
591         unsigned i;
592         const char *prgname = argv[0];
593         static struct option long_option[] = {
594                 {"vm2vm", required_argument, NULL, 0},
595                 {"rx-retry", required_argument, NULL, 0},
596                 {"rx-retry-delay", required_argument, NULL, 0},
597                 {"rx-retry-num", required_argument, NULL, 0},
598                 {"mergeable", required_argument, NULL, 0},
599                 {"vlan-strip", required_argument, NULL, 0},
600                 {"stats", required_argument, NULL, 0},
601                 {"dev-basename", required_argument, NULL, 0},
602                 {"zero-copy", required_argument, NULL, 0},
603                 {"rx-desc-num", required_argument, NULL, 0},
604                 {"tx-desc-num", required_argument, NULL, 0},
605                 {NULL, 0, 0, 0},
606         };
607
608         /* Parse command line */
609         while ((opt = getopt_long(argc, argv, "p:P",
610                         long_option, &option_index)) != EOF) {
611                 switch (opt) {
612                 /* Portmask */
613                 case 'p':
614                         enabled_port_mask = parse_portmask(optarg);
615                         if (enabled_port_mask == 0) {
616                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
617                                 us_vhost_usage(prgname);
618                                 return -1;
619                         }
620                         break;
621
622                 case 'P':
623                         promiscuous = 1;
624                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
625                                 ETH_VMDQ_ACCEPT_BROADCAST |
626                                 ETH_VMDQ_ACCEPT_MULTICAST;
627                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
628
629                         break;
630
631                 case 0:
632                         /* Enable/disable vm2vm comms. */
633                         if (!strncmp(long_option[option_index].name, "vm2vm",
634                                 MAX_LONG_OPT_SZ)) {
635                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
636                                 if (ret == -1) {
637                                         RTE_LOG(INFO, VHOST_CONFIG,
638                                                 "Invalid argument for "
639                                                 "vm2vm [0|1|2]\n");
640                                         us_vhost_usage(prgname);
641                                         return -1;
642                                 } else {
643                                         vm2vm_mode = (vm2vm_type)ret;
644                                 }
645                         }
646
647                         /* Enable/disable retries on RX. */
648                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
649                                 ret = parse_num_opt(optarg, 1);
650                                 if (ret == -1) {
651                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
652                                         us_vhost_usage(prgname);
653                                         return -1;
654                                 } else {
655                                         enable_retry = ret;
656                                 }
657                         }
658
659                         /* Specify the retry delay time (in microseconds) on RX. */
660                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
661                                 ret = parse_num_opt(optarg, INT32_MAX);
662                                 if (ret == -1) {
663                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
664                                         us_vhost_usage(prgname);
665                                         return -1;
666                                 } else {
667                                         burst_rx_delay_time = ret;
668                                 }
669                         }
670
671                         /* Specify the number of retries on RX. */
672                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
673                                 ret = parse_num_opt(optarg, INT32_MAX);
674                                 if (ret == -1) {
675                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
676                                         us_vhost_usage(prgname);
677                                         return -1;
678                                 } else {
679                                         burst_rx_retry_num = ret;
680                                 }
681                         }
682
683                         /* Enable/disable RX mergeable buffers. */
684                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
685                                 ret = parse_num_opt(optarg, 1);
686                                 if (ret == -1) {
687                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
688                                         us_vhost_usage(prgname);
689                                         return -1;
690                                 } else {
691                                         mergeable = !!ret;
692                                         if (ret) {
693                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
694                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
695                                                         = JUMBO_FRAME_MAX_SIZE;
696                                         }
697                                 }
698                         }
699
700                         /* Enable/disable RX VLAN strip on host. */
701                         if (!strncmp(long_option[option_index].name,
702                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
703                                 ret = parse_num_opt(optarg, 1);
704                                 if (ret == -1) {
705                                         RTE_LOG(INFO, VHOST_CONFIG,
706                                                 "Invalid argument for VLAN strip [0|1]\n");
707                                         us_vhost_usage(prgname);
708                                         return -1;
709                                 } else {
710                                         vlan_strip = !!ret;
711                                         vmdq_conf_default.rxmode.hw_vlan_strip =
712                                                 vlan_strip;
713                                 }
714                         }
715
716                         /* Enable/disable stats. */
717                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
718                                 ret = parse_num_opt(optarg, INT32_MAX);
719                                 if (ret == -1) {
720                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
721                                         us_vhost_usage(prgname);
722                                         return -1;
723                                 } else {
724                                         enable_stats = ret;
725                                 }
726                         }
727
728                         /* Set character device basename. */
729                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
730                                 if (us_vhost_parse_basename(optarg) == -1) {
731                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
732                                         us_vhost_usage(prgname);
733                                         return -1;
734                                 }
735                         }
736
737                         /* Enable/disable rx/tx zero copy. */
738                         if (!strncmp(long_option[option_index].name,
739                                 "zero-copy", MAX_LONG_OPT_SZ)) {
740                                 ret = parse_num_opt(optarg, 1);
741                                 if (ret == -1) {
742                                         RTE_LOG(INFO, VHOST_CONFIG,
743                                                 "Invalid argument"
744                                                 " for zero-copy [0|1]\n");
745                                         us_vhost_usage(prgname);
746                                         return -1;
747                                 } else
748                                         zero_copy = ret;
749                         }
750
751                         /* Specify the descriptor number on RX. */
752                         if (!strncmp(long_option[option_index].name,
753                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
754                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
755                                 if ((ret == -1) || (!POWEROF2(ret))) {
756                                         RTE_LOG(INFO, VHOST_CONFIG,
757                                         "Invalid argument for rx-desc-num [0-N], "
758                                         "power of 2 required.\n");
759                                         us_vhost_usage(prgname);
760                                         return -1;
761                                 } else {
762                                         num_rx_descriptor = ret;
763                                 }
764                         }
765
766                         /* Specify the descriptor number on TX. */
767                         if (!strncmp(long_option[option_index].name,
768                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
769                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
770                                 if ((ret == -1) || (!POWEROF2(ret))) {
771                                         RTE_LOG(INFO, VHOST_CONFIG,
772                                         "Invalid argument for tx-desc-num [0-N], "
773                                         "power of 2 required.\n");
774                                         us_vhost_usage(prgname);
775                                         return -1;
776                                 } else {
777                                         num_tx_descriptor = ret;
778                                 }
779                         }
780
781                         break;
782
783                         /* Invalid option - print options. */
784                 default:
785                         us_vhost_usage(prgname);
786                         return -1;
787                 }
788         }
789
790         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
791                 if (enabled_port_mask & (1 << i))
792                         ports[num_ports++] = (uint8_t)i;
793         }
794
795         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
796                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
797                         "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
798                 return -1;
799         }
800
801         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
802                 RTE_LOG(INFO, VHOST_PORT,
803                         "Vhost zero copy doesn't support software vm2vm, "
804                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
805                 return -1;
806         }
807
808         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
809                 RTE_LOG(INFO, VHOST_PORT,
810                         "Vhost zero copy doesn't support jumbo frames, "
811                         "please specify '--mergeable 0' to disable the "
812                         "mergeable feature.\n");
813                 return -1;
814         }
815
816         return 0;
817 }
818
819 /*
820  * Update the global variable num_ports and the ports array according to the
821  * number of system ports, and return the number of valid ports.
822  */
823 static unsigned check_ports_num(unsigned nb_ports)
824 {
825         unsigned valid_num_ports = num_ports;
826         unsigned portid;
827
828         if (num_ports > nb_ports) {
829                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
830                         num_ports, nb_ports);
831                 num_ports = nb_ports;
832         }
833
834         for (portid = 0; portid < num_ports; portid ++) {
835                 if (ports[portid] >= nb_ports) {
836                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
837                                 ports[portid], (nb_ports - 1));
838                         ports[portid] = INVALID_PORT_ID;
839                         valid_num_ports--;
840                 }
841         }
842         return valid_num_ports;
843 }
844
845 /*
846  * Macro to print out packet contents. Wrapped in debug define so that the
847  * data path is not affected when debug is disabled.
848  */
849 #ifdef DEBUG
850 #define PRINT_PACKET(device, addr, size, header) do {                 \
851         char *pkt_addr = (char *)(addr);                              \
852         unsigned int index;                                           \
853         char packet[MAX_PRINT_BUFF];                                  \
854                                                                       \
855         if ((header))                                                 \
856                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
857         else                                                          \
858                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
859         for (index = 0; index < (size); index++) {                    \
860                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
861                         "%02hhx ", pkt_addr[index]);                  \
862         }                                                             \
863         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
864                                                                       \
865         LOG_DEBUG(VHOST_DATA, "%s", packet);                          \
866 } while (0)
867 #else
868 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
869 #endif
870
871 /*
872  * Function to convert guest physical addresses to vhost physical addresses.
873  * This is used to convert virtio buffer addresses.
874  */
875 static inline uint64_t __attribute__((always_inline))
876 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
877         uint32_t buf_len, hpa_type *addr_type)
878 {
879         struct virtio_memory_regions_hpa *region;
880         uint32_t regionidx;
881         uint64_t vhost_pa = 0;
882
883         *addr_type = PHYS_ADDR_INVALID;
884
885         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
886                 region = &vdev->regions_hpa[regionidx];
887                 if ((guest_pa >= region->guest_phys_address) &&
888                         (guest_pa <= region->guest_phys_address_end)) {
889                         vhost_pa = region->host_phys_addr_offset + guest_pa;
890                         if (likely((guest_pa + buf_len - 1)
891                                 <= region->guest_phys_address_end))
892                                 *addr_type = PHYS_ADDR_CONTINUOUS;
893                         else
894                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
895                         break;
896                 }
897         }
898
899         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
900                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
901                 (void *)(uintptr_t)vhost_pa);
902
903         return vhost_pa;
904 }
905
906 /*
907  * Compares a packet destination MAC address to a device MAC address.
908  */
909 static inline int __attribute__((always_inline))
910 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
911 {
912         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
913 }
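/*
 * The uint64_t casts above read eight bytes through a differently-typed
 * pointer from six-byte ether_addr structures, which both over-reads and
 * breaks strict-aliasing rules. An aliasing-safe sketch comparing just
 * the six address bytes (assumes <string.h>; illustrative only):
 */
static inline int
ether_addr_cmp_safe(const struct ether_addr *ea, const struct ether_addr *eb)
{
        /* Compare only the six MAC address bytes. */
        return memcmp(ea->addr_bytes, eb->addr_bytes, ETHER_ADDR_LEN) == 0;
}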
914
915 /*
916  * This function learns the MAC address of the device and registers it, along
917  * with a VLAN tag, with the VMDQ.
918  */
919 static int
920 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
921 {
922         struct ether_hdr *pkt_hdr;
923         struct virtio_net_data_ll *dev_ll;
924         struct virtio_net *dev = vdev->dev;
925         int i, ret;
926
927         /* Learn MAC address of guest device from packet */
928         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
929
930         dev_ll = ll_root_used;
931
932         while (dev_ll != NULL) {
933                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
934                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
935                         return -1;
936                 }
937                 dev_ll = dev_ll->next;
938         }
939
940         for (i = 0; i < ETHER_ADDR_LEN; i++)
941                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
942
943         /* vlan_tag currently uses the device_id. */
944         vdev->vlan_tag = vlan_tags[dev->device_fh];
945
946         /* Print out VMDQ registration info. */
947         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
948                 dev->device_fh,
949                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
950                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
951                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
952                 vdev->vlan_tag);
953
954         /* Register the MAC address. */
955         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
956                                 (uint32_t)dev->device_fh + vmdq_pool_base);
957         if (ret)
958                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
959                                         dev->device_fh);
960
961         /* Enable stripping of the vlan tag as we handle routing. */
962         if (vlan_strip)
963                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
964                         (uint16_t)vdev->vmdq_rx_q, 1);
965
966         /* Set device as ready for RX. */
967         vdev->ready = DEVICE_RX;
968
969         return 0;
970 }
971
972 /*
973  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
974  * queue before disabling RX on the device.
975  */
976 static inline void
977 unlink_vmdq(struct vhost_dev *vdev)
978 {
979         unsigned i = 0;
980         unsigned rx_count;
981         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
982
983         if (vdev->ready == DEVICE_RX) {
984                 /*clear MAC and VLAN settings*/
985                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
986                 for (i = 0; i < 6; i++)
987                         vdev->mac_address.addr_bytes[i] = 0;
988
989                 vdev->vlan_tag = 0;
990
991                 /*Clear out the receive buffers*/
992                 rx_count = rte_eth_rx_burst(ports[0],
993                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
994
995                 while (rx_count) {
996                         for (i = 0; i < rx_count; i++)
997                                 rte_pktmbuf_free(pkts_burst[i]);
998
999                         rx_count = rte_eth_rx_burst(ports[0],
1000                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1001                 }
1002
1003                 vdev->ready = DEVICE_MAC_LEARNING;
1004         }
1005 }
1006
1007 /*
1008  * Check if the packet destination MAC address is for a local device. If so then put
1009  * the packet on that device's RX queue. If not then return.
1010  */
1011 static inline int __attribute__((always_inline))
1012 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1013 {
1014         struct virtio_net_data_ll *dev_ll;
1015         struct ether_hdr *pkt_hdr;
1016         uint64_t ret = 0;
1017         struct virtio_net *dev = vdev->dev;
1018         struct virtio_net *tdev; /* destination virtio device */
1019
1020         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1021
1022         /*get the used devices list*/
1023         dev_ll = ll_root_used;
1024
1025         while (dev_ll != NULL) {
1026                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1027                                           &dev_ll->vdev->mac_address)) {
1028
1029                         /* Drop the packet if the TX packet is destined for the TX device. */
1030                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1031                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1032                                                         dev->device_fh);
1033                                 return 0;
1034                         }
1035                         tdev = dev_ll->vdev->dev;
1036
1037
1038                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1039
1040                         if (unlikely(dev_ll->vdev->remove)) {
1041                                 /*drop the packet if the device is marked for removal*/
1042                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1043                         } else {
1044                                 /*send the packet to the local virtio device*/
1045                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1046                                 if (enable_stats) {
1047                                         rte_atomic64_add(
1048                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1049                                         1);
1050                                         rte_atomic64_add(
1051                                         &dev_statistics[tdev->device_fh].rx_atomic,
1052                                         ret);
1053                                         dev_statistics[dev->device_fh].tx_total++;
1054                                         dev_statistics[dev->device_fh].tx += ret;
1055                                 }
1056                         }
1057
1058                         return 0;
1059                 }
1060                 dev_ll = dev_ll->next;
1061         }
1062
1063         return -1;
1064 }
1065
1066 /*
1067  * Check if the destination MAC of a packet belongs to a local VM,
1068  * and if so get its VLAN tag and offset.
1069  */
1070 static inline int __attribute__((always_inline))
1071 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1072         uint32_t *offset, uint16_t *vlan_tag)
1073 {
1074         struct virtio_net_data_ll *dev_ll = ll_root_used;
1075         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1076
1077         while (dev_ll != NULL) {
1078                 if ((dev_ll->vdev->ready == DEVICE_RX)
1079                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1080                 &dev_ll->vdev->mac_address)) {
1081                         /*
1082                          * Drop the packet if the TX packet is
1083                          * destined for the TX device.
1084                          */
1085                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1086                                 LOG_DEBUG(VHOST_DATA,
1087                                 "(%"PRIu64") TX: Source and destination"
1088                                 " MAC addresses are the same. Dropping "
1089                                 "packet.\n",
1090                                 dev_ll->vdev->dev->device_fh);
1091                                 return -1;
1092                         }
1093
1094                         /*
1095                          * HW VLAN strip will reduce the packet length
1096                          * by the length of the VLAN tag, so we need to
1097                          * restore the packet length by adding it back.
1098                          */
1099                         *offset = VLAN_HLEN;
1100                         *vlan_tag =
1101                         (uint16_t)
1102                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1103
1104                         LOG_DEBUG(VHOST_DATA,
1105                         "(%"PRIu64") TX: pkt to local VM device id:"
1106                         "(%"PRIu64") vlan tag: %d.\n",
1107                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1108                         (int)*vlan_tag);
1109
1110                         break;
1111                 }
1112                 dev_ll = dev_ll->next;
1113         }
1114         return 0;
1115 }
1116
1117 /*
1118  * This function routes the TX packet to the correct interface. This may be a local device
1119  * or the physical port.
1120  */
1121 static inline void __attribute__((always_inline))
1122 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1123 {
1124         struct mbuf_table *tx_q;
1125         struct rte_mbuf **m_table;
1126         unsigned len, ret, offset = 0;
1127         const uint16_t lcore_id = rte_lcore_id();
1128         struct virtio_net *dev = vdev->dev;
1129         struct ether_hdr *nh;
1130
1131         /*check if destination is local VM*/
1132         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1133                 rte_pktmbuf_free(m);
1134                 return;
1135         }
1136
1137         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1138                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1139                         rte_pktmbuf_free(m);
1140                         return;
1141                 }
1142         }
1143
1144         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1145
1146         /*Add packet to the port tx queue*/
1147         tx_q = &lcore_tx_queue[lcore_id];
1148         len = tx_q->len;
1149
1150         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1151         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1152                 /* Guest has inserted the vlan tag. */
1153                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1154                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1155                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1156                         (vh->vlan_tci != vlan_tag_be))
1157                         vh->vlan_tci = vlan_tag_be;
1158         } else {
1159                 m->ol_flags = PKT_TX_VLAN_PKT;
1160
1161                 /*
1162                  * Find the right seg to adjust the data len when offset is
1163                  * bigger than the tailroom size.
1164                  */
1165                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1166                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1167                                 m->data_len += offset;
1168                         else {
1169                                 struct rte_mbuf *seg = m;
1170
1171                                 while ((seg->next != NULL) &&
1172                                         (offset > rte_pktmbuf_tailroom(seg)))
1173                                         seg = seg->next;
1174
1175                                 seg->data_len += offset;
1176                         }
1177                         m->pkt_len += offset;
1178                 }
1179
1180                 m->vlan_tci = vlan_tag;
1181         }
1182
1183         tx_q->m_table[len] = m;
1184         len++;
1185         if (enable_stats) {
1186                 dev_statistics[dev->device_fh].tx_total++;
1187                 dev_statistics[dev->device_fh].tx++;
1188         }
1189
1190         if (unlikely(len == MAX_PKT_BURST)) {
1191                 m_table = (struct rte_mbuf **)tx_q->m_table;
1192                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1193                 /* Free any buffers not handled by TX and update the port stats. */
1194                 if (unlikely(ret < len)) {
1195                         do {
1196                                 rte_pktmbuf_free(m_table[ret]);
1197                         } while (++ret < len);
1198                 }
1199
1200                 len = 0;
1201         }
1202
1203         tx_q->len = len;
1204         return;
1205 }
1206 /*
1207  * This function is called by each data core. It handles all RX/TX registered with the
1208  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1209  * with all devices in the main linked list.
1210  */
1211 static int
1212 switch_worker(void *arg)
1213 {
1214         struct rte_mempool *mbuf_pool = arg;
1215         struct virtio_net *dev = NULL;
1216         struct vhost_dev *vdev = NULL;
1217         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1218         struct virtio_net_data_ll *dev_ll;
1219         struct mbuf_table *tx_q;
1220         volatile struct lcore_ll_info *lcore_ll;
1221         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1222         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1223         unsigned ret, i;
1224         const uint16_t lcore_id = rte_lcore_id();
1225         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1226         uint16_t rx_count = 0;
1227         uint16_t tx_count;
1228         uint32_t retry = 0;
1229
1230         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1231         lcore_ll = lcore_info[lcore_id].lcore_ll;
1232         prev_tsc = 0;
1233
1234         tx_q = &lcore_tx_queue[lcore_id];
1235         for (i = 0; i < num_cores; i ++) {
1236                 if (lcore_ids[i] == lcore_id) {
1237                         tx_q->txq_id = i;
1238                         break;
1239                 }
1240         }
1241
1242         while(1) {
1243                 cur_tsc = rte_rdtsc();
1244                 /*
1245                  * TX burst queue drain
1246                  */
1247                 diff_tsc = cur_tsc - prev_tsc;
1248                 if (unlikely(diff_tsc > drain_tsc)) {
1249
1250                         if (tx_q->len) {
1251                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);
1252
1253                                 /*Tx any packets in the queue*/
1254                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1255                                                                            (struct rte_mbuf **)tx_q->m_table,
1256                                                                            (uint16_t)tx_q->len);
1257                                 if (unlikely(ret < tx_q->len)) {
1258                                         do {
1259                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1260                                         } while (++ret < tx_q->len);
1261                                 }
1262
1263                                 tx_q->len = 0;
1264                         }
1265
1266                         prev_tsc = cur_tsc;
1267
1268                 }
1269
1270                 rte_prefetch0(lcore_ll->ll_root_used);
1271                 /*
1272                  * If requested, inform the configuration core that we have
1273                  * exited the linked list and that no devices are in use.
1274                  */
1275                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1276                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1277
1278                 /*
1279                  * Process devices
1280                  */
1281                 dev_ll = lcore_ll->ll_root_used;
1282
1283                 while (dev_ll != NULL) {
1284                         /*get virtio device ID*/
1285                         vdev = dev_ll->vdev;
1286                         dev = vdev->dev;
1287
1288                         if (unlikely(vdev->remove)) {
1289                                 dev_ll = dev_ll->next;
1290                                 unlink_vmdq(vdev);
1291                                 vdev->ready = DEVICE_SAFE_REMOVE;
1292                                 continue;
1293                         }
1294                         if (likely(vdev->ready == DEVICE_RX)) {
1295                                 /*Handle guest RX*/
1296                                 rx_count = rte_eth_rx_burst(ports[0],
1297                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1298
1299                                 if (rx_count) {
1300                                         /*
1301                                          * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1302                                          * Note that MAX_PKT_BURST must be less than the virtio queue size.
1303                                          */
1304                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1305                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1306                                                         rte_delay_us(burst_rx_delay_time);
1307                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1308                                                                 break;
1309                                                 }
1310                                         }
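                                        /*
                                         * Worst case this retry loop adds
                                         * burst_rx_retry_num * burst_rx_delay_time
                                         * microseconds of latency; packets that
                                         * still do not fit in the vring are
                                         * dropped when freed below.
                                         */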
1311                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1312                                         if (enable_stats) {
1313                                                 rte_atomic64_add(
1314                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1315                                                 rx_count);
1316                                                 rte_atomic64_add(
1317                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1318                                         }
1319                                         while (likely(rx_count)) {
1320                                                 rx_count--;
1321                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1322                                         }
1323
1324                                 }
1325                         }
1326
1327                         if (likely(!vdev->remove)) {
1328                                 /* Handle guest TX*/
1329                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1330                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1331                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1332                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1333                                                 while (tx_count)
1334                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1335                                         }
1336                                 }
1337                                 while (tx_count)
1338                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1339                         }
1340
1341                         /*move to the next device in the list*/
1342                         dev_ll = dev_ll->next;
1343                 }
1344         }
1345
1346         return 0;
1347 }
1348
1349 /*
1350  * This function gets the number of available ring entries for zero copy rx.
1351  * Only one thread will call this function for a particular virtio device,
1352  * so it is designed as a non-thread-safe function.
1353  */
1354 static inline uint32_t __attribute__((always_inline))
1355 get_available_ring_num_zcp(struct virtio_net *dev)
1356 {
1357         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1358         uint16_t avail_idx;
1359
1360         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1361         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1362 }
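/*
 * Note that the uint16_t subtraction above is wraparound-safe. Illustrative
 * example: if avail->idx has wrapped to 3 while last_used_idx_res is 65533,
 * (uint16_t)(3 - 65533) == 6, which is the correct number of new entries.
 */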
1363
1364 /*
1365  * This function gets the available ring index for zero copy rx;
1366  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1367  * Only one thread will call this function for a particular virtio device,
1368  * so it is designed as a non-thread-safe function.
1369  */
1370 static inline uint32_t __attribute__((always_inline))
1371 get_available_ring_index_zcp(struct virtio_net *dev,
1372         uint16_t *res_base_idx, uint32_t count)
1373 {
1374         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1375         uint16_t avail_idx;
1376         uint32_t retry = 0;
1377         uint16_t free_entries;
1378
1379         *res_base_idx = vq->last_used_idx_res;
1380         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1381         free_entries = (avail_idx - *res_base_idx);
1382
1383         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1384                         "avail idx: %d, "
1385                         "res base idx:%d, free entries:%d\n",
1386                         dev->device_fh, avail_idx, *res_base_idx,
1387                         free_entries);
1388
1389         /*
1390          * If retry is enabled and the queue is full then we wait
1391          * and retry to avoid packet loss.
1392          */
1393         if (enable_retry && unlikely(count > free_entries)) {
1394                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1395                         rte_delay_us(burst_rx_delay_time);
1396                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1397                         free_entries = (avail_idx - *res_base_idx);
1398                         if (count <= free_entries)
1399                                 break;
1400                 }
1401         }
1402
1403         /*check that we have enough buffers*/
1404         if (unlikely(count > free_entries))
1405                 count = free_entries;
1406
1407         if (unlikely(count == 0)) {
1408                 LOG_DEBUG(VHOST_DATA,
1409                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1410                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1411                         dev->device_fh, avail_idx,
1412                         *res_base_idx, free_entries);
1413                 return 0;
1414         }
1415
1416         vq->last_used_idx_res = *res_base_idx + count;
1417
1418         return count;
1419 }
1420
1421 /*
1422  * This function puts a descriptor back on the used list.
1423  */
1424 static inline void __attribute__((always_inline))
1425 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1426 {
1427         uint16_t res_cur_idx = vq->last_used_idx;
1428         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1429         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1430         rte_compiler_barrier();
1431         *(volatile uint16_t *)&vq->used->idx += 1;
1432         vq->last_used_idx += 1;
1433
1434         /* Kick the guest if necessary. */
1435         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1436                 eventfd_write(vq->callfd, (eventfd_t)1);
1437 }
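/*
 * Ordering note: the compiler barrier above ensures the used-ring entry
 * (id/len) is fully written before the used index is published through the
 * volatile store, so the guest can never observe an advanced idx that still
 * points at a stale entry. This assumes the CPU preserves store order, as
 * x86 does; hence a compiler barrier suffices here.
 */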
1438
1439 /*
1440  * This function gets an available descriptor from the virtio vring and an
1441  * unattached mbuf from vpool->ring, and then attaches them together. It must
1442  * adjust the offsets of buff_addr and phys_addr according to the PMD
1443  * implementation, otherwise frame data may land at the wrong mbuf offset.
1444  */
1445 static inline void __attribute__((always_inline))
1446 attach_rxmbuf_zcp(struct virtio_net *dev)
1447 {
1448         uint16_t res_base_idx, desc_idx;
1449         uint64_t buff_addr, phys_addr;
1450         struct vhost_virtqueue *vq;
1451         struct vring_desc *desc;
1452         void *obj = NULL;
1453         struct rte_mbuf *mbuf;
1454         struct vpool *vpool;
1455         hpa_type addr_type;
1456         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1457
1458         vpool = &vpool_array[vdev->vmdq_rx_q];
1459         vq = dev->virtqueue[VIRTIO_RXQ];
1460
1461         do {
1462                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1463                                 1) != 1))
1464                         return;
1465                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1466
1467                 desc = &vq->desc[desc_idx];
1468                 if (desc->flags & VRING_DESC_F_NEXT) {
1469                         desc = &vq->desc[desc->next];
1470                         buff_addr = gpa_to_vva(dev, desc->addr);
1471                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1472                                         &addr_type);
1473                 } else {
1474                         buff_addr = gpa_to_vva(dev,
1475                                         desc->addr + vq->vhost_hlen);
1476                         phys_addr = gpa_to_hpa(vdev,
1477                                         desc->addr + vq->vhost_hlen,
1478                                         desc->len, &addr_type);
1479                 }
1480
1481                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1482                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1483                                 " address found when attaching RX frame buffer"
1484                                 " address!\n", dev->device_fh);
1485                         put_desc_to_used_list_zcp(vq, desc_idx);
1486                         continue;
1487                 }
1488
1489                 /*
1490                  * Check if the frame buffer address from guest crosses
1491                  * sub-region or not.
1492                  */
1493                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1494                         RTE_LOG(ERR, VHOST_DATA,
1495                                 "(%"PRIu64") Frame buffer address crossing a "
1496                                 "sub-region found when attaching RX frame "
1497                                 "buffer address!\n",
1498                                 dev->device_fh);
1499                         put_desc_to_used_list_zcp(vq, desc_idx);
1500                         continue;
1501                 }
1502         } while (unlikely(phys_addr == 0));
1503
1504         rte_ring_sc_dequeue(vpool->ring, &obj);
1505         mbuf = obj;
1506         if (unlikely(mbuf == NULL)) {
1507                 LOG_DEBUG(VHOST_DATA,
1508                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1509                         "ring_sc_dequeue fail.\n",
1510                         dev->device_fh);
1511                 put_desc_to_used_list_zcp(vq, desc_idx);
1512                 return;
1513         }
1514
1515         if (unlikely(vpool->buf_size > desc->len)) {
1516                 LOG_DEBUG(VHOST_DATA,
1517                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1518                         "length (%d) of descriptor idx %d is less than the "
1519                         "required room size %d\n",
1520                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1521                 put_desc_to_used_list_zcp(vq, desc_idx);
1522                 rte_ring_sp_enqueue(vpool->ring, obj);
1523                 return;
1524         }
1525
1526         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1527         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1528         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1529         mbuf->data_len = desc->len;
1530         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
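        /*
         * Stash the guest descriptor index in the (otherwise unused) mbuf
         * headroom via the example's MBUF_HEADROOM_UINT32() helper, so that
         * the descriptor can be returned to the used ring when this mbuf
         * comes back from the NIC.
         */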
1531
1532         LOG_DEBUG(VHOST_DATA,
1533                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1534                 "descriptor idx:%d\n",
1535                 dev->device_fh, res_base_idx, desc_idx);
1536
1537         __rte_mbuf_raw_free(mbuf);
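        /*
         * The raw free here is deliberate: it returns the mbuf header, which
         * now points into guest memory, to vpool->pool, and (as this example
         * is wired up) that mempool backs the VMDQ RX queue, so the PMD will
         * receive the next frame directly into the guest buffer.
         */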
1538
1539         return;
1540 }
1541
1542 /*
1543  * Detach an attached packet mbuf -
1544  *  - restore original mbuf address and length values.
1545  *  - reset pktmbuf data and data_len to their default values.
1546  *  All other fields of the given packet mbuf will be left intact.
1547  *
1548  * @param m
1549  *   The attached packet mbuf.
1550  */
1551 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1552 {
1553         const struct rte_mempool *mp = m->pool;
1554         void *buf = rte_mbuf_to_baddr(m);
1555         uint32_t buf_ofs;
1556         uint32_t buf_len = mp->elt_size - sizeof(*m);
1557         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1558
1559         m->buf_addr = buf;
1560         m->buf_len = (uint16_t)buf_len;
1561
1562         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1563                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1564         m->data_off = buf_ofs;
1565
1566         m->data_len = 0;
1567 }
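/*
 * After detaching, buf_addr/buf_physaddr again reference the buffer embedded
 * in the mempool element itself (the area directly after the rte_mbuf header,
 * hence the sizeof(*m) offsets above), so the mbuf can be safely recycled.
 */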
1568
1569 /*
1570  * This function is called after packets have been transmitted. It fetches
1571  * each mbuf from vpool->pool, detaches it and puts it back into vpool->ring.
1572  * It also updates the used index and kicks the guest if necessary.
1573  */
1574 static inline uint32_t __attribute__((always_inline))
1575 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1576 {
1577         struct rte_mbuf *mbuf;
1578         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1579         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1580         uint32_t index = 0;
1581         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1582
1583         LOG_DEBUG(VHOST_DATA,
1584                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1585                 "clean is: %d\n",
1586                 dev->device_fh, mbuf_count);
1587         LOG_DEBUG(VHOST_DATA,
1588                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1589                 "clean  is : %d\n",
1590                 dev->device_fh, rte_ring_count(vpool->ring));
1591
1592         for (index = 0; index < mbuf_count; index++) {
1593                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1594                 if (likely(MBUF_EXT_MEM(mbuf)))
1595                         pktmbuf_detach_zcp(mbuf);
1596                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1597
1598                 /* Update used index buffer information. */
1599                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1600                 vq->used->ring[used_idx].len = 0;
1601
1602                 used_idx = (used_idx + 1) & (vq->size - 1);
1603         }
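        /*
         * The "& (vq->size - 1)" masking above relies on the vring size being
         * a power of two, which the virtio spec requires. Illustrative
         * example: with vq->size == 256, (260 & (256 - 1)) == 4.
         */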
1604
1605         LOG_DEBUG(VHOST_DATA,
1606                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1607                 "clean is: %d\n",
1608                 dev->device_fh, rte_mempool_count(vpool->pool));
1609         LOG_DEBUG(VHOST_DATA,
1610                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1611                 "clean  is : %d\n",
1612                 dev->device_fh, rte_ring_count(vpool->ring));
1613         LOG_DEBUG(VHOST_DATA,
1614                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1615                 "vq->last_used_idx:%d\n",
1616                 dev->device_fh, vq->last_used_idx);
1617
1618         vq->last_used_idx += mbuf_count;
1619
1620         LOG_DEBUG(VHOST_DATA,
1621                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1622                 "vq->last_used_idx:%d\n",
1623                 dev->device_fh, vq->last_used_idx);
1624
1625         rte_compiler_barrier();
1626
1627         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1628
1629         /* Kick guest if required. */
1630         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1631                 eventfd_write(vq->callfd, (eventfd_t)1);
1632
1633         return 0;
1634 }
1635
1636 /*
1637  * This function is called when a virtio device is destroyed.
1638  * It fetches each mbuf from vpool->pool, detaches it and puts it back into vpool->ring.
1639  */
1640 static void mbuf_destroy_zcp(struct vpool *vpool)
1641 {
1642         struct rte_mbuf *mbuf = NULL;
1643         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1644
1645         LOG_DEBUG(VHOST_CONFIG,
1646                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1647                 "mbuf_destroy_zcp is: %d\n",
1648                 mbuf_count);
1649         LOG_DEBUG(VHOST_CONFIG,
1650                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1651                 "mbuf_destroy_zcp  is : %d\n",
1652                 rte_ring_count(vpool->ring));
1653
1654         for (index = 0; index < mbuf_count; index++) {
1655                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1656                 if (likely(mbuf != NULL)) {
1657                         if (likely(MBUF_EXT_MEM(mbuf)))
1658                                 pktmbuf_detach_zcp(mbuf);
1659                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1660                 }
1661         }
1662
1663         LOG_DEBUG(VHOST_CONFIG,
1664                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1665                 "mbuf_destroy_zcp is: %d\n",
1666                 rte_mempool_count(vpool->pool));
1667         LOG_DEBUG(VHOST_CONFIG,
1668                 "in mbuf_destroy_zcp: mbuf count in ring after "
1669                 "mbuf_destroy_zcp is : %d\n",
1670                 rte_ring_count(vpool->ring));
1671 }
1672
1673 /*
1674  * This function enqueues received packets into the guest RX virtqueue (zero copy) and updates the used ring.
1675  */
1676 static inline uint32_t __attribute__((always_inline))
1677 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1678         uint32_t count)
1679 {
1680         struct vhost_virtqueue *vq;
1681         struct vring_desc *desc;
1682         struct rte_mbuf *buff;
1683         /* The virtio_hdr is initialised to 0. */
1684         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1685                 = {{0, 0, 0, 0, 0, 0}, 0};
1686         uint64_t buff_hdr_addr = 0;
1687         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1688         uint32_t head_idx, packet_success = 0;
1689         uint16_t res_cur_idx;
1690
1691         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1692
1693         if (count == 0)
1694                 return 0;
1695
1696         vq = dev->virtqueue[VIRTIO_RXQ];
1697         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1698
1699         res_cur_idx = vq->last_used_idx;
1700         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1701                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1702
1703         /* Retrieve all of the head indexes first to avoid caching issues. */
1704         for (head_idx = 0; head_idx < count; head_idx++)
1705                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1706
1707         /*Prefetch descriptor index. */
1708         rte_prefetch0(&vq->desc[head[packet_success]]);
1709
1710         while (packet_success != count) {
1711                 /* Get descriptor from available ring */
1712                 desc = &vq->desc[head[packet_success]];
1713
1714                 buff = pkts[packet_success];
1715                 LOG_DEBUG(VHOST_DATA,
1716                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1717                         "pkt[%d] descriptor idx: %d\n",
1718                         dev->device_fh, packet_success,
1719                         MBUF_HEADROOM_UINT32(buff));
1720
1721                 PRINT_PACKET(dev,
1722                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1723                         + RTE_PKTMBUF_HEADROOM),
1724                         rte_pktmbuf_data_len(buff), 0);
1725
1726                 /* Buffer address translation for virtio header. */
1727                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1728                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1729
1730                 /*
1731                  * If the descriptors are chained the header and data are
1732                  * placed in separate buffers.
1733                  */
1734                 if (desc->flags & VRING_DESC_F_NEXT) {
1735                         desc->len = vq->vhost_hlen;
1736                         desc = &vq->desc[desc->next];
1737                         desc->len = rte_pktmbuf_data_len(buff);
1738                 } else {
1739                         desc->len = packet_len;
1740                 }
1741
1742                 /* Update used ring with desc information */
1743                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1744                         = head[packet_success];
1745                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1746                         = packet_len;
1747                 res_cur_idx++;
1748                 packet_success++;
1749
1750                 /* A header is required per buffer. */
1751                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1752                         (const void *)&virtio_hdr, vq->vhost_hlen);
1753
1754                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1755
1756                 if (likely(packet_success < count)) {
1757                         /* Prefetch descriptor index. */
1758                         rte_prefetch0(&vq->desc[head[packet_success]]);
1759                 }
1760         }
1761
1762         rte_compiler_barrier();
1763
1764         LOG_DEBUG(VHOST_DATA,
1765                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1766                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1767                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1768
1769         *(volatile uint16_t *)&vq->used->idx += count;
1770         vq->last_used_idx += count;
1771
1772         LOG_DEBUG(VHOST_DATA,
1773                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1774                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1775                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1776
1777         /* Kick the guest if necessary. */
1778         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1779                 eventfd_write(vq->callfd, (eventfd_t)1);
1780
1781         return count;
1782 }
1783
1784 /*
1785  * This function routes the TX packet to the correct interface.
1786  * This may be a local device or the physical port.
1787  */
1788 static inline void __attribute__((always_inline))
1789 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1790         uint32_t desc_idx, uint8_t need_copy)
1791 {
1792         struct mbuf_table *tx_q;
1793         struct rte_mbuf **m_table;
1794         void *obj = NULL;
1795         struct rte_mbuf *mbuf;
1796         unsigned len, ret, offset = 0;
1797         struct vpool *vpool;
1798         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1799         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1800
1801         /*Add packet to the port tx queue*/
1802         tx_q = &tx_queue_zcp[vmdq_rx_q];
1803         len = tx_q->len;
1804
1805         /* Allocate an mbuf and populate the structure. */
1806         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1807         rte_ring_sc_dequeue(vpool->ring, &obj);
1808         mbuf = obj;
1809         if (unlikely(mbuf == NULL)) {
1810                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1811                 RTE_LOG(ERR, VHOST_DATA,
1812                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1813                         dev->device_fh);
1814                 put_desc_to_used_list_zcp(vq, desc_idx);
1815                 return;
1816         }
1817
1818         if (vm2vm_mode == VM2VM_HARDWARE) {
1819                 /* Avoid using a VLAN tag from any VM for an external packet,
1820                  * such as vlan_tags[dev->device_fh]; otherwise it conflicts
1821                  * with pool selection: the MAC address marks it as an external
1822                  * packet that should go to the network, while the VLAN tag
1823                  * marks it as a VM2VM packet to forward to another VM. The
1824                  * hardware cannot resolve this ambiguity, so the packet is lost.
1825                  */
1826                 vlan_tag = external_pkt_default_vlan_tag;
1827                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1828                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1829                         __rte_mbuf_raw_free(mbuf);
1830                         return;
1831                 }
1832         }
1833
1834         mbuf->nb_segs = m->nb_segs;
1835         mbuf->next = m->next;
1836         mbuf->data_len = m->data_len + offset;
1837         mbuf->pkt_len = mbuf->data_len;
1838         if (unlikely(need_copy)) {
1839                 /* Copy the packet contents to the mbuf. */
1840                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1841                         rte_pktmbuf_mtod(m, void *),
1842                         m->data_len);
1843         } else {
1844                 mbuf->data_off = m->data_off;
1845                 mbuf->buf_physaddr = m->buf_physaddr;
1846                 mbuf->buf_addr = m->buf_addr;
1847         }
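        /*
         * In the non-copy path the new mbuf simply aliases the guest buffer:
         * copying data_off, buf_physaddr and buf_addr makes the NIC DMA read
         * directly from guest memory, which is the point of this zero-copy
         * TX route.
         */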
1848         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1849         mbuf->vlan_tci = vlan_tag;
1850         mbuf->l2_len = sizeof(struct ether_hdr);
1851         mbuf->l3_len = sizeof(struct ipv4_hdr);
1852         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1853
1854         tx_q->m_table[len] = mbuf;
1855         len++;
1856
1857         LOG_DEBUG(VHOST_DATA,
1858                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1859                 dev->device_fh,
1860                 mbuf->nb_segs,
1861                 (mbuf->next == NULL) ? "null" : "non-null");
1862
1863         if (enable_stats) {
1864                 dev_statistics[dev->device_fh].tx_total++;
1865                 dev_statistics[dev->device_fh].tx++;
1866         }
1867
1868         if (unlikely(len == MAX_PKT_BURST)) {
1869                 m_table = (struct rte_mbuf **)tx_q->m_table;
1870                 ret = rte_eth_tx_burst(ports[0],
1871                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1872
1873                 /*
1874                  * Free any buffers not handled by TX and update
1875                  * the port stats.
1876                  */
1877                 if (unlikely(ret < len)) {
1878                         do {
1879                                 rte_pktmbuf_free(m_table[ret]);
1880                         } while (++ret < len);
1881                 }
1882
1883                 len = 0;
1884                 txmbuf_clean_zcp(dev, vpool);
1885         }
1886
1887         tx_q->len = len;
1888
1889         return;
1890 }
1891
1892 /*
1893  * This function transmits all available packets in the virtio TX queue for
1894  * one virtio-net device. On the first packet it learns the MAC address and
1895  * sets up the VMDQ queue.
1896  */
1897 static inline void __attribute__((always_inline))
1898 virtio_dev_tx_zcp(struct virtio_net *dev)
1899 {
1900         struct rte_mbuf m;
1901         struct vhost_virtqueue *vq;
1902         struct vring_desc *desc;
1903         uint64_t buff_addr = 0, phys_addr;
1904         uint32_t head[MAX_PKT_BURST];
1905         uint32_t i;
1906         uint16_t free_entries, packet_success = 0;
1907         uint16_t avail_idx;
1908         uint8_t need_copy = 0;
1909         hpa_type addr_type;
1910         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1911
1912         vq = dev->virtqueue[VIRTIO_TXQ];
1913         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1914
1915         /* If there are no available buffers then return. */
1916         if (vq->last_used_idx_res == avail_idx)
1917                 return;
1918
1919         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1920
1921         /* Prefetch available ring to retrieve head indexes. */
1922         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1923
1924         /* Get the number of free entries in the ring */
1925         free_entries = (avail_idx - vq->last_used_idx_res);
1926
1927         /* Limit to MAX_PKT_BURST. */
1928         free_entries
1929                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1930
1931         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1932                 dev->device_fh, free_entries);
1933
1934         /* Retrieve all of the head indexes first to avoid caching issues. */
1935         for (i = 0; i < free_entries; i++)
1936                 head[i]
1937                         = vq->avail->ring[(vq->last_used_idx_res + i)
1938                         & (vq->size - 1)];
1939
1940         vq->last_used_idx_res += free_entries;
1941
1942         /* Prefetch descriptor index. */
1943         rte_prefetch0(&vq->desc[head[packet_success]]);
1944         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1945
1946         while (packet_success < free_entries) {
1947                 desc = &vq->desc[head[packet_success]];
1948
1949                 /* Discard first buffer as it is the virtio header */
1950                 desc = &vq->desc[desc->next];
1951
1952                 /* Buffer address translation. */
1953                 buff_addr = gpa_to_vva(dev, desc->addr);
1954                 /* Need check extra VLAN_HLEN size for inserting VLAN tag */
1955                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1956                         &addr_type);
1957
1958                 if (likely(packet_success < (free_entries - 1)))
1959                         /* Prefetch descriptor index. */
1960                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1961
1962                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1963                         RTE_LOG(ERR, VHOST_DATA,
1964                                 "(%"PRIu64") Invalid frame buffer address found "
1965                                 "when TX packets!\n",
1966                                 dev->device_fh);
1967                         packet_success++;
1968                         continue;
1969                 }
1970
1971                 /* Prefetch buffer address. */
1972                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1973
1974                 /*
1975                  * Setup dummy mbuf. This is copied to a real mbuf if
1976                  * transmitted out the physical port.
1977                  */
1978                 m.data_len = desc->len;
1979                 m.nb_segs = 1;
1980                 m.next = NULL;
1981                 m.data_off = 0;
1982                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1983                 m.buf_physaddr = phys_addr;
1984
1985                 /*
1986                  * Check if the frame buffer address from guest crosses
1987                  * sub-region or not.
1988                  */
1989                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1990                         RTE_LOG(ERR, VHOST_DATA,
1991                                 "(%"PRIu64") Frame buffer address crossing a "
1992                                 "sub-region found when attaching TX frame "
1993                                 "buffer address!\n",
1994                                 dev->device_fh);
1995                         need_copy = 1;
1996                 } else
1997                         need_copy = 0;
1998
1999                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2000
2001                 /*
2002                  * If this is the first received packet we need to learn
2003                  * the MAC and setup VMDQ
2004                  */
2005                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2006                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2007                                 /*
2008                                  * Discard frame if device is scheduled for
2009                                  * removal or a duplicate MAC address is found.
2010                                  */
2011                                 packet_success += free_entries;
2012                                 vq->last_used_idx += packet_success;
2013                                 break;
2014                         }
2015                 }
2016
2017                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2018                 packet_success++;
2019         }
2020 }
2021
2022 /*
2023  * This function is called by each data core. It handles all RX/TX registered
2024  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2025  * addresses are compared with all devices in the main linked list.
2026  */
2027 static int
2028 switch_worker_zcp(__attribute__((unused)) void *arg)
2029 {
2030         struct virtio_net *dev = NULL;
2031         struct vhost_dev  *vdev = NULL;
2032         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2033         struct virtio_net_data_ll *dev_ll;
2034         struct mbuf_table *tx_q;
2035         volatile struct lcore_ll_info *lcore_ll;
2036         const uint64_t drain_tsc
2037                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2038                 * BURST_TX_DRAIN_US;
2039         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2040         unsigned ret;
2041         const uint16_t lcore_id = rte_lcore_id();
2042         uint16_t count_in_ring, rx_count = 0;
2043
2044         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2045
2046         lcore_ll = lcore_info[lcore_id].lcore_ll;
2047         prev_tsc = 0;
2048
2049         while (1) {
2050                 cur_tsc = rte_rdtsc();
2051
2052                 /* TX burst queue drain */
2053                 diff_tsc = cur_tsc - prev_tsc;
2054                 if (unlikely(diff_tsc > drain_tsc)) {
2055                         /*
2056                          * Get mbuf from vpool.pool and detach mbuf and
2057                          * put back into vpool.ring.
2058                          */
2059                         dev_ll = lcore_ll->ll_root_used;
2060                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2061                                 /* Get virtio device ID */
2062                                 vdev = dev_ll->vdev;
2063                                 dev = vdev->dev;
2064
2065                                 if (likely(!vdev->remove)) {
2066                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2067                                         if (tx_q->len) {
2068                                                 LOG_DEBUG(VHOST_DATA,
2069                                                 "TX queue drained after timeout"
2070                                                 " with burst size %u\n",
2071                                                 tx_q->len);
2072
2073                                                 /*
2074                                                  * Tx any packets in the queue
2075                                                  */
2076                                                 ret = rte_eth_tx_burst(
2077                                                         ports[0],
2078                                                         (uint16_t)tx_q->txq_id,
2079                                                         (struct rte_mbuf **)
2080                                                         tx_q->m_table,
2081                                                         (uint16_t)tx_q->len);
2082                                                 if (unlikely(ret < tx_q->len)) {
2083                                                         do {
2084                                                                 rte_pktmbuf_free(
2085                                                                         tx_q->m_table[ret]);
2086                                                         } while (++ret < tx_q->len);
2087                                                 }
2088                                                 tx_q->len = 0;
2089
2090                                                 txmbuf_clean_zcp(dev,
2091                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2092                                         }
2093                                 }
2094                                 dev_ll = dev_ll->next;
2095                         }
2096                         prev_tsc = cur_tsc;
2097                 }
2098
2099                 rte_prefetch0(lcore_ll->ll_root_used);
2100
2101                 /*
2102                  * Inform the configuration core that we have exited the linked
2103                  * list and that no devices are in use if requested.
2104                  */
2105                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2106                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2107
2108                 /* Process devices */
2109                 dev_ll = lcore_ll->ll_root_used;
2110
2111                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2112                         vdev = dev_ll->vdev;
2113                         dev  = vdev->dev;
2114                         if (unlikely(vdev->remove)) {
2115                                 dev_ll = dev_ll->next;
2116                                 unlink_vmdq(vdev);
2117                                 vdev->ready = DEVICE_SAFE_REMOVE;
2118                                 continue;
2119                         }
2120
2121                         if (likely(vdev->ready == DEVICE_RX)) {
2122                                 uint32_t index = vdev->vmdq_rx_q;
2123                                 uint16_t i;
2124                                 count_in_ring =
2125                                         rte_ring_count(vpool_array[index].ring);
2126                                 uint16_t free_entries = (uint16_t)
2127                                         get_available_ring_num_zcp(dev);
2128
2129                                 /*
2130                                  * Attach all mbufs in vpool.ring and put back
2131                                  * into vpool.pool.
2132                                  */
2133                                 for (i = 0;
2134                                 i < RTE_MIN(free_entries,
2135                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2136                                 i++)
2137                                         attach_rxmbuf_zcp(dev);
2138
2139                                 /* Handle guest RX */
2140                                 rx_count = rte_eth_rx_burst(ports[0],
2141                                         vdev->vmdq_rx_q, pkts_burst,
2142                                         MAX_PKT_BURST);
2143
2144                                 if (rx_count) {
2145                                         ret_count = virtio_dev_rx_zcp(dev,
2146                                                         pkts_burst, rx_count);
2147                                         if (enable_stats) {
2148                                                 dev_statistics[dev->device_fh].rx_total
2149                                                         += rx_count;
2150                                                 dev_statistics[dev->device_fh].rx
2151                                                         += ret_count;
2152                                         }
2153                                         while (likely(rx_count)) {
2154                                                 rx_count--;
2155                                                 pktmbuf_detach_zcp(
2156                                                         pkts_burst[rx_count]);
2157                                                 rte_ring_sp_enqueue(
2158                                                         vpool_array[index].ring,
2159                                                         (void *)pkts_burst[rx_count]);
2160                                         }
2161                                 }
2162                         }
2163
2164                         if (likely(!vdev->remove))
2165                                 /* Handle guest TX */
2166                                 virtio_dev_tx_zcp(dev);
2167
2168                         /* Move to the next device in the list */
2169                         dev_ll = dev_ll->next;
2170                 }
2171         }
2172
2173         return 0;
2174 }
2175
2176
2177 /*
2178  * Add an entry to a used linked list. A free entry must first be found
2179  * in the free linked list using get_data_ll_free_entry();
2180  */
2181 static void
2182 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2183         struct virtio_net_data_ll *ll_dev)
2184 {
2185         struct virtio_net_data_ll *ll = *ll_root_addr;
2186
2187         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2188         ll_dev->next = NULL;
2189         rte_compiler_barrier();
2190
2191         /* If ll == NULL then this is the first device. */
2192         if (ll) {
2193                 /* Increment to the tail of the linked list. */
2194                 while (ll->next != NULL)
2195                         ll = ll->next;
2196
2197                 ll->next = ll_dev;
2198         } else {
2199                 *ll_root_addr = ll_dev;
2200         }
2201 }
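/*
 * Safety note: entries are appended with next pre-cleared and a compiler
 * barrier before linking, so a data core walking the list concurrently either
 * sees the new tail fully initialised or does not see it at all. This relies
 * on there being a single writer (the configuration core).
 */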
2202
2203 /*
2204  * Remove an entry from a used linked list. The entry must then be added to
2205  * the free linked list using put_data_ll_free_entry().
2206  */
2207 static void
2208 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2209         struct virtio_net_data_ll *ll_dev,
2210         struct virtio_net_data_ll *ll_dev_last)
2211 {
2212         struct virtio_net_data_ll *ll = *ll_root_addr;
2213
2214         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2215                 return;
2216
2217         if (ll_dev == ll)
2218                 *ll_root_addr = ll_dev->next;
2219         else
2220                 if (likely(ll_dev_last != NULL))
2221                         ll_dev_last->next = ll_dev->next;
2222                 else
2223                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2224 }
2225
2226 /*
2227  * Find and return an entry from the free linked list.
2228  */
2229 static struct virtio_net_data_ll *
2230 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2231 {
2232         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2233         struct virtio_net_data_ll *ll_dev;
2234
2235         if (ll_free == NULL)
2236                 return NULL;
2237
2238         ll_dev = ll_free;
2239         *ll_root_addr = ll_free->next;
2240
2241         return ll_dev;
2242 }
2243
2244 /*
2245  * Place an entry back on to the free linked list.
2246  */
2247 static void
2248 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2249         struct virtio_net_data_ll *ll_dev)
2250 {
2251         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2252
2253         if (ll_dev == NULL)
2254                 return;
2255
2256         ll_dev->next = ll_free;
2257         *ll_root_addr = ll_dev;
2258 }
2259
2260 /*
2261  * Creates a linked list of a given size.
2262  */
2263 static struct virtio_net_data_ll *
2264 alloc_data_ll(uint32_t size)
2265 {
2266         struct virtio_net_data_ll *ll_new;
2267         uint32_t i;
2268
2269         /* Malloc and then chain the linked list. */
2270         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2271         if (ll_new == NULL) {
2272                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2273                 return NULL;
2274         }
2275
2276         for (i = 0; i < size - 1; i++) {
2277                 ll_new[i].vdev = NULL;
2278                 ll_new[i].next = &ll_new[i+1];
2279         }
2280         ll_new[i].next = NULL;
2281
2282         return ll_new;
2283 }
2284
2285 /*
2286  * Create the main linked list along with each individual core's linked list. A used and a free list
2287  * are created to manage entries.
2288  */
2289 static int
2290 init_data_ll (void)
2291 {
2292         int lcore;
2293
2294         RTE_LCORE_FOREACH_SLAVE(lcore) {
2295                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2296                 if (lcore_info[lcore].lcore_ll == NULL) {
2297                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2298                         return -1;
2299                 }
2300
2301                 lcore_info[lcore].lcore_ll->device_num = 0;
2302                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2303                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2304                 if (num_devices % num_switching_cores)
2305                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2306                 else
2307                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2308         }
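        /*
         * The if/else above is simply ceiling division: each worker core gets
         * ceil(num_devices / num_switching_cores) free entries. Illustrative
         * example: 10 devices across 4 switching cores gives 3 entries each.
         */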
2309
2310         /* Allocate devices up to a maximum of MAX_DEVICES. */
2311         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2312
2313         return 0;
2314 }
2315
2316 /*
2317  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2318  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2319  * of dev->remove=1, which could otherwise cause an infinite loop in the rte_pause loop.
2320  */
2321 static void
2322 destroy_device (volatile struct virtio_net *dev)
2323 {
2324         struct virtio_net_data_ll *ll_lcore_dev_cur;
2325         struct virtio_net_data_ll *ll_main_dev_cur;
2326         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2327         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2328         struct vhost_dev *vdev;
2329         int lcore;
2330
2331         dev->flags &= ~VIRTIO_DEV_RUNNING;
2332
2333         vdev = (struct vhost_dev *)dev->priv;
2334         /*set the remove flag. */
2335         vdev->remove = 1;
2336         while (vdev->ready != DEVICE_SAFE_REMOVE) {
2337                 rte_pause();
2338         }
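        /*
         * Removal handshake: the data core polls vdev->remove in its main
         * loop, unlinks the device from VMDQ and acknowledges by setting
         * ready to DEVICE_SAFE_REMOVE; only then is it safe for this
         * configuration thread to unlink the list entries below.
         */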
2339
2340         /* Search for entry to be removed from lcore ll */
2341         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2342         while (ll_lcore_dev_cur != NULL) {
2343                 if (ll_lcore_dev_cur->vdev == vdev) {
2344                         break;
2345                 } else {
2346                         ll_lcore_dev_last = ll_lcore_dev_cur;
2347                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2348                 }
2349         }
2350
2351         if (ll_lcore_dev_cur == NULL) {
2352                 RTE_LOG(ERR, VHOST_CONFIG,
2353                         "(%"PRIu64") Failed to find the device to be destroyed.\n",
2354                         dev->device_fh);
2355                 return;
2356         }
2357
2358         /* Search for entry to be removed from main ll */
2359         ll_main_dev_cur = ll_root_used;
2360         ll_main_dev_last = NULL;
2361         while (ll_main_dev_cur != NULL) {
2362                 if (ll_main_dev_cur->vdev == vdev) {
2363                         break;
2364                 } else {
2365                         ll_main_dev_last = ll_main_dev_cur;
2366                         ll_main_dev_cur = ll_main_dev_cur->next;
2367                 }
2368         }
2369
2370         /* Remove entries from the lcore and main ll. */
2371         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2372         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2373
2374         /* Set the dev_removal_flag on each lcore. */
2375         RTE_LCORE_FOREACH_SLAVE(lcore) {
2376                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2377         }
2378
2379         /*
2380          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2381          * they can no longer access the device removed from the linked lists and that the devices
2382          * are no longer in use.
2383          */
2384         RTE_LCORE_FOREACH_SLAVE(lcore) {
2385                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2386                         rte_pause();
2387                 }
2388         }
2389
2390         /* Add the entries back to the lcore and main free ll.*/
2391         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2392         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2393
2394         /* Decrement number of device on the lcore. */
2395         lcore_info[vdev->coreid].lcore_ll->device_num--;
2396
2397         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2398
2399         if (zero_copy) {
2400                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2401
2402                 /* Stop the RX queue. */
2403                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2404                         LOG_DEBUG(VHOST_CONFIG,
2405                                 "(%"PRIu64") In destroy_device: Failed to stop "
2406                                 "rx queue:%d\n",
2407                                 dev->device_fh,
2408                                 vdev->vmdq_rx_q);
2409                 }
2410
2411                 LOG_DEBUG(VHOST_CONFIG,
2412                         "(%"PRIu64") in destroy_device: Start putting mbufs in "
2413                         "mempool back into ring for RX queue: %d\n",
2414                         dev->device_fh, vdev->vmdq_rx_q);
2415
2416                 mbuf_destroy_zcp(vpool);
2417
2418                 /* Stop the TX queue. */
2419                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2420                         LOG_DEBUG(VHOST_CONFIG,
2421                                 "(%"PRIu64") In destroy_device: Failed to "
2422                                 "stop tx queue:%d\n",
2423                                 dev->device_fh, vdev->vmdq_rx_q);
2424                 }
2425
2426                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2427
2428                 LOG_DEBUG(VHOST_CONFIG,
2429                         "(%"PRIu64") destroy_device: Start putting mbufs in mempool "
2430                         "back into ring for TX queue: %d, dev:(%"PRIu64")\n",
2431                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2432                         dev->device_fh);
2433
2434                 mbuf_destroy_zcp(vpool);
2435                 rte_free(vdev->regions_hpa);
2436         }
2437         rte_free(vdev);
2438
2439 }
2440
2441 /*
2442  * Calculate the number of physically contiguous regions within one particular
2443  * region whose vhost virtual address range is contiguous. The region starts
2444  * at vva_start and is 'size' bytes long.
2445  */
2446 static uint32_t
2447 check_hpa_regions(uint64_t vva_start, uint64_t size)
2448 {
2449         uint32_t i, nregions = 0, page_size = getpagesize();
2450         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2451         if (vva_start % page_size) {
2452                 LOG_DEBUG(VHOST_CONFIG,
2453                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2454                         "has remainder\n",
2455                         (void *)(uintptr_t)vva_start, page_size);
2456                 return 0;
2457         }
2458         if (size % page_size) {
2459                 LOG_DEBUG(VHOST_CONFIG,
2460                         "in check_hpa_regions: "
2461                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2462                         size, page_size);
2463                 return 0;
2464         }
2465         for (i = 0; i < size - page_size; i = i + page_size) {
2466                 cur_phys_addr
2467                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2468                 next_phys_addr = rte_mem_virt2phy(
2469                         (void *)(uintptr_t)(vva_start + i + page_size));
2470                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2471                         ++nregions;
2472                         LOG_DEBUG(VHOST_CONFIG,
2473                                 "in check_hpa_regions: hva addr:(%p) is not "
2474                                 "continuous with hva addr:(%p), diff:%d\n",
2475                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2476                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2477                                 + page_size), page_size);
2478                         LOG_DEBUG(VHOST_CONFIG,
2479                                 "in check_hpa_regions: hpa addr:(%p) is not "
2480                                 "continuous with hpa addr:(%p), "
2481                                 "diff:(%"PRIu64")\n",
2482                                 (void *)(uintptr_t)cur_phys_addr,
2483                                 (void *)(uintptr_t)next_phys_addr,
2484                                 (next_phys_addr-cur_phys_addr));
2485                 }
2486         }
2487         return nregions;
2488 }
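/*
 * Illustrative example (assuming 4 KiB pages): for a 16 KiB virtually
 * contiguous range whose four pages map to host physical pages P, P+4K, Q,
 * Q+4K with Q != P+8K, the loop sees one discontinuity between the second
 * and third page and returns nregions == 1.
 */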
2489
2490 /*
2491  * Divide each region whose vhost virtual address range is contiguous into
2492  * sub-regions, ensuring the physical addresses within each sub-region are
2493  * contiguous, and fill the offset (to GPA), size and other information of
2494  * each sub-region into regions_hpa.
2495  */
2496 static uint32_t
2497 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2498 {
2499         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2500         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2501
2502         if (mem_region_hpa == NULL)
2503                 return 0;
2504
2505         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2506                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2507                         virtio_memory->regions[regionidx].address_offset;
2508                 mem_region_hpa[regionidx_hpa].guest_phys_address
2509                         = virtio_memory->regions[regionidx].guest_phys_address;
2510                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2511                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2512                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2513                 LOG_DEBUG(VHOST_CONFIG,
2514                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2515                         regionidx_hpa,
2516                         (void *)(uintptr_t)
2517                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2518                 LOG_DEBUG(VHOST_CONFIG,
2519                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2520                         regionidx_hpa,
2521                         (void *)(uintptr_t)
2522                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
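                /*
                 * Scan the region a page at a time; k accumulates the byte
                 * length of the current physically contiguous run, and each
                 * break closes the current sub-region and opens the next one.
                 */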
2523                 for (i = 0, k = 0;
2524                         i < virtio_memory->regions[regionidx].memory_size -
2525                                 page_size;
2526                         i += page_size) {
2527                         cur_phys_addr = rte_mem_virt2phy(
2528                                         (void *)(uintptr_t)(vva_start + i));
2529                         next_phys_addr = rte_mem_virt2phy(
2530                                         (void *)(uintptr_t)(vva_start +
2531                                         i + page_size));
2532                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2533                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2534                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2535                                         k + page_size;
2536                                 mem_region_hpa[regionidx_hpa].memory_size
2537                                         = k + page_size;
2538                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2539                                         "phys addr end  [%d]:(%p)\n",
2540                                         regionidx_hpa,
2541                                         (void *)(uintptr_t)
2542                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2543                                 LOG_DEBUG(VHOST_CONFIG,
2544                                         "in fill_hpa_regions: guest phys addr "
2545                                         "size [%d]:(%p)\n",
2546                                         regionidx_hpa,
2547                                         (void *)(uintptr_t)
2548                                         (mem_region_hpa[regionidx_hpa].memory_size));
2549                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2550                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2551                                 ++regionidx_hpa;
2552                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2553                                         next_phys_addr -
2554                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2555                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2556                                         " phys addr start[%d]:(%p)\n",
2557                                         regionidx_hpa,
2558                                         (void *)(uintptr_t)
2559                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2560                                 LOG_DEBUG(VHOST_CONFIG,
2561                                         "in fill_hpa_regions: host  phys addr "
2562                                         "start[%d]:(%p)\n",
2563                                         regionidx_hpa,
2564                                         (void *)(uintptr_t)
2565                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2566                                 k = 0;
2567                         } else {
2568                                 k += page_size;
2569                         }
2570                 }
2571                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2572                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2573                         + k + page_size;
2574                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2575                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2576                         "[%d]:(%p)\n", regionidx_hpa,
2577                         (void *)(uintptr_t)
2578                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2579                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2580                         "[%d]:(%p)\n", regionidx_hpa,
2581                         (void *)(uintptr_t)
2582                         (mem_region_hpa[regionidx_hpa].memory_size));
2583                 ++regionidx_hpa;
2584         }
2585         return regionidx_hpa;
2586 }
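/*
 * A minimal sketch (hypothetical helper, not used by this application) of
 * how the table filled above is meant to be consumed: translate a guest
 * physical address to a host physical address by locating its sub-region
 * and adding that sub-region's precomputed offset.
 */
static inline uint64_t
example_gpa_to_hpa(const struct virtio_memory_regions_hpa *tbl,
        uint32_t nregions, uint64_t gpa)
{
        uint32_t i;

        for (i = 0; i < nregions; i++) {
                /* Sub-regions cover [guest_phys_address, guest_phys_address_end). */
                if (gpa >= tbl[i].guest_phys_address &&
                                gpa < tbl[i].guest_phys_address_end)
                        return gpa + tbl[i].host_phys_addr_offset;
        }
        return 0; /* GPA not covered by any sub-region. */
}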
2587
2588 /*
2589  * A new device is added to a data core. First the device is added to the main linked list
2590  * and then allocated to a specific data core.
2591  */
2592 static int
2593 new_device(struct virtio_net *dev)
2594 {
2595         struct virtio_net_data_ll *ll_dev;
2596         int lcore, core_add = 0;
2597         uint32_t device_num_min = num_devices;
2598         struct vhost_dev *vdev;
2599         uint32_t regionidx;
2600
2601         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2602         if (vdev == NULL) {
2603                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2604                         dev->device_fh);
2605                 return -1;
2606         }
2607         vdev->dev = dev;
2608         dev->priv = vdev;
2609
2610         if (zero_copy) {
2611                 vdev->nregions_hpa = dev->mem->nregions;
2612                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2613                         vdev->nregions_hpa
2614                                 += check_hpa_regions(
2615                                         dev->mem->regions[regionidx].guest_phys_address
2616                                         + dev->mem->regions[regionidx].address_offset,
2617                                         dev->mem->regions[regionidx].memory_size);
2618
2619                 }
2620
2621                 vdev->regions_hpa = rte_calloc("vhost hpa region",
2622                                                vdev->nregions_hpa,
2623                                                sizeof(struct virtio_memory_regions_hpa),
2624                                                RTE_CACHE_LINE_SIZE);
2625                 if (vdev->regions_hpa == NULL) {
2626                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2627                         rte_free(vdev);
2628                         return -1;
2629                 }
2630
2631
2632                 if (fill_hpa_memory_regions(
2633                         vdev->regions_hpa, dev->mem
2634                         ) != vdev->nregions_hpa) {
2635
2636                         RTE_LOG(ERR, VHOST_CONFIG,
2637                                 "hpa memory regions number mismatch: "
2638                                 "[%d]\n", vdev->nregions_hpa);
2639                         rte_free(vdev->regions_hpa);
2640                         rte_free(vdev);
2641                         return -1;
2642                 }
2643         }
2644
2645
2646         /* Add device to main ll */
2647         ll_dev = get_data_ll_free_entry(&ll_root_free);
2648         if (ll_dev == NULL) {
2649                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2650                         "of %d devices per core has been reached\n",
2651                         dev->device_fh, num_devices);
2652                 if (vdev->regions_hpa)
2653                         rte_free(vdev->regions_hpa);
2654                 rte_free(vdev);
2655                 return -1;
2656         }
2657         ll_dev->vdev = vdev;
2658         add_data_ll_entry(&ll_root_used, ll_dev);
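        /* Map this device to its dedicated VMDq RX queue. */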
2659         vdev->vmdq_rx_q
2660                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2661
2662         if (zero_copy) {
2663                 uint32_t index = vdev->vmdq_rx_q;
2664                 uint32_t count_in_ring, i;
2665                 struct mbuf_table *tx_q;
2666
2667                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2668
2669                 LOG_DEBUG(VHOST_CONFIG,
2670                         "(%"PRIu64") in new_device: mbuf count in mempool "
2671                         "before attach is: %d\n",
2672                         dev->device_fh,
2673                         rte_mempool_count(vpool_array[index].pool));
2674                 LOG_DEBUG(VHOST_CONFIG,
2675                         "(%"PRIu64") in new_device: mbuf count in ring "
2676                         "before attach is: %d\n",
2677                         dev->device_fh, count_in_ring);
2678
2679                 /*
2680                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2681                  */
2682                 for (i = 0; i < count_in_ring; i++)
2683                         attach_rxmbuf_zcp(dev);
2684
2685                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2686                         "mempool after attach is: %d\n",
2687                         dev->device_fh,
2688                         rte_mempool_count(vpool_array[index].pool));
2689                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2690                         "ring after attach is: %d\n",
2691                         dev->device_fh,
2692                         rte_ring_count(vpool_array[index].ring));
2693
2694                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2695                 tx_q->txq_id = vdev->vmdq_rx_q;
2696
2697                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2698                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2699
2700                         LOG_DEBUG(VHOST_CONFIG,
2701                                 "(%"PRIu64") In new_device: Failed to start "
2702                                 "tx queue:%d\n",
2703                                 dev->device_fh, vdev->vmdq_rx_q);
2704
2705                         mbuf_destroy_zcp(vpool);
2706                         rte_free(vdev->regions_hpa);
2707                         rte_free(vdev);
2708                         return -1;
2709                 }
2710
2711                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2712                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2713
2714                         LOG_DEBUG(VHOST_CONFIG,
2715                                 "(%"PRIu64") In new_device: Failed to start "
2716                                 "rx queue:%d\n",
2717                                 dev->device_fh, vdev->vmdq_rx_q);
2718
2719                         /* Stop the TX queue. */
2720                         if (rte_eth_dev_tx_queue_stop(ports[0],
2721                                 vdev->vmdq_rx_q) != 0) {
2722                                 LOG_DEBUG(VHOST_CONFIG,
2723                                         "(%"PRIu64") In new_device: Failed to "
2724                                         "stop tx queue:%d\n",
2725                                         dev->device_fh, vdev->vmdq_rx_q);
2726                         }
2727
2728                         mbuf_destroy_zcp(vpool);
2729                         rte_free(vdev->regions_hpa);
2730                         rte_free(vdev);
2731                         return -1;
2732                 }
2733
2734         }
2735
2736         /* Reset the ready flag. */
2737         vdev->ready = DEVICE_MAC_LEARNING;
2738         vdev->remove = 0;
2739
2740         /* Find a suitable lcore to add the device. */
2741         RTE_LCORE_FOREACH_SLAVE(lcore) {
2742                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2743                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2744                         core_add = lcore;
2745                 }
2746         }
2747         /* Add device to lcore ll */
2748         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2749         if (ll_dev == NULL) {
2750                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2751                 vdev->ready = DEVICE_SAFE_REMOVE;
2752                 destroy_device(dev);
2753                 rte_free(vdev->regions_hpa);
2754                 rte_free(vdev);
2755                 return -1;
2756         }
2757         ll_dev->vdev = vdev;
2758         vdev->coreid = core_add;
2759
2760         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2761
2762         /* Initialize device stats */
2763         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2764
2765         /* Disable notifications. */
2766         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2767         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2768         lcore_info[vdev->coreid].lcore_ll->device_num++;
2769         dev->flags |= VIRTIO_DEV_RUNNING;
2770
2771         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2772
2773         return 0;
2774 }
2775
2776 /*
2777  * These callbacks allow devices to be added to the data core when configuration
2778  * has fully completed.
2779  */
2780 static const struct virtio_net_device_ops virtio_net_device_ops =
2781 {
2782         .new_device =  new_device,
2783         .destroy_device = destroy_device,
2784 };
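/*
 * These ops are handed to the vhost library via
 * rte_vhost_driver_callback_register() in main() below.
 */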
2785
2786 /*
2787  * This thread wakes up every enable_stats seconds to print statistics if the
2788  * user has enabled them.
2789  */
2790 static void
2791 print_stats(void)
2792 {
2793         struct virtio_net_data_ll *dev_ll;
2794         uint64_t tx_dropped, rx_dropped;
2795         uint64_t tx, tx_total, rx, rx_total;
2796         uint32_t device_fh;
2797         const char clr[] = { 27, '[', '2', 'J', '\0' };
2798         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
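        /*
         * clr and top_left are the ANSI escape sequences ESC[2J (clear the
         * screen) and ESC[1;1H (move the cursor to the top-left corner).
         */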
2799
2800         while (1) {
2801                 sleep(enable_stats);
2802
2803                 /* Clear screen and move to top left */
2804                 printf("%s%s", clr, top_left);
2805
2806                 printf("\nDevice statistics ====================================");
2807
2808                 dev_ll = ll_root_used;
2809                 while (dev_ll != NULL) {
2810                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2811                         tx_total = dev_statistics[device_fh].tx_total;
2812                         tx = dev_statistics[device_fh].tx;
2813                         tx_dropped = tx_total - tx;
2814                         if (zero_copy == 0) {
2815                                 rx_total = rte_atomic64_read(
2816                                         &dev_statistics[device_fh].rx_total_atomic);
2817                                 rx = rte_atomic64_read(
2818                                         &dev_statistics[device_fh].rx_atomic);
2819                         } else {
2820                                 rx_total = dev_statistics[device_fh].rx_total;
2821                                 rx = dev_statistics[device_fh].rx;
2822                         }
2823                         rx_dropped = rx_total - rx;
2824
2825                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2826                                         "\nTX total:            %"PRIu64""
2827                                         "\nTX dropped:          %"PRIu64""
2828                                         "\nTX successful:               %"PRIu64""
2829                                         "\nRX total:            %"PRIu64""
2830                                         "\nRX dropped:          %"PRIu64""
2831                                         "\nRX successful:               %"PRIu64"",
2832                                         device_fh,
2833                                         tx_total,
2834                                         tx_dropped,
2835                                         tx,
2836                                         rx_total,
2837                                         rx_dropped,
2838                                         rx);
2839
2840                         dev_ll = dev_ll->next;
2841                 }
2842                 printf("\n======================================================\n");
2843         }
2844 }
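/*
 * print_stats() runs in a dedicated pthread (named "print-stats"), created
 * from main() when enable_stats is non-zero.
 */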
2845
2846 static void
2847 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2848         char *ring_name, uint32_t nb_mbuf)
2849 {
2850         vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2851                 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2852         if (vpool_array[index].pool != NULL) {
2853                 vpool_array[index].ring
2854                         = rte_ring_create(ring_name,
2855                                 rte_align32pow2(nb_mbuf + 1),
2856                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2857                 if (likely(vpool_array[index].ring != NULL)) {
2858                         LOG_DEBUG(VHOST_CONFIG,
2859                                 "in setup_mempool_tbl: mbuf count in "
2860                                 "mempool is: %d\n",
2861                                 rte_mempool_count(vpool_array[index].pool));
2862                         LOG_DEBUG(VHOST_CONFIG,
2863                                 "in setup_mempool_tbl: mbuf count in "
2864                                 "ring   is: %d\n",
2865                                 rte_ring_count(vpool_array[index].ring));
2866                 } else {
2867                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2868                                 ring_name);
2869                 }
2870
2871                 /* Need to consider headroom. */
2872                 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2873         } else {
2874                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2875         }
2876 }
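/*
 * setup_mempool_tbl() is called once per queue from main() in zero-copy
 * mode, pairing each mempool with a ring that holds the pool's currently
 * unattached mbufs.
 */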
2877
2878 /* When we receive an INT signal, unregister the vhost driver. */
2879 static void
2880 sigint_handler(__rte_unused int signum)
2881 {
2882         /* Unregister vhost driver. */
2883         int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2884         if (ret != 0)
2885                 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2886         exit(0);
2887 }
2888
2889 /*
2890  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2891  * device is also registered here to handle the IOCTLs.
2892  */
2893 int
2894 main(int argc, char *argv[])
2895 {
2896         struct rte_mempool *mbuf_pool = NULL;
2897         unsigned lcore_id, core_id = 0;
2898         unsigned nb_ports, valid_num_ports;
2899         int ret;
2900         uint8_t portid;
2901         uint16_t queue_id;
2902         static pthread_t tid;
2903         char thread_name[RTE_MAX_THREAD_NAME_LEN];
2904
2905         signal(SIGINT, sigint_handler);
2906
2907         /* init EAL */
2908         ret = rte_eal_init(argc, argv);
2909         if (ret < 0)
2910                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2911         argc -= ret;
2912         argv += ret;
2913
2914         /* parse app arguments */
2915         ret = us_vhost_parse_args(argc, argv);
2916         if (ret < 0)
2917                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2918
2919         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2920                 if (rte_lcore_is_enabled(lcore_id))
2921                         lcore_ids[core_id++] = lcore_id;
2922
2923         if (rte_lcore_count() > RTE_MAX_LCORE)
2924                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2925
2926         /* Set the number of switching cores available. */
2927         num_switching_cores = rte_lcore_count() - 1;
2928
2929         /* Get the number of physical ports. */
2930         nb_ports = rte_eth_dev_count();
2931         if (nb_ports > RTE_MAX_ETHPORTS)
2932                 nb_ports = RTE_MAX_ETHPORTS;
2933
2934         /*
2935          * Update the global var num_ports and global array ports[],
2936          * and derive valid_num_ports from the number of ports in the system.
2937          */
2938         valid_num_ports = check_ports_num(nb_ports);
2939
2940         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2941                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2942                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2943                 return -1;
2944         }
2945
2946         if (zero_copy == 0) {
2947                 /* Create the mbuf pool. */
2948                 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
2949                         NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
2950                         0, MBUF_DATA_SIZE, rte_socket_id());
2951                 if (mbuf_pool == NULL)
2952                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2953
2954                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2955                         vpool_array[queue_id].pool = mbuf_pool;
2956
2957                 if (vm2vm_mode == VM2VM_HARDWARE) {
2958                         /* Enable VT loopback to let the hardware L2 switch do the forwarding. */
2959                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2960                         LOG_DEBUG(VHOST_CONFIG,
2961                                 "Enable loop back for L2 switch in vmdq.\n");
2962                 }
2963         } else {
2964                 uint32_t nb_mbuf;
2965                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2966                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2967
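                /*
                 * Size each per-queue pool to cover the RX ring descriptors
                 * plus the per-core mbuf caches and in-flight bursts.
                 */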
2968                 nb_mbuf = num_rx_descriptor
2969                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2970                         + num_switching_cores * MAX_PKT_BURST;
2971
2972                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2973                         snprintf(pool_name, sizeof(pool_name),
2974                                 "rxmbuf_pool_%u", queue_id);
2975                         snprintf(ring_name, sizeof(ring_name),
2976                                 "rxmbuf_ring_%u", queue_id);
2977                         setup_mempool_tbl(rte_socket_id(), queue_id,
2978                                 pool_name, ring_name, nb_mbuf);
2979                 }
2980
2981                 nb_mbuf = num_tx_descriptor
2982                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2983                                 + num_switching_cores * MAX_PKT_BURST;
2984
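                /*
                 * TX pools and rings occupy the upper half of vpool_array,
                 * at indices MAX_QUEUES .. 2*MAX_QUEUES - 1.
                 */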
2985                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2986                         snprintf(pool_name, sizeof(pool_name),
2987                                 "txmbuf_pool_%u", queue_id);
2988                         snprintf(ring_name, sizeof(ring_name),
2989                                 "txmbuf_ring_%u", queue_id);
2990                         setup_mempool_tbl(rte_socket_id(),
2991                                 (queue_id + MAX_QUEUES),
2992                                 pool_name, ring_name, nb_mbuf);
2993                 }
2994
2995                 if (vm2vm_mode == VM2VM_HARDWARE) {
2996                         /* Enable VT loopback to let the hardware L2 switch do the forwarding. */
2997                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2998                         LOG_DEBUG(VHOST_CONFIG,
2999                                 "Enable loop back for L2 switch in vmdq.\n");
3000                 }
3001         }
3002         /* Set log level. */
3003         rte_set_log_level(LOG_LEVEL);
3004
3005         /* initialize all ports */
3006         for (portid = 0; portid < nb_ports; portid++) {
3007                 /* skip ports that are not enabled */
3008                 if ((enabled_port_mask & (1 << portid)) == 0) {
3009                         RTE_LOG(INFO, VHOST_PORT,
3010                                 "Skipping disabled port %d\n", portid);
3011                         continue;
3012                 }
3013                 if (port_init(portid) != 0)
3014                         rte_exit(EXIT_FAILURE,
3015                                 "Cannot initialize network ports\n");
3016         }
3017
3018         /* Initialise all linked lists. */
3019         if (init_data_ll() == -1)
3020                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3021
3022         /* Initialize device stats */
3023         memset(&dev_statistics, 0, sizeof(dev_statistics));
3024
3025         /* Enable stats if the user option is set. */
3026         if (enable_stats) {
3027                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3028                 if (ret != 0)
3029                         rte_exit(EXIT_FAILURE,
3030                                 "Cannot create print-stats thread\n");
3031
3032                 /* Set thread_name to aid debugging. */
3033                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3034                 ret = rte_thread_setname(tid, thread_name);
3035                 if (ret != 0)
3036                         RTE_LOG(ERR, VHOST_CONFIG,
3037                                 "Cannot set print-stats name\n");
3038         }
3039
3040         /* Launch all data cores. */
3041         if (zero_copy == 0) {
3042                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3043                         rte_eal_remote_launch(switch_worker,
3044                                 mbuf_pool, lcore_id);
3045                 }
3046         } else {
3047                 uint32_t count_in_mempool, index, i;
3048                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3049                         /* For all RX and TX queues. */
3050                         count_in_mempool
3051                                 = rte_mempool_count(vpool_array[index].pool);
3052
3053                         /*
3054                          * Transfer all unattached mbufs from vpool.pool
3055                          * to vpool.ring.
3056                          */
3057                         for (i = 0; i < count_in_mempool; i++) {
3058                                 struct rte_mbuf *mbuf
3059                                         = __rte_mbuf_raw_alloc(
3060                                                 vpool_array[index].pool);
3061                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3062                                                 (void *)mbuf);
3063                         }
3064
3065                         LOG_DEBUG(VHOST_CONFIG,
3066                                 "in main: mbuf count in mempool at initial "
3067                                 "is: %d\n", count_in_mempool);
3068                         LOG_DEBUG(VHOST_CONFIG,
3069                         "in main: mbuf count in ring at initial is:"
3070                                 " %d\n",
3071                                 rte_ring_count(vpool_array[index].ring));
3072                 }
3073
3074                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3075                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3076                                 lcore_id);
3077         }
3078
3079         if (mergeable == 0)
3080                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3081
3082         /* Register vhost (CUSE or user) driver to handle vhost messages. */
3083         ret = rte_vhost_driver_register((char *)&dev_basename);
3084         if (ret != 0)
3085                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3086
3087         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3088
3089         /* Start CUSE session. */
3090         rte_vhost_driver_session_start();
3091         return 0;
3092
3093 }