apps: use helper to create mbuf pools
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 512
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +     \
65                             (num_switching_cores * MAX_PKT_BURST) +       \
66                             (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
67                             (num_switching_cores * MBUF_CACHE_SIZE))
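/*
 * Worked example (illustrative only, assuming two switching cores):
 * 512*1024 + 2*32 + 2*512 + 2*128 = 525,632 mbufs per port.
 */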
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_DATA_SIZE (2048 + RTE_PKTMBUF_HEADROOM)
71
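/*
 * Illustrative sketch (not part of the upstream file): per the commit title,
 * the application's mbuf pool is created with the rte_pktmbuf_pool_create()
 * helper rather than a raw rte_mempool_create() call plus the
 * rte_pktmbuf_pool_init/rte_pktmbuf_init callbacks. The wrapper below, its
 * name and its parameters are assumptions made for illustration.
 */
static inline struct rte_mempool *
example_create_mbuf_pool(const char *name, unsigned nb_mbuf, int socket_id)
{
        /* 0 bytes of per-mbuf private area; data room sized as defined above */
        return rte_pktmbuf_pool_create(name, nb_mbuf, MBUF_CACHE_SIZE,
                        0, MBUF_DATA_SIZE, socket_id);
}
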
72 /*
73  * No frame data buffers allocated by the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and
75  * vhost uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_DATA_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM)
79 #define MBUF_CACHE_SIZE_ZCP 0
80
81 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
82 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
83
84 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
85 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
86
87 #define JUMBO_FRAME_MAX_SIZE    0x2600
88
89 /* State of virtio device. */
90 #define DEVICE_MAC_LEARNING 0
91 #define DEVICE_RX                       1
92 #define DEVICE_SAFE_REMOVE      2
93
94 /* Config_core_flag status definitions. */
95 #define REQUEST_DEV_REMOVAL 1
96 #define ACK_DEV_REMOVAL 0
97
98 /* Configurable number of RX/TX ring descriptors */
99 #define RTE_TEST_RX_DESC_DEFAULT 1024
100 #define RTE_TEST_TX_DESC_DEFAULT 512
101
102 /*
103  * These two macros need refining for the legacy and DPDK-based front ends:
104  * take the max vring avail descriptors/entries from the guest, subtract
105  * MAX_PKT_BURST, and then round to a power of 2.
106  */
107 /*
108  * For the legacy front end: 128 descriptors,
109  * half for the virtio header, the other half for the mbuf.
110  */
111 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
112 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
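/*
 * Worked example of the sizing rule above (our reading, legacy RX case):
 * 128 descriptors, of which half (64) carry mbuf data; 64 minus
 * MAX_PKT_BURST (32) gives 32, already a power of two, hence the default.
 */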
113
114 /* Get first 4 bytes in mbuf headroom. */
115 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
116                 + sizeof(struct rte_mbuf)))
117
118 /* true if x is a power of 2 */
119 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
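/* Note: POWEROF2(0) also evaluates to true, since ((0-1) & 0) == 0. */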
120
121 #define INVALID_PORT_ID 0xFF
122
123 /* Max number of devices. Limited by vmdq. */
124 #define MAX_DEVICES 64
125
126 /* Size of buffers used for snprintfs. */
127 #define MAX_PRINT_BUFF 6072
128
129 /* Maximum character device basename size. */
130 #define MAX_BASENAME_SZ 10
131
132 /* Maximum long option length for option parsing. */
133 #define MAX_LONG_OPT_SZ 64
134
135 /* Used to compare MAC addresses. */
136 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
137
138 /* Number of descriptors per cacheline. */
139 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
140
141 #define MBUF_EXT_MEM(mb)   (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
142
143 /* mask of enabled ports */
144 static uint32_t enabled_port_mask = 0;
145
146 /* Promiscuous mode */
147 static uint32_t promiscuous;
148
149 /*Number of switching cores enabled*/
150 static uint32_t num_switching_cores = 0;
151
152 /* number of devices/queues to support*/
153 static uint32_t num_queues = 0;
154 static uint32_t num_devices;
155
156 /*
157  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
158  * descriptors; disabled by default.
159  */
160 static uint32_t zero_copy;
161 static int mergeable;
162
163 /* Do VLAN strip on the host, enabled by default */
164 static uint32_t vlan_strip = 1;
165
166 /* number of descriptors to apply*/
167 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
168 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
169
170 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
171 #define MAX_RING_DESC 4096
172
173 struct vpool {
174         struct rte_mempool *pool;
175         struct rte_ring *ring;
176         uint32_t buf_size;
177 } vpool_array[MAX_QUEUES+MAX_QUEUES];
178
179 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
180 typedef enum {
181         VM2VM_DISABLED = 0,
182         VM2VM_SOFTWARE = 1,
183         VM2VM_HARDWARE = 2,
184         VM2VM_LAST
185 } vm2vm_type;
186 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
187
188 /* The type of host physical address translated from guest physical address. */
189 typedef enum {
190         PHYS_ADDR_CONTINUOUS = 0,
191         PHYS_ADDR_CROSS_SUBREG = 1,
192         PHYS_ADDR_INVALID = 2,
193         PHYS_ADDR_LAST
194 } hpa_type;
195
196 /* Enable stats. */
197 static uint32_t enable_stats = 0;
198 /* Enable retries on RX. */
199 static uint32_t enable_retry = 1;
200 /* Specify timeout (in useconds) between retries on RX. */
201 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
202 /* Specify the number of retries on RX. */
203 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
204
205 /* Character device basename. Can be set by user. */
206 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
207
208 /* Empty vmdq configuration structure. Filled in programmatically */
209 static struct rte_eth_conf vmdq_conf_default = {
210         .rxmode = {
211                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
212                 .split_hdr_size = 0,
213                 .header_split   = 0, /**< Header Split disabled */
214                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
215                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
216                 /*
217                  * VLAN strip is necessary for 1G NICs such as the I350;
218                  * it fixes a bug where IPv4 forwarding in the guest could not
219                  * forward packets from one virtio dev to another virtio dev.
220                  */
221                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
222                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
223                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
224         },
225
226         .txmode = {
227                 .mq_mode = ETH_MQ_TX_NONE,
228         },
229         .rx_adv_conf = {
230                 /*
231                  * should be overridden separately in code with
232                  * appropriate values
233                  */
234                 .vmdq_rx_conf = {
235                         .nb_queue_pools = ETH_8_POOLS,
236                         .enable_default_pool = 0,
237                         .default_pool = 0,
238                         .nb_pool_maps = 0,
239                         .pool_map = {{0, 0},},
240                 },
241         },
242 };
243
244 static unsigned lcore_ids[RTE_MAX_LCORE];
245 static uint8_t ports[RTE_MAX_ETHPORTS];
246 static unsigned num_ports = 0; /**< The number of ports specified in command line */
247 static uint16_t num_pf_queues, num_vmdq_queues;
248 static uint16_t vmdq_pool_base, vmdq_queue_base;
249 static uint16_t queues_per_pool;
250
251 static const uint16_t external_pkt_default_vlan_tag = 2000;
252 const uint16_t vlan_tags[] = {
253         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
254         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
255         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
256         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
257         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
258         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
259         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
260         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
261 };
262
263 /* ethernet addresses of ports */
264 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
265
266 /* heads for the main used and free linked lists for the data path. */
267 static struct virtio_net_data_ll *ll_root_used = NULL;
268 static struct virtio_net_data_ll *ll_root_free = NULL;
269
270 /* Array of data core structures containing information on individual core linked lists. */
271 static struct lcore_info lcore_info[RTE_MAX_LCORE];
272
273 /* Used for queueing bursts of TX packets. */
274 struct mbuf_table {
275         unsigned len;
276         unsigned txq_id;
277         struct rte_mbuf *m_table[MAX_PKT_BURST];
278 };
279
280 /* TX queue for each data core. */
281 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
282
283 /* TX queue for each virtio device for zero copy. */
284 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
285
286 /* Vlan header struct used to insert vlan tags on TX. */
287 struct vlan_ethhdr {
288         unsigned char   h_dest[ETH_ALEN];
289         unsigned char   h_source[ETH_ALEN];
290         __be16          h_vlan_proto;
291         __be16          h_vlan_TCI;
292         __be16          h_vlan_encapsulated_proto;
293 };
294
295 /* IPv4 Header */
296 struct ipv4_hdr {
297         uint8_t  version_ihl;           /**< version and header length */
298         uint8_t  type_of_service;       /**< type of service */
299         uint16_t total_length;          /**< length of packet */
300         uint16_t packet_id;             /**< packet ID */
301         uint16_t fragment_offset;       /**< fragmentation offset */
302         uint8_t  time_to_live;          /**< time to live */
303         uint8_t  next_proto_id;         /**< protocol ID */
304         uint16_t hdr_checksum;          /**< header checksum */
305         uint32_t src_addr;              /**< source address */
306         uint32_t dst_addr;              /**< destination address */
307 } __attribute__((__packed__));
308
309 /* Header lengths. */
310 #define VLAN_HLEN       4
311 #define VLAN_ETH_HLEN   18
312
313 /* Per-device statistics struct */
314 struct device_statistics {
315         uint64_t tx_total;
316         rte_atomic64_t rx_total_atomic;
317         uint64_t rx_total;
318         uint64_t tx;
319         rte_atomic64_t rx_atomic;
320         uint64_t rx;
321 } __rte_cache_aligned;
322 struct device_statistics dev_statistics[MAX_DEVICES];
323
324 /*
325  * Builds up the correct configuration for VMDQ VLAN pool map
326  * according to the pool & queue limits.
327  */
328 static inline int
329 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
330 {
331         struct rte_eth_vmdq_rx_conf conf;
332         struct rte_eth_vmdq_rx_conf *def_conf =
333                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
334         unsigned i;
335
336         memset(&conf, 0, sizeof(conf));
337         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
338         conf.nb_pool_maps = num_devices;
339         conf.enable_loop_back = def_conf->enable_loop_back;
340         conf.rx_mode = def_conf->rx_mode;
341
342         for (i = 0; i < conf.nb_pool_maps; i++) {
343                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
344                 conf.pool_map[i].pools = (1UL << i);
345         }
346
347         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
348         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
349                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
350         return 0;
351 }
352
353 /*
354  * Validate the device number against the max pool number obtained from
355  * dev_info. If the device number is invalid, print an error message and
356  * return -1. Each device must have its own pool.
357  */
358 static inline int
359 validate_num_devices(uint32_t max_nb_devices)
360 {
361         if (num_devices > max_nb_devices) {
362                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
363                 return -1;
364         }
365         return 0;
366 }
367
368 /*
369  * Initialises a given port using global settings and with the rx buffers
370  * coming from the mbuf_pool passed as parameter
371  */
372 static inline int
373 port_init(uint8_t port)
374 {
375         struct rte_eth_dev_info dev_info;
376         struct rte_eth_conf port_conf;
377         struct rte_eth_rxconf *rxconf;
378         struct rte_eth_txconf *txconf;
379         int16_t rx_rings, tx_rings;
380         uint16_t rx_ring_size, tx_ring_size;
381         int retval;
382         uint16_t q;
383
384         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
385         rte_eth_dev_info_get (port, &dev_info);
386
387         if (dev_info.max_rx_queues > MAX_QUEUES) {
388                 rte_exit(EXIT_FAILURE,
389                         "please define MAX_QUEUES no less than %u in %s\n",
390                         dev_info.max_rx_queues, __FILE__);
391         }
392
393         rxconf = &dev_info.default_rxconf;
394         txconf = &dev_info.default_txconf;
395         rxconf->rx_drop_en = 1;
396
397         /* Enable vlan offload */
398         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
399
400         /*
401          * Zero copy defers queue RX/TX start to the time when guest
402          * finishes its startup and packet buffers from that guest are
403          * available.
404          */
405         if (zero_copy) {
406                 rxconf->rx_deferred_start = 1;
407                 rxconf->rx_drop_en = 0;
408                 txconf->tx_deferred_start = 1;
409         }
410
411         /*configure the number of supported virtio devices based on VMDQ limits */
412         num_devices = dev_info.max_vmdq_pools;
413
414         if (zero_copy) {
415                 rx_ring_size = num_rx_descriptor;
416                 tx_ring_size = num_tx_descriptor;
417                 tx_rings = dev_info.max_tx_queues;
418         } else {
419                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
420                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
421                 tx_rings = (uint16_t)rte_lcore_count();
422         }
423
424         retval = validate_num_devices(MAX_DEVICES);
425         if (retval < 0)
426                 return retval;
427
428         /* Get port configuration. */
429         retval = get_eth_conf(&port_conf, num_devices);
430         if (retval < 0)
431                 return retval;
432         /* NIC queues are divided into pf queues and vmdq queues.  */
433         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
434         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
435         num_vmdq_queues = num_devices * queues_per_pool;
436         num_queues = num_pf_queues + num_vmdq_queues;
437         vmdq_queue_base = dev_info.vmdq_queue_base;
438         vmdq_pool_base  = dev_info.vmdq_pool_base;
439         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
440                 num_pf_queues, num_devices, queues_per_pool);
441
442         if (port >= rte_eth_dev_count()) return -1;
443
444         rx_rings = (uint16_t)dev_info.max_rx_queues;
445         /* Configure ethernet device. */
446         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
447         if (retval != 0)
448                 return retval;
449
450         /* Setup the queues. */
451         for (q = 0; q < rx_rings; q ++) {
452                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
453                                                 rte_eth_dev_socket_id(port),
454                                                 rxconf,
455                                                 vpool_array[q].pool);
456                 if (retval < 0)
457                         return retval;
458         }
459         for (q = 0; q < tx_rings; q ++) {
460                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
461                                                 rte_eth_dev_socket_id(port),
462                                                 txconf);
463                 if (retval < 0)
464                         return retval;
465         }
466
467         /* Start the device. */
468         retval  = rte_eth_dev_start(port);
469         if (retval < 0) {
470                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
471                 return retval;
472         }
473
474         if (promiscuous)
475                 rte_eth_promiscuous_enable(port);
476
477         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
481                         (unsigned)port,
482                         vmdq_ports_eth_addr[port].addr_bytes[0],
483                         vmdq_ports_eth_addr[port].addr_bytes[1],
484                         vmdq_ports_eth_addr[port].addr_bytes[2],
485                         vmdq_ports_eth_addr[port].addr_bytes[3],
486                         vmdq_ports_eth_addr[port].addr_bytes[4],
487                         vmdq_ports_eth_addr[port].addr_bytes[5]);
488
489         return 0;
490 }
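
/*
 * Usage sketch (an assumption: this mirrors what main() does later in this
 * file, it is not a quote of it): initialise every port that survived
 * check_ports_num(). The helper name is hypothetical.
 */
static inline int
example_init_all_ports(void)
{
        unsigned portid;

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] == INVALID_PORT_ID)
                        continue;
                if (port_init(ports[portid]) != 0)
                        return -1;
        }
        return 0;
}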
491
492 /*
493  * Set character device basename.
494  */
495 static int
496 us_vhost_parse_basename(const char *q_arg)
497 {
498         /* reject basenames that do not fit (leaving room for '\0') */
499
500         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
501                 return -1;
502         else
503                 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
504
505         return 0;
506 }
507
508 /*
509  * Parse the portmask provided at run time.
510  */
511 static int
512 parse_portmask(const char *portmask)
513 {
514         char *end = NULL;
515         unsigned long pm;
516
517         errno = 0;
518
519         /* parse hexadecimal string */
520         pm = strtoul(portmask, &end, 16);
521         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
522                 return -1;
523
524         if (pm == 0)
525                 return -1;
526
527         return pm;
528
529 }
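
/* The portmask is a hex bit field: e.g. "-p 0x1" selects only port 0,
 * while "-p 0x3" selects ports 0 and 1. */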
530
531 /*
532  * Parse num options at run time.
533  */
534 static int
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
536 {
537         char *end = NULL;
538         unsigned long num;
539
540         errno = 0;
541
542         /* parse unsigned int string */
543         num = strtoul(q_arg, &end, 10);
544         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
545                 return -1;
546
547         if (num > max_valid_value)
548                 return -1;
549
550         return num;
551
552 }
553
554 /*
555  * Display usage
556  */
557 static void
558 us_vhost_usage(const char *prgname)
559 {
560         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
561         "               --vm2vm [0|1|2]\n"
562         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563         "               --dev-basename <name>\n"
564         "               --nb-devices ND\n"
565         "               -p PORTMASK: Set mask for ports to be used by application\n"
566         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
568         "               --rx-retry-delay [0-N]: timeout(in usec) between retries on RX. This only takes effect if retries on rx are enabled\n"
569         "               --rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
570         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
572         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
573         "               --dev-basename: The basename to be used for the character device.\n"
574         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
575                         "zero copy\n"
576         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
577                         "used only when zero copy is enabled.\n"
578         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
579                         "used only when zero copy is enabled.\n",
580                prgname);
581 }
582
583 /*
584  * Parse the arguments given in the command line of the application.
585  */
586 static int
587 us_vhost_parse_args(int argc, char **argv)
588 {
589         int opt, ret;
590         int option_index;
591         unsigned i;
592         const char *prgname = argv[0];
593         static struct option long_option[] = {
594                 {"vm2vm", required_argument, NULL, 0},
595                 {"rx-retry", required_argument, NULL, 0},
596                 {"rx-retry-delay", required_argument, NULL, 0},
597                 {"rx-retry-num", required_argument, NULL, 0},
598                 {"mergeable", required_argument, NULL, 0},
599                 {"vlan-strip", required_argument, NULL, 0},
600                 {"stats", required_argument, NULL, 0},
601                 {"dev-basename", required_argument, NULL, 0},
602                 {"zero-copy", required_argument, NULL, 0},
603                 {"rx-desc-num", required_argument, NULL, 0},
604                 {"tx-desc-num", required_argument, NULL, 0},
605                 {NULL, 0, 0, 0},
606         };
607
608         /* Parse command line */
609         while ((opt = getopt_long(argc, argv, "p:P",
610                         long_option, &option_index)) != EOF) {
611                 switch (opt) {
612                 /* Portmask */
613                 case 'p':
614                         enabled_port_mask = parse_portmask(optarg);
615                         if (enabled_port_mask == 0) {
616                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
617                                 us_vhost_usage(prgname);
618                                 return -1;
619                         }
620                         break;
621
622                 case 'P':
623                         promiscuous = 1;
624                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
625                                 ETH_VMDQ_ACCEPT_BROADCAST |
626                                 ETH_VMDQ_ACCEPT_MULTICAST;
627                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
628
629                         break;
630
631                 case 0:
632                         /* Enable/disable vm2vm comms. */
633                         if (!strncmp(long_option[option_index].name, "vm2vm",
634                                 MAX_LONG_OPT_SZ)) {
635                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
636                                 if (ret == -1) {
637                                         RTE_LOG(INFO, VHOST_CONFIG,
638                                                 "Invalid argument for "
639                                                 "vm2vm [0|1|2]\n");
640                                         us_vhost_usage(prgname);
641                                         return -1;
642                                 } else {
643                                         vm2vm_mode = (vm2vm_type)ret;
644                                 }
645                         }
646
647                         /* Enable/disable retries on RX. */
648                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
649                                 ret = parse_num_opt(optarg, 1);
650                                 if (ret == -1) {
651                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
652                                         us_vhost_usage(prgname);
653                                         return -1;
654                                 } else {
655                                         enable_retry = ret;
656                                 }
657                         }
658
659                         /* Specify the retries delay time (in useconds) on RX. */
660                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
661                                 ret = parse_num_opt(optarg, INT32_MAX);
662                                 if (ret == -1) {
663                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
664                                         us_vhost_usage(prgname);
665                                         return -1;
666                                 } else {
667                                         burst_rx_delay_time = ret;
668                                 }
669                         }
670
671                         /* Specify the retries number on RX. */
672                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
673                                 ret = parse_num_opt(optarg, INT32_MAX);
674                                 if (ret == -1) {
675                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
676                                         us_vhost_usage(prgname);
677                                         return -1;
678                                 } else {
679                                         burst_rx_retry_num = ret;
680                                 }
681                         }
682
683                         /* Enable/disable RX mergeable buffers. */
684                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
685                                 ret = parse_num_opt(optarg, 1);
686                                 if (ret == -1) {
687                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
688                                         us_vhost_usage(prgname);
689                                         return -1;
690                                 } else {
691                                         mergeable = !!ret;
692                                         if (ret) {
693                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
694                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
695                                                         = JUMBO_FRAME_MAX_SIZE;
696                                         }
697                                 }
698                         }
699
700                         /* Enable/disable RX VLAN strip on host. */
701                         if (!strncmp(long_option[option_index].name,
702                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
703                                 ret = parse_num_opt(optarg, 1);
704                                 if (ret == -1) {
705                                         RTE_LOG(INFO, VHOST_CONFIG,
706                                                 "Invalid argument for VLAN strip [0|1]\n");
707                                         us_vhost_usage(prgname);
708                                         return -1;
709                                 } else {
710                                         vlan_strip = !!ret;
711                                         vmdq_conf_default.rxmode.hw_vlan_strip =
712                                                 vlan_strip;
713                                 }
714                         }
715
716                         /* Enable/disable stats. */
717                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
718                                 ret = parse_num_opt(optarg, INT32_MAX);
719                                 if (ret == -1) {
720                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
721                                         us_vhost_usage(prgname);
722                                         return -1;
723                                 } else {
724                                         enable_stats = ret;
725                                 }
726                         }
727
728                         /* Set character device basename. */
729                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
730                                 if (us_vhost_parse_basename(optarg) == -1) {
731                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
732                                         us_vhost_usage(prgname);
733                                         return -1;
734                                 }
735                         }
736
737                         /* Enable/disable rx/tx zero copy. */
738                         if (!strncmp(long_option[option_index].name,
739                                 "zero-copy", MAX_LONG_OPT_SZ)) {
740                                 ret = parse_num_opt(optarg, 1);
741                                 if (ret == -1) {
742                                         RTE_LOG(INFO, VHOST_CONFIG,
743                                                 "Invalid argument"
744                                                 " for zero-copy [0|1]\n");
745                                         us_vhost_usage(prgname);
746                                         return -1;
747                                 } else
748                                         zero_copy = ret;
749                         }
750
751                         /* Specify the descriptor number on RX. */
752                         if (!strncmp(long_option[option_index].name,
753                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
754                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
755                                 if ((ret == -1) || (!POWEROF2(ret))) {
756                                         RTE_LOG(INFO, VHOST_CONFIG,
757                                         "Invalid argument for rx-desc-num [0-N], "
758                                         "power of 2 required.\n");
759                                         us_vhost_usage(prgname);
760                                         return -1;
761                                 } else {
762                                         num_rx_descriptor = ret;
763                                 }
764                         }
765
766                         /* Specify the descriptor number on TX. */
767                         if (!strncmp(long_option[option_index].name,
768                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
769                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
770                                 if ((ret == -1) || (!POWEROF2(ret))) {
771                                         RTE_LOG(INFO, VHOST_CONFIG,
772                                         "Invalid argument for tx-desc-num [0-N], "
773                                         "power of 2 required.\n");
774                                         us_vhost_usage(prgname);
775                                         return -1;
776                                 } else {
777                                         num_tx_descriptor = ret;
778                                 }
779                         }
780
781                         break;
782
783                         /* Invalid option - print options. */
784                 default:
785                         us_vhost_usage(prgname);
786                         return -1;
787                 }
788         }
789
790         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
791                 if (enabled_port_mask & (1 << i))
792                         ports[num_ports++] = (uint8_t)i;
793         }
794
795         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
796                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
797                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
798                 return -1;
799         }
800
801         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
802                 RTE_LOG(INFO, VHOST_PORT,
803                         "Vhost zero copy doesn't support software vm2vm, "
804                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
805                 return -1;
806         }
807
808         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
809                 RTE_LOG(INFO, VHOST_PORT,
810                         "Vhost zero copy doesn't support jumbo frame, "
811                         "please specify '--mergeable 0' to disable the "
812                         "mergeable feature.\n");
813                 return -1;
814         }
815
816         return 0;
817 }
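
/*
 * Sketch of how the parser is typically driven from main() (an assumption
 * based on the standard DPDK pattern, not a quote of this file): EAL
 * consumes its own arguments first, then the remainder is handed to
 * us_vhost_parse_args(). The helper name is hypothetical.
 */
static inline int
example_parse_all_args(int argc, char **argv)
{
        int ret = rte_eal_init(argc, argv);

        if (ret < 0)
                return -1;
        argc -= ret;
        argv += ret;

        return us_vhost_parse_args(argc, argv);
}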
818
819 /*
820  * Update the global var NUM_PORTS and array PORTS according to the number
821  * of system ports and return the number of valid ports.
822  */
823 static unsigned check_ports_num(unsigned nb_ports)
824 {
825         unsigned valid_num_ports = num_ports;
826         unsigned portid;
827
828         if (num_ports > nb_ports) {
829                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
830                         num_ports, nb_ports);
831                 num_ports = nb_ports;
832         }
833
834         for (portid = 0; portid < num_ports; portid ++) {
835                 if (ports[portid] >= nb_ports) {
836                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
837                                 ports[portid], (nb_ports - 1));
838                         ports[portid] = INVALID_PORT_ID;
839                         valid_num_ports--;
840                 }
841         }
842         return valid_num_ports;
843 }
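
/*
 * Usage sketch (an assumption about how main() uses it): clamp the requested
 * ports against what the system actually exposes. The helper name is
 * hypothetical.
 */
static inline unsigned
example_valid_port_count(void)
{
        unsigned nb_ports = rte_eth_dev_count();

        return check_ports_num(nb_ports);
}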
844
845 /*
846  * Macro to print out packet contents. Wrapped in debug define so that the
847  * data path is not affected when debug is disabled.
848  */
849 #ifdef DEBUG
850 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
851         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
852         unsigned int index;                                                                                                                                                                                             \
853         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
854                                                                                                                                                                                                                                         \
855         if ((header))                                                                                                                                                                                                   \
856                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
857         else                                                                                                                                                                                                                    \
858                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
859         for (index = 0; index < (size); index++) {                                                                                                                                              \
860                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
861                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
862         }                                                                                                                                                                                                                               \
863         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
864                                                                                                                                                                                                                                         \
865         LOG_DEBUG(VHOST_DATA, "%s", packet);                                                                                                                                                                    \
866 } while(0)
867 #else
868 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
869 #endif
870
871 /*
872  * Function to convert guest physical addresses to vhost physical addresses.
873  * This is used to convert virtio buffer addresses.
874  */
875 static inline uint64_t __attribute__((always_inline))
876 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
877         uint32_t buf_len, hpa_type *addr_type)
878 {
879         struct virtio_memory_regions_hpa *region;
880         uint32_t regionidx;
881         uint64_t vhost_pa = 0;
882
883         *addr_type = PHYS_ADDR_INVALID;
884
885         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
886                 region = &vdev->regions_hpa[regionidx];
887                 if ((guest_pa >= region->guest_phys_address) &&
888                         (guest_pa <= region->guest_phys_address_end)) {
889                         vhost_pa = region->host_phys_addr_offset + guest_pa;
890                         if (likely((guest_pa + buf_len - 1)
891                                 <= region->guest_phys_address_end))
892                                 *addr_type = PHYS_ADDR_CONTINUOUS;
893                         else
894                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
895                         break;
896                 }
897         }
898
899         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
900                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
901                 (void *)(uintptr_t)vhost_pa);
902
903         return vhost_pa;
904 }
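
/*
 * Usage sketch (illustrative only): translate a guest physical buffer address
 * before using it for DMA and reject buffers that cross an HPA sub-region,
 * as the zero-copy path must. The helper name and the use of 0 as an
 * "unusable" return value are assumptions.
 */
static inline uint64_t
example_translate_gpa(struct vhost_dev *vdev, uint64_t guest_pa, uint32_t len)
{
        hpa_type addr_type;
        uint64_t hpa = gpa_to_hpa(vdev, guest_pa, len, &addr_type);

        if (addr_type != PHYS_ADDR_CONTINUOUS)
                return 0; /* not physically contiguous; caller must fall back */
        return hpa;
}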
905
906 /*
907  * Compares a packet destination MAC address to a device MAC address.
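 * (Both 6-byte addresses are read as 64-bit words; on little-endian CPUs
 * MAC_ADDR_CMP masks off the two bytes that follow each address in memory.)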
908  */
909 static inline int __attribute__((always_inline))
910 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
911 {
912         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
913 }
914
915 /*
916  * This function learns the MAC address of the device and registers this along with a
917  * vlan tag to a VMDQ.
918  */
919 static int
920 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
921 {
922         struct ether_hdr *pkt_hdr;
923         struct virtio_net_data_ll *dev_ll;
924         struct virtio_net *dev = vdev->dev;
925         int i, ret;
926
927         /* Learn MAC address of guest device from packet */
928         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
929
930         dev_ll = ll_root_used;
931
932         while (dev_ll != NULL) {
933                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
934                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
935                         return -1;
936                 }
937                 dev_ll = dev_ll->next;
938         }
939
940         for (i = 0; i < ETHER_ADDR_LEN; i++)
941                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
942
943         /* vlan_tag currently uses the device_id. */
944         vdev->vlan_tag = vlan_tags[dev->device_fh];
945
946         /* Print out VMDQ registration info. */
947         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
948                 dev->device_fh,
949                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
950                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
951                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
952                 vdev->vlan_tag);
953
954         /* Register the MAC address. */
955         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
956                                 (uint32_t)dev->device_fh + vmdq_pool_base);
957         if (ret)
958                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
959                                         dev->device_fh);
960
961         /* Enable stripping of the vlan tag as we handle routing. */
962         if (vlan_strip)
963                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
964                         (uint16_t)vdev->vmdq_rx_q, 1);
965
966         /* Set device as ready for RX. */
967         vdev->ready = DEVICE_RX;
968
969         return 0;
970 }
971
972 /*
973  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
974  * queue before disabling RX on the device.
975  */
976 static inline void
977 unlink_vmdq(struct vhost_dev *vdev)
978 {
979         unsigned i = 0;
980         unsigned rx_count;
981         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
982
983         if (vdev->ready == DEVICE_RX) {
984                 /*clear MAC and VLAN settings*/
985                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
986                 for (i = 0; i < 6; i++)
987                         vdev->mac_address.addr_bytes[i] = 0;
988
989                 vdev->vlan_tag = 0;
990
991                 /*Clear out the receive buffers*/
992                 rx_count = rte_eth_rx_burst(ports[0],
993                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
994
995                 while (rx_count) {
996                         for (i = 0; i < rx_count; i++)
997                                 rte_pktmbuf_free(pkts_burst[i]);
998
999                         rx_count = rte_eth_rx_burst(ports[0],
1000                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1001                 }
1002
1003                 vdev->ready = DEVICE_MAC_LEARNING;
1004         }
1005 }
1006
1007 /*
1008  * Check if the packet destination MAC address is for a local device. If so then put
1009  * the packet on that device's RX queue. If not then return.
1010  */
1011 static inline int __attribute__((always_inline))
1012 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1013 {
1014         struct virtio_net_data_ll *dev_ll;
1015         struct ether_hdr *pkt_hdr;
1016         uint64_t ret = 0;
1017         struct virtio_net *dev = vdev->dev;
1018         struct virtio_net *tdev; /* destination virtio device */
1019
1020         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1021
1022         /*get the used devices list*/
1023         dev_ll = ll_root_used;
1024
1025         while (dev_ll != NULL) {
1026                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1027                                           &dev_ll->vdev->mac_address)) {
1028
1029                         /* Drop the packet if the TX packet is destined for the TX device. */
1030                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1031                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1032                                                         dev->device_fh);
1033                                 return 0;
1034                         }
1035                         tdev = dev_ll->vdev->dev;
1036
1037
1038                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1039
1040                         if (unlikely(dev_ll->vdev->remove)) {
1041                                 /*drop the packet if the device is marked for removal*/
1042                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1043                         } else {
1044                                 /*send the packet to the local virtio device*/
1045                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1046                                 if (enable_stats) {
1047                                         rte_atomic64_add(
1048                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1049                                         1);
1050                                         rte_atomic64_add(
1051                                         &dev_statistics[tdev->device_fh].rx_atomic,
1052                                         ret);
1053                                         dev_statistics[tdev->device_fh].tx_total++;
1054                                         dev_statistics[tdev->device_fh].tx += ret;
1055                                 }
1056                         }
1057
1058                         return 0;
1059                 }
1060                 dev_ll = dev_ll->next;
1061         }
1062
1063         return -1;
1064 }
1065
1066 /*
1067  * Check if the destination MAC of a packet belongs to a local VM,
1068  * and if so, get its vlan tag and offset.
1069  */
1070 static inline int __attribute__((always_inline))
1071 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1072         uint32_t *offset, uint16_t *vlan_tag)
1073 {
1074         struct virtio_net_data_ll *dev_ll = ll_root_used;
1075         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1076
1077         while (dev_ll != NULL) {
1078                 if ((dev_ll->vdev->ready == DEVICE_RX)
1079                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1080                 &dev_ll->vdev->mac_address)) {
1081                         /*
1082                          * Drop the packet if the TX packet is
1083                          * destined for the TX device.
1084                          */
1085                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1086                                 LOG_DEBUG(VHOST_DATA,
1087                                 "(%"PRIu64") TX: Source and destination"
1088                                 " MAC addresses are the same. Dropping "
1089                                 "packet.\n",
1090                                 dev_ll->vdev->dev->device_fh);
1091                                 return -1;
1092                         }
1093
1094                         /*
1095                          * HW vlan strip will reduce the packet length
1096                          * by the length of the vlan tag, so we need to
1097                          * restore the packet length by adding it back.
1098                          */
1099                         *offset = VLAN_HLEN;
1100                         *vlan_tag =
1101                         (uint16_t)
1102                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1103
1104                         LOG_DEBUG(VHOST_DATA,
1105                         "(%"PRIu64") TX: pkt to local VM device id:"
1106                         "(%"PRIu64") vlan tag: %d.\n",
1107                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1108                         *vlan_tag);
1109
1110                         break;
1111                 }
1112                 dev_ll = dev_ll->next;
1113         }
1114         return 0;
1115 }
1116
1117 /*
1118  * This function routes the TX packet to the correct interface. This may be a local device
1119  * or the physical port.
1120  */
1121 static inline void __attribute__((always_inline))
1122 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1123 {
1124         struct mbuf_table *tx_q;
1125         struct rte_mbuf **m_table;
1126         unsigned len, ret, offset = 0;
1127         const uint16_t lcore_id = rte_lcore_id();
1128         struct virtio_net *dev = vdev->dev;
1129         struct ether_hdr *nh;
1130
1131         /*check if destination is local VM*/
1132         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1133                 rte_pktmbuf_free(m);
1134                 return;
1135         }
1136
1137         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1138                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1139                         rte_pktmbuf_free(m);
1140                         return;
1141                 }
1142         }
1143
1144         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1145
1146         /*Add packet to the port tx queue*/
1147         tx_q = &lcore_tx_queue[lcore_id];
1148         len = tx_q->len;
1149
1150         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1151         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1152                 /* Guest has inserted the vlan tag. */
1153                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1154                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1155                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1156                         (vh->vlan_tci != vlan_tag_be))
1157                         vh->vlan_tci = vlan_tag_be;
1158         } else {
1159                 m->ol_flags = PKT_TX_VLAN_PKT;
1160
1161                 /*
1162                  * Find the right seg to adjust the data len when offset is
1163                  * bigger than tail room size.
1164                  */
1165                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1166                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1167                                 m->data_len += offset;
1168                         else {
1169                                 struct rte_mbuf *seg = m;
1170
1171                                 while ((seg->next != NULL) &&
1172                                         (offset > rte_pktmbuf_tailroom(seg)))
1173                                         seg = seg->next;
1174
1175                                 seg->data_len += offset;
1176                         }
1177                         m->pkt_len += offset;
1178                 }
1179
1180                 m->vlan_tci = vlan_tag;
1181         }
1182
1183         tx_q->m_table[len] = m;
1184         len++;
1185         if (enable_stats) {
1186                 dev_statistics[dev->device_fh].tx_total++;
1187                 dev_statistics[dev->device_fh].tx++;
1188         }
1189
1190         if (unlikely(len == MAX_PKT_BURST)) {
1191                 m_table = (struct rte_mbuf **)tx_q->m_table;
1192                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1193                 /* Free any buffers not handled by TX and update the port stats. */
1194                 if (unlikely(ret < len)) {
1195                         do {
1196                                 rte_pktmbuf_free(m_table[ret]);
1197                         } while (++ret < len);
1198                 }
1199
1200                 len = 0;
1201         }
1202
1203         tx_q->len = len;
1204         return;
1205 }
1206 /*
1207  * This function is called by each data core. It handles all RX/TX registered with the
1208  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1209  * with all devices in the main linked list.
1210  */
1211 static int
1212 switch_worker(void *arg)
1213 {
1214         struct rte_mempool *mbuf_pool = arg;
1215         struct virtio_net *dev = NULL;
1216         struct vhost_dev *vdev = NULL;
1217         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1218         struct virtio_net_data_ll *dev_ll;
1219         struct mbuf_table *tx_q;
1220         volatile struct lcore_ll_info *lcore_ll;
1221         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1222         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1223         unsigned ret, i;
1224         const uint16_t lcore_id = rte_lcore_id();
1225         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1226         uint16_t rx_count = 0;
1227         uint16_t tx_count;
1228         uint32_t retry = 0;
1229
1230         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1231         lcore_ll = lcore_info[lcore_id].lcore_ll;
1232         prev_tsc = 0;
1233
1234         tx_q = &lcore_tx_queue[lcore_id];
1235         for (i = 0; i < num_cores; i ++) {
1236                 if (lcore_ids[i] == lcore_id) {
1237                         tx_q->txq_id = i;
1238                         break;
1239                 }
1240         }
1241
1242         while(1) {
1243                 cur_tsc = rte_rdtsc();
1244                 /*
1245                  * TX burst queue drain
1246                  */
1247                 diff_tsc = cur_tsc - prev_tsc;
1248                 if (unlikely(diff_tsc > drain_tsc)) {
1249
1250                         if (tx_q->len) {
1251                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1252
1253                                 /*Tx any packets in the queue*/
1254                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1255                                                                            (struct rte_mbuf **)tx_q->m_table,
1256                                                                            (uint16_t)tx_q->len);
1257                                 if (unlikely(ret < tx_q->len)) {
1258                                         do {
1259                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1260                                         } while (++ret < tx_q->len);
1261                                 }
1262
1263                                 tx_q->len = 0;
1264                         }
1265
1266                         prev_tsc = cur_tsc;
1267
1268                 }
1269
1270                 rte_prefetch0(lcore_ll->ll_root_used);
1271                 /*
1272                  * Inform the configuration core that we have exited the linked list and that no devices are
1273                  * in use if requested.
1274                  */
1275                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1276                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1277
1278                 /*
1279                  * Process devices
1280                  */
1281                 dev_ll = lcore_ll->ll_root_used;
1282
1283                 while (dev_ll != NULL) {
1284                         /*get virtio device ID*/
1285                         vdev = dev_ll->vdev;
1286                         dev = vdev->dev;
1287
1288                         if (unlikely(vdev->remove)) {
1289                                 dev_ll = dev_ll->next;
1290                                 unlink_vmdq(vdev);
1291                                 vdev->ready = DEVICE_SAFE_REMOVE;
1292                                 continue;
1293                         }
1294                         if (likely(vdev->ready == DEVICE_RX)) {
1295                                 /*Handle guest RX*/
1296                                 rx_count = rte_eth_rx_burst(ports[0],
1297                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1298
1299                                 if (rx_count) {
1300                                         /*
1301                                         * If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1302                                         * Note that MAX_PKT_BURST must be less than the virtio queue size.
1303                                         */
1304                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1305                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1306                                                         rte_delay_us(burst_rx_delay_time);
1307                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1308                                                                 break;
1309                                                 }
1310                                         }
1311                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
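                                        /*
                                         * ret_count is the number of packets actually placed in the
                                         * guest's RX ring; it can be less than rx_count if the vring
                                         * filled up despite the retries above.
                                         */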
1312                                         if (enable_stats) {
1313                                                 rte_atomic64_add(
1314                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1315                                                 rx_count);
1316                                                 rte_atomic64_add(
1317                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1318                                         }
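                                        /*
                                         * The enqueue copied the packet data into guest memory, so
                                         * the host-side mbufs can be freed unconditionally here.
                                         */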
1319                                         while (likely(rx_count)) {
1320                                                 rx_count--;
1321                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1322                                         }
1323
1324                                 }
1325                         }
1326
1327                         if (likely(!vdev->remove)) {
1328                                 /* Handle guest TX*/
1329                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1330                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1331                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1332                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1333                                                 while (tx_count)
1334                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1335                                         }
1336                                 }
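                                /*
                                 * Route each dequeued packet either to another local virtio
                                 * device or out through the physical port.
                                 */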
1337                                 while (tx_count)
1338                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1339                         }
1340
1341                         /*move to the next device in the list*/
1342                         dev_ll = dev_ll->next;
1343                 }
1344         }
1345
1346         return 0;
1347 }
1348
1349 /*
1350  * This function gets the number of available ring entries for zero copy RX.
1351  * Only one thread will call this function for a particular virtio device,
1352  * so it is designed as a non-thread-safe function.
1353  */
1354 static inline uint32_t __attribute__((always_inline))
1355 get_available_ring_num_zcp(struct virtio_net *dev)
1356 {
1357         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1358         uint16_t avail_idx;
1359
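        /*
         * avail->idx is read once as a volatile 16-bit value; its difference from the
         * reserved index is the number of entries the guest has made available.
         */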
1360         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1361         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1362 }
1363
1364 /*
1365  * This function gets available ring indexes for zero copy RX;
1366  * it will retry up to 'burst_rx_retry_num' times until it gets enough ring indexes.
1367  * Only one thread will call this function for a particular virtio device,
1368  * so it is designed as a non-thread-safe function.
1369  */
1370 static inline uint32_t __attribute__((always_inline))
1371 get_available_ring_index_zcp(struct virtio_net *dev,
1372         uint16_t *res_base_idx, uint32_t count)
1373 {
1374         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1375         uint16_t avail_idx;
1376         uint32_t retry = 0;
1377         uint16_t free_entries;
1378
1379         *res_base_idx = vq->last_used_idx_res;
1380         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1381         free_entries = (avail_idx - *res_base_idx);
1382
1383         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1384                         "avail idx: %d, "
1385                         "res base idx:%d, free entries:%d\n",
1386                         dev->device_fh, avail_idx, *res_base_idx,
1387                         free_entries);
1388
1389         /*
1390          * If retry is enabled and the queue is full then we wait
1391          * and retry to avoid packet loss.
1392          */
1393         if (enable_retry && unlikely(count > free_entries)) {
1394                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1395                         rte_delay_us(burst_rx_delay_time);
1396                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1397                         free_entries = (avail_idx - *res_base_idx);
1398                         if (count <= free_entries)
1399                                 break;
1400                 }
1401         }
1402
1403         /*check that we have enough buffers*/
1404         if (unlikely(count > free_entries))
1405                 count = free_entries;
1406
1407         if (unlikely(count == 0)) {
1408                 LOG_DEBUG(VHOST_DATA,
1409                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1410                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1411                         dev->device_fh, avail_idx,
1412                         *res_base_idx, free_entries);
1413                 return 0;
1414         }
1415
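        /*
         * Reserve 'count' entries by advancing last_used_idx_res; the caller consumes
         * the descriptors starting at *res_base_idx.
         */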
1416         vq->last_used_idx_res = *res_base_idx + count;
1417
1418         return count;
1419 }
1420
1421 /*
1422  * This function puts a descriptor back on the used list.
1423  */
1424 static inline void __attribute__((always_inline))
1425 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1426 {
1427         uint16_t res_cur_idx = vq->last_used_idx;
1428         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1429         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1430         rte_compiler_barrier();
1431         *(volatile uint16_t *)&vq->used->idx += 1;
1432         vq->last_used_idx += 1;
1433
1434         /* Kick the guest if necessary. */
1435         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1436                 eventfd_write((int)vq->callfd, 1);
1437 }
1438
1439 /*
1440  * This function gets an available descriptor from the virtio vring and an
1441  * unattached mbuf from vpool->ring, and then attaches them together. It needs to
1442  * adjust the offset of buff_addr and phys_addr according to the PMD implementation,
1443  * otherwise the frame data may be put at the wrong location in the mbuf.
1444  */
1445 static inline void __attribute__((always_inline))
1446 attach_rxmbuf_zcp(struct virtio_net *dev)
1447 {
1448         uint16_t res_base_idx, desc_idx;
1449         uint64_t buff_addr, phys_addr;
1450         struct vhost_virtqueue *vq;
1451         struct vring_desc *desc;
1452         struct rte_mbuf *mbuf = NULL;
1453         struct vpool *vpool;
1454         hpa_type addr_type;
1455         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1456
1457         vpool = &vpool_array[vdev->vmdq_rx_q];
1458         vq = dev->virtqueue[VIRTIO_RXQ];
1459
1460         do {
1461                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1462                                 1) != 1))
1463                         return;
1464                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1465
1466                 desc = &vq->desc[desc_idx];
1467                 if (desc->flags & VRING_DESC_F_NEXT) {
1468                         desc = &vq->desc[desc->next];
1469                         buff_addr = gpa_to_vva(dev, desc->addr);
1470                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1471                                         &addr_type);
1472                 } else {
1473                         buff_addr = gpa_to_vva(dev,
1474                                         desc->addr + vq->vhost_hlen);
1475                         phys_addr = gpa_to_hpa(vdev,
1476                                         desc->addr + vq->vhost_hlen,
1477                                         desc->len, &addr_type);
1478                 }
1479
1480                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1481                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1482                                 " address found when attaching RX frame buffer"
1483                                 " address!\n", dev->device_fh);
1484                         put_desc_to_used_list_zcp(vq, desc_idx);
1485                         continue;
1486                 }
1487
1488                 /*
1489                  * Check if the frame buffer address from guest crosses
1490                  * sub-region or not.
1491                  */
1492                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1493                         RTE_LOG(ERR, VHOST_DATA,
1494                                 "(%"PRIu64") Frame buffer address cross "
1495                                 "sub-region found when attaching RX frame "
1496                                 "buffer address!\n",
1497                                 dev->device_fh);
1498                         put_desc_to_used_list_zcp(vq, desc_idx);
1499                         continue;
1500                 }
1501         } while (unlikely(phys_addr == 0));
1502
1503         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1504         if (unlikely(mbuf == NULL)) {
1505                 LOG_DEBUG(VHOST_DATA,
1506                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1507                         "ring_sc_dequeue fail.\n",
1508                         dev->device_fh);
1509                 put_desc_to_used_list_zcp(vq, desc_idx);
1510                 return;
1511         }
1512
1513         if (unlikely(vpool->buf_size > desc->len)) {
1514                 LOG_DEBUG(VHOST_DATA,
1515                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1516                         "length(%d) of descriptor idx: %d less than room "
1517                         "size required: %d\n",
1518                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1519                 put_desc_to_used_list_zcp(vq, desc_idx);
1520                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1521                 return;
1522         }
1523
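        /*
         * Make the mbuf point at the guest buffer: buf_addr and buf_physaddr are rewound
         * by RTE_PKTMBUF_HEADROOM so that data_off lands exactly on the descriptor's data
         * address, and the descriptor index is stashed in the mbuf headroom for later
         * completion.
         */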
1524         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1525         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1526         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1527         mbuf->data_len = desc->len;
1528         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1529
1530         LOG_DEBUG(VHOST_DATA,
1531                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1532                 "descriptor idx:%d\n",
1533                 dev->device_fh, res_base_idx, desc_idx);
1534
1535         __rte_mbuf_raw_free(mbuf);
1536
1537         return;
1538 }
1539
1540 /*
1541  * Detach an attached packet mbuf -
1542  *  - restore original mbuf address and length values.
1543  *  - reset pktmbuf data and data_len to their default values.
1544  *  All other fields of the given packet mbuf will be left intact.
1545  *
1546  * @param m
1547  *   The attached packet mbuf.
1548  */
1549 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1550 {
1551         const struct rte_mempool *mp = m->pool;
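        /*
         * The original data buffer sits immediately after the mbuf header inside the
         * mempool element, so its address, physical address and length can be
         * recomputed from the element layout.
         */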
1552         void *buf = RTE_MBUF_TO_BADDR(m);
1553         uint32_t buf_ofs;
1554         uint32_t buf_len = mp->elt_size - sizeof(*m);
1555         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1556
1557         m->buf_addr = buf;
1558         m->buf_len = (uint16_t)buf_len;
1559
1560         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1561                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1562         m->data_off = buf_ofs;
1563
1564         m->data_len = 0;
1565 }
1566
1567 /*
1568  * This function is called after packets have been transmitted. It fetches mbufs
1569  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates the
1570  * used index and kicks the guest if necessary.
1571  */
1572 static inline uint32_t __attribute__((always_inline))
1573 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1574 {
1575         struct rte_mbuf *mbuf;
1576         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1577         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1578         uint32_t index = 0;
1579         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1580
1581         LOG_DEBUG(VHOST_DATA,
1582                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1583                 "clean is: %d\n",
1584                 dev->device_fh, mbuf_count);
1585         LOG_DEBUG(VHOST_DATA,
1586                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1587                 "clean  is : %d\n",
1588                 dev->device_fh, rte_ring_count(vpool->ring));
1589
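        /*
         * Pull each mbuf back from the mempool, detach it from the guest buffer, park it
         * on vpool->ring, and return the descriptor index saved in its headroom to the
         * used ring.
         */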
1590         for (index = 0; index < mbuf_count; index++) {
1591                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1592                 if (likely(MBUF_EXT_MEM(mbuf)))
1593                         pktmbuf_detach_zcp(mbuf);
1594                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1595
1596                 /* Update used index buffer information. */
1597                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1598                 vq->used->ring[used_idx].len = 0;
1599
1600                 used_idx = (used_idx + 1) & (vq->size - 1);
1601         }
1602
1603         LOG_DEBUG(VHOST_DATA,
1604                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1605                 "clean is: %d\n",
1606                 dev->device_fh, rte_mempool_count(vpool->pool));
1607         LOG_DEBUG(VHOST_DATA,
1608                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1609                 "clean  is : %d\n",
1610                 dev->device_fh, rte_ring_count(vpool->ring));
1611         LOG_DEBUG(VHOST_DATA,
1612                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1613                 "vq->last_used_idx:%d\n",
1614                 dev->device_fh, vq->last_used_idx);
1615
1616         vq->last_used_idx += mbuf_count;
1617
1618         LOG_DEBUG(VHOST_DATA,
1619                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1620                 "vq->last_used_idx:%d\n",
1621                 dev->device_fh, vq->last_used_idx);
1622
1623         rte_compiler_barrier();
1624
1625         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1626
1627         /* Kick guest if required. */
1628         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1629                 eventfd_write((int)vq->callfd, 1);
1630
1631         return 0;
1632 }
1633
1634 /*
1635  * This function is called when a virtio device is destroyed.
1636  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1637  */
1638 static void mbuf_destroy_zcp(struct vpool *vpool)
1639 {
1640         struct rte_mbuf *mbuf = NULL;
1641         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1642
1643         LOG_DEBUG(VHOST_CONFIG,
1644                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1645                 "mbuf_destroy_zcp is: %d\n",
1646                 mbuf_count);
1647         LOG_DEBUG(VHOST_CONFIG,
1648                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1649                 "mbuf_destroy_zcp  is : %d\n",
1650                 rte_ring_count(vpool->ring));
1651
1652         for (index = 0; index < mbuf_count; index++) {
1653                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1654                 if (likely(mbuf != NULL)) {
1655                         if (likely(MBUF_EXT_MEM(mbuf)))
1656                                 pktmbuf_detach_zcp(mbuf);
1657                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1658                 }
1659         }
1660
1661         LOG_DEBUG(VHOST_CONFIG,
1662                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1663                 "mbuf_destroy_zcp is: %d\n",
1664                 rte_mempool_count(vpool->pool));
1665         LOG_DEBUG(VHOST_CONFIG,
1666                 "in mbuf_destroy_zcp: mbuf count in ring after "
1667                 "mbuf_destroy_zcp is : %d\n",
1668                 rte_ring_count(vpool->ring));
1669 }
1670
1671 /*
1672  * This function completes the zero copy RX path: it writes the virtio header for each packet, updates the used ring and used index, and kicks the guest if necessary.
1673  */
1674 static inline uint32_t __attribute__((always_inline))
1675 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1676         uint32_t count)
1677 {
1678         struct vhost_virtqueue *vq;
1679         struct vring_desc *desc;
1680         struct rte_mbuf *buff;
1681         /* The virtio_hdr is initialised to 0. */
1682         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1683                 = {{0, 0, 0, 0, 0, 0}, 0};
1684         uint64_t buff_hdr_addr = 0;
1685         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1686         uint32_t head_idx, packet_success = 0;
1687         uint16_t res_cur_idx;
1688
1689         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1690
1691         if (count == 0)
1692                 return 0;
1693
1694         vq = dev->virtqueue[VIRTIO_RXQ];
1695         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1696
1697         res_cur_idx = vq->last_used_idx;
1698         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1699                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1700
1701         /* Retrieve all of the head indexes first to avoid caching issues. */
1702         for (head_idx = 0; head_idx < count; head_idx++)
1703                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1704
1705         /*Prefetch descriptor index. */
1706         rte_prefetch0(&vq->desc[head[packet_success]]);
1707
1708         while (packet_success != count) {
1709                 /* Get descriptor from available ring */
1710                 desc = &vq->desc[head[packet_success]];
1711
1712                 buff = pkts[packet_success];
1713                 LOG_DEBUG(VHOST_DATA,
1714                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1715                         "pkt[%d] descriptor idx: %d\n",
1716                         dev->device_fh, packet_success,
1717                         MBUF_HEADROOM_UINT32(buff));
1718
1719                 PRINT_PACKET(dev,
1720                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1721                         + RTE_PKTMBUF_HEADROOM),
1722                         rte_pktmbuf_data_len(buff), 0);
1723
1724                 /* Buffer address translation for virtio header. */
1725                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1726                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1727
1728                 /*
1729                  * If the descriptors are chained the header and data are
1730                  * placed in separate buffers.
1731                  */
1732                 if (desc->flags & VRING_DESC_F_NEXT) {
1733                         desc->len = vq->vhost_hlen;
1734                         desc = &vq->desc[desc->next];
1735                         desc->len = rte_pktmbuf_data_len(buff);
1736                 } else {
1737                         desc->len = packet_len;
1738                 }
1739
1740                 /* Update used ring with desc information */
1741                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1742                         = head[packet_success];
1743                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1744                         = packet_len;
1745                 res_cur_idx++;
1746                 packet_success++;
1747
1748                 /* A header is required per buffer. */
1749                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1750                         (const void *)&virtio_hdr, vq->vhost_hlen);
1751
1752                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1753
1754                 if (likely(packet_success < count)) {
1755                         /* Prefetch descriptor index. */
1756                         rte_prefetch0(&vq->desc[head[packet_success]]);
1757                 }
1758         }
1759
1760         rte_compiler_barrier();
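        /* Prevent the used-ring writes above from being reordered past the used->idx update below. */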
1761
1762         LOG_DEBUG(VHOST_DATA,
1763                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1764                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1765                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1766
1767         *(volatile uint16_t *)&vq->used->idx += count;
1768         vq->last_used_idx += count;
1769
1770         LOG_DEBUG(VHOST_DATA,
1771                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1772                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1773                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1774
1775         /* Kick the guest if necessary. */
1776         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1777                 eventfd_write((int)vq->callfd, 1);
1778
1779         return count;
1780 }
1781
1782 /*
1783  * This function routes the TX packet to the correct interface.
1784  * This may be a local device or the physical port.
1785  */
1786 static inline void __attribute__((always_inline))
1787 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1788         uint32_t desc_idx, uint8_t need_copy)
1789 {
1790         struct mbuf_table *tx_q;
1791         struct rte_mbuf **m_table;
1792         struct rte_mbuf *mbuf = NULL;
1793         unsigned len, ret, offset = 0;
1794         struct vpool *vpool;
1795         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1796         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1797
1798         /*Add packet to the port tx queue*/
1799         tx_q = &tx_queue_zcp[vmdq_rx_q];
1800         len = tx_q->len;
1801
1802         /* Allocate an mbuf and populate the structure. */
1803         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1804         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1805         if (unlikely(mbuf == NULL)) {
1806                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1807                 RTE_LOG(ERR, VHOST_DATA,
1808                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1809                         dev->device_fh);
1810                 put_desc_to_used_list_zcp(vq, desc_idx);
1811                 return;
1812         }
1813
1814         if (vm2vm_mode == VM2VM_HARDWARE) {
1815                 /* Avoid using a VLAN tag from any VM for an external packet, such as
1816                  * vlan_tags[dev->device_fh]; otherwise it conflicts with pool
1817                  * selection: the MAC address identifies it as an external packet
1818                  * that should go out to the network, while the VLAN tag identifies it
1819                  * as a VM2VM packet that should be forwarded to another VM. The hardware
1820                  * cannot resolve such an ambiguous situation, so the packet would be lost.
1821                  */
1822                 vlan_tag = external_pkt_default_vlan_tag;
1823                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1824                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1825                         __rte_mbuf_raw_free(mbuf);
1826                         return;
1827                 }
1828         }
1829
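        /*
         * Build the TX mbuf as a shadow of the guest buffer: the payload is copied only
         * when need_copy is set (i.e. the guest buffer crosses a host physical
         * sub-region); otherwise the new mbuf simply aliases the guest's buffer.
         */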
1830         mbuf->nb_segs = m->nb_segs;
1831         mbuf->next = m->next;
1832         mbuf->data_len = m->data_len + offset;
1833         mbuf->pkt_len = mbuf->data_len;
1834         if (unlikely(need_copy)) {
1835                 /* Copy the packet contents to the mbuf. */
1836                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1837                         rte_pktmbuf_mtod(m, void *),
1838                         m->data_len);
1839         } else {
1840                 mbuf->data_off = m->data_off;
1841                 mbuf->buf_physaddr = m->buf_physaddr;
1842                 mbuf->buf_addr = m->buf_addr;
1843         }
1844         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1845         mbuf->vlan_tci = vlan_tag;
1846         mbuf->l2_len = sizeof(struct ether_hdr);
1847         mbuf->l3_len = sizeof(struct ipv4_hdr);
1848         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1849
1850         tx_q->m_table[len] = mbuf;
1851         len++;
1852
1853         LOG_DEBUG(VHOST_DATA,
1854                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1855                 dev->device_fh,
1856                 mbuf->nb_segs,
1857                 (mbuf->next == NULL) ? "null" : "non-null");
1858
1859         if (enable_stats) {
1860                 dev_statistics[dev->device_fh].tx_total++;
1861                 dev_statistics[dev->device_fh].tx++;
1862         }
1863
1864         if (unlikely(len == MAX_PKT_BURST)) {
1865                 m_table = (struct rte_mbuf **)tx_q->m_table;
1866                 ret = rte_eth_tx_burst(ports[0],
1867                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1868
1869                 /*
1870                  * Free any buffers not handled by TX and update
1871                  * the port stats.
1872                  */
1873                 if (unlikely(ret < len)) {
1874                         do {
1875                                 rte_pktmbuf_free(m_table[ret]);
1876                         } while (++ret < len);
1877                 }
1878
1879                 len = 0;
1880                 txmbuf_clean_zcp(dev, vpool);
1881         }
1882
1883         tx_q->len = len;
1884
1885         return;
1886 }
1887
1888 /*
1889  * This function transmits all available packets in the virtio TX queue for one
1890  * virtio-net device. If it is the first packet, it learns the MAC address and
1891  * sets up VMDQ.
1892  */
1893 static inline void __attribute__((always_inline))
1894 virtio_dev_tx_zcp(struct virtio_net *dev)
1895 {
1896         struct rte_mbuf m;
1897         struct vhost_virtqueue *vq;
1898         struct vring_desc *desc;
1899         uint64_t buff_addr = 0, phys_addr;
1900         uint32_t head[MAX_PKT_BURST];
1901         uint32_t i;
1902         uint16_t free_entries, packet_success = 0;
1903         uint16_t avail_idx;
1904         uint8_t need_copy = 0;
1905         hpa_type addr_type;
1906         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1907
1908         vq = dev->virtqueue[VIRTIO_TXQ];
1909         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1910
1911         /* If there are no available buffers then return. */
1912         if (vq->last_used_idx_res == avail_idx)
1913                 return;
1914
1915         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1916
1917         /* Prefetch available ring to retrieve head indexes. */
1918         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1919
1920         /* Get the number of free entries in the ring */
1921         free_entries = (avail_idx - vq->last_used_idx_res);
1922
1923         /* Limit to MAX_PKT_BURST. */
1924         free_entries
1925                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1926
1927         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1928                 dev->device_fh, free_entries);
1929
1930         /* Retrieve all of the head indexes first to avoid caching issues. */
1931         for (i = 0; i < free_entries; i++)
1932                 head[i]
1933                         = vq->avail->ring[(vq->last_used_idx_res + i)
1934                         & (vq->size - 1)];
1935
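        /*
         * Reserve all of the fetched entries; the descriptors are returned to the used
         * ring later, by txmbuf_clean_zcp() once transmission has completed or by
         * put_desc_to_used_list_zcp() on error paths.
         */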
1936         vq->last_used_idx_res += free_entries;
1937
1938         /* Prefetch descriptor index. */
1939         rte_prefetch0(&vq->desc[head[packet_success]]);
1940         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1941
1942         while (packet_success < free_entries) {
1943                 desc = &vq->desc[head[packet_success]];
1944
1945                 /* Discard first buffer as it is the virtio header */
1946                 desc = &vq->desc[desc->next];
1947
1948                 /* Buffer address translation. */
1949                 buff_addr = gpa_to_vva(dev, desc->addr);
1950                 /* Check an extra VLAN_HLEN of space since a VLAN tag may be inserted */
1951                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1952                         &addr_type);
1953
1954                 if (likely(packet_success < (free_entries - 1)))
1955                         /* Prefetch descriptor index. */
1956                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1957
1958                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1959                         RTE_LOG(ERR, VHOST_DATA,
1960                                 "(%"PRIu64") Invalid frame buffer address found"
1961                                 "when TX packets!\n",
1962                                 dev->device_fh);
1963                         packet_success++;
1964                         continue;
1965                 }
1966
1967                 /* Prefetch buffer address. */
1968                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1969
1970                 /*
1971                  * Setup dummy mbuf. This is copied to a real mbuf if
1972                  * transmitted out the physical port.
1973                  */
1974                 m.data_len = desc->len;
1975                 m.nb_segs = 1;
1976                 m.next = NULL;
1977                 m.data_off = 0;
1978                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1979                 m.buf_physaddr = phys_addr;
1980
1981                 /*
1982                  * Check if the frame buffer address from guest crosses
1983                  * sub-region or not.
1984                  */
1985                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1986                         RTE_LOG(ERR, VHOST_DATA,
1987                                 "(%"PRIu64") Frame buffer address cross "
1988                                 "sub-region found when attaching TX frame "
1989                                 "buffer address!\n",
1990                                 dev->device_fh);
1991                         need_copy = 1;
1992                 } else
1993                         need_copy = 0;
1994
1995                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1996
1997                 /*
1998                  * If this is the first received packet we need to learn
1999                  * the MAC and setup VMDQ
2000                  */
2001                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2002                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2003                                 /*
2004                                  * Discard frame if device is scheduled for
2005                                  * removal or a duplicate MAC address is found.
2006                                  */
2007                                 packet_success += free_entries;
2008                                 vq->last_used_idx += packet_success;
2009                                 break;
2010                         }
2011                 }
2012
2013                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2014                 packet_success++;
2015         }
2016 }
2017
2018 /*
2019  * This function is called by each data core. It handles all RX/TX registered
2020  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2021  * addresses are compared with all devices in the main linked list.
2022  */
2023 static int
2024 switch_worker_zcp(__attribute__((unused)) void *arg)
2025 {
2026         struct virtio_net *dev = NULL;
2027         struct vhost_dev  *vdev = NULL;
2028         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2029         struct virtio_net_data_ll *dev_ll;
2030         struct mbuf_table *tx_q;
2031         volatile struct lcore_ll_info *lcore_ll;
2032         const uint64_t drain_tsc
2033                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2034                 * BURST_TX_DRAIN_US;
2035         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2036         unsigned ret;
2037         const uint16_t lcore_id = rte_lcore_id();
2038         uint16_t count_in_ring, rx_count = 0;
2039
2040         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2041
2042         lcore_ll = lcore_info[lcore_id].lcore_ll;
2043         prev_tsc = 0;
2044
2045         while (1) {
2046                 cur_tsc = rte_rdtsc();
2047
2048                 /* TX burst queue drain */
2049                 diff_tsc = cur_tsc - prev_tsc;
2050                 if (unlikely(diff_tsc > drain_tsc)) {
2051                         /*
2052                          * Get mbuf from vpool.pool and detach mbuf and
2053                          * put back into vpool.ring.
2054                          */
2055                         dev_ll = lcore_ll->ll_root_used;
2056                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2057                                 /* Get virtio device ID */
2058                                 vdev = dev_ll->vdev;
2059                                 dev = vdev->dev;
2060
2061                                 if (likely(!vdev->remove)) {
2062                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2063                                         if (tx_q->len) {
2064                                                 LOG_DEBUG(VHOST_DATA,
2065                                                 "TX queue drained after timeout"
2066                                                 " with burst size %u\n",
2067                                                 tx_q->len);
2068
2069                                                 /*
2070                                                  * Tx any packets in the queue
2071                                                  */
2072                                                 ret = rte_eth_tx_burst(
2073                                                         ports[0],
2074                                                         (uint16_t)tx_q->txq_id,
2075                                                         (struct rte_mbuf **)
2076                                                         tx_q->m_table,
2077                                                         (uint16_t)tx_q->len);
2078                                                 if (unlikely(ret < tx_q->len)) {
2079                                                         do {
2080                                                                 rte_pktmbuf_free(
2081                                                                         tx_q->m_table[ret]);
2082                                                         } while (++ret < tx_q->len);
2083                                                 }
2084                                                 tx_q->len = 0;
2085
2086                                                 txmbuf_clean_zcp(dev,
2087                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2088                                         }
2089                                 }
2090                                 dev_ll = dev_ll->next;
2091                         }
2092                         prev_tsc = cur_tsc;
2093                 }
2094
2095                 rte_prefetch0(lcore_ll->ll_root_used);
2096
2097                 /*
2098                  * Inform the configuration core that we have exited the linked
2099                  * list and that no devices are in use if requested.
2100                  */
2101                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2102                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2103
2104                 /* Process devices */
2105                 dev_ll = lcore_ll->ll_root_used;
2106
2107                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2108                         vdev = dev_ll->vdev;
2109                         dev  = vdev->dev;
2110                         if (unlikely(vdev->remove)) {
2111                                 dev_ll = dev_ll->next;
2112                                 unlink_vmdq(vdev);
2113                                 vdev->ready = DEVICE_SAFE_REMOVE;
2114                                 continue;
2115                         }
2116
2117                         if (likely(vdev->ready == DEVICE_RX)) {
2118                                 uint32_t index = vdev->vmdq_rx_q;
2119                                 uint16_t i;
2120                                 count_in_ring =
2121                                         rte_ring_count(vpool_array[index].ring);
2122                                 uint16_t free_entries =
2123                                         (uint16_t)get_available_ring_num_zcp(dev);
2124
2125                                 /*
2126                                  * Attach all mbufs in vpool.ring and put back
2127                                  * into vpool.pool.
2128                                  */
2129                                 for (i = 0;
2130                                         i < RTE_MIN(free_entries,
2131                                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2132                                         i++)
2133                                         attach_rxmbuf_zcp(dev);
2134
2135                                 /* Handle guest RX */
2136                                 rx_count = rte_eth_rx_burst(ports[0],
2137                                         vdev->vmdq_rx_q, pkts_burst,
2138                                         MAX_PKT_BURST);
2139
2140                                 if (rx_count) {
2141                                         ret_count = virtio_dev_rx_zcp(dev,
2142                                                         pkts_burst, rx_count);
2143                                         if (enable_stats) {
2144                                                 dev_statistics[dev->device_fh].rx_total
2145                                                         += rx_count;
2146                                                 dev_statistics[dev->device_fh].rx
2147                                                         += ret_count;
2148                                         }
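                                        /*
                                         * In the zero copy path the received mbufs wrap guest
                                         * buffers; detach each one and recycle the wrapper back
                                         * onto the vpool ring instead of freeing it.
                                         */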
2149                                         while (likely(rx_count)) {
2150                                                 rx_count--;
2151                                                 pktmbuf_detach_zcp(
2152                                                         pkts_burst[rx_count]);
2153                                                 rte_ring_sp_enqueue(
2154                                                         vpool_array[index].ring,
2155                                                         (void *)pkts_burst[rx_count]);
2156                                         }
2157                                 }
2158                         }
2159
2160                         if (likely(!vdev->remove))
2161                                 /* Handle guest TX */
2162                                 virtio_dev_tx_zcp(dev);
2163
2164                         /* Move to the next device in the list */
2165                         dev_ll = dev_ll->next;
2166                 }
2167         }
2168
2169         return 0;
2170 }
2171
2172
2173 /*
2174  * Add an entry to a used linked list. A free entry must first be found
2175  * in the free linked list using get_data_ll_free_entry();
2176  */
2177 static void
2178 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2179         struct virtio_net_data_ll *ll_dev)
2180 {
2181         struct virtio_net_data_ll *ll = *ll_root_addr;
2182
2183         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2184         ll_dev->next = NULL;
2185         rte_compiler_barrier();
2186
2187         /* If ll == NULL then this is the first device. */
2188         if (ll) {
2189                 /* Increment to the tail of the linked list. */
2190                 while (ll->next != NULL)
2191                         ll = ll->next;
2192
2193                 ll->next = ll_dev;
2194         } else {
2195                 *ll_root_addr = ll_dev;
2196         }
2197 }
2198
2199 /*
2200  * Remove an entry from a used linked list. The entry must then be added to
2201  * the free linked list using put_data_ll_free_entry().
2202  */
2203 static void
2204 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2205         struct virtio_net_data_ll *ll_dev,
2206         struct virtio_net_data_ll *ll_dev_last)
2207 {
2208         struct virtio_net_data_ll *ll = *ll_root_addr;
2209
2210         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2211                 return;
2212
2213         if (ll_dev == ll)
2214                 *ll_root_addr = ll_dev->next;
2215         else
2216                 if (likely(ll_dev_last != NULL))
2217                         ll_dev_last->next = ll_dev->next;
2218                 else
2219                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2220 }
2221
2222 /*
2223  * Find and return an entry from the free linked list.
2224  */
2225 static struct virtio_net_data_ll *
2226 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2227 {
2228         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2229         struct virtio_net_data_ll *ll_dev;
2230
2231         if (ll_free == NULL)
2232                 return NULL;
2233
2234         ll_dev = ll_free;
2235         *ll_root_addr = ll_free->next;
2236
2237         return ll_dev;
2238 }
2239
2240 /*
2241  * Place an entry back on to the free linked list.
2242  */
2243 static void
2244 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2245         struct virtio_net_data_ll *ll_dev)
2246 {
2247         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2248
2249         if (ll_dev == NULL)
2250                 return;
2251
2252         ll_dev->next = ll_free;
2253         *ll_root_addr = ll_dev;
2254 }
2255
2256 /*
2257  * Creates a linked list of a given size.
2258  */
2259 static struct virtio_net_data_ll *
2260 alloc_data_ll(uint32_t size)
2261 {
2262         struct virtio_net_data_ll *ll_new;
2263         uint32_t i;
2264
2265         /* Malloc and then chain the linked list. */
2266         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2267         if (ll_new == NULL) {
2268                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2269                 return NULL;
2270         }
2271
2272         for (i = 0; i < size - 1; i++) {
2273                 ll_new[i].vdev = NULL;
2274                 ll_new[i].next = &ll_new[i+1];
2275         }
2276         ll_new[i].next = NULL;
2277
2278         return (ll_new);
2279 }
2280
2281 /*
2282  * Create the main linked list along with each individual core's linked list. A used and a free list
2283  * are created to manage entries.
2284  */
2285 static int
2286 init_data_ll (void)
2287 {
2288         int lcore;
2289
2290         RTE_LCORE_FOREACH_SLAVE(lcore) {
2291                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2292                 if (lcore_info[lcore].lcore_ll == NULL) {
2293                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2294                         return -1;
2295                 }
2296
2297                 lcore_info[lcore].lcore_ll->device_num = 0;
2298                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2299                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2300                 if (num_devices % num_switching_cores)
2301                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2302                 else
2303                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2304         }
2305
2306         /* Allocate devices up to a maximum of MAX_DEVICES. */
2307         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2308
2309         return 0;
2310 }
2311
2312 /*
2313  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2314  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2315  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2316  */
2317 static void
2318 destroy_device (volatile struct virtio_net *dev)
2319 {
2320         struct virtio_net_data_ll *ll_lcore_dev_cur;
2321         struct virtio_net_data_ll *ll_main_dev_cur;
2322         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2323         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2324         struct vhost_dev *vdev;
2325         int lcore;
2326
2327         dev->flags &= ~VIRTIO_DEV_RUNNING;
2328
2329         vdev = (struct vhost_dev *)dev->priv;
2330         /*set the remove flag. */
2331         vdev->remove = 1;
2332         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2333                 rte_pause();
2334         }
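        /*
         * The data core polling this device sees remove == 1, unlinks it from its VMDQ
         * queue and sets ready to DEVICE_SAFE_REMOVE, which ends the busy-wait above.
         */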
2335
2336         /* Search for entry to be removed from lcore ll */
2337         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2338         while (ll_lcore_dev_cur != NULL) {
2339                 if (ll_lcore_dev_cur->vdev == vdev) {
2340                         break;
2341                 } else {
2342                         ll_lcore_dev_last = ll_lcore_dev_cur;
2343                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2344                 }
2345         }
2346
2347         if (ll_lcore_dev_cur == NULL) {
2348                 RTE_LOG(ERR, VHOST_CONFIG,
2349                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2350                         dev->device_fh);
2351                 return;
2352         }
2353
2354         /* Search for entry to be removed from main ll */
2355         ll_main_dev_cur = ll_root_used;
2356         ll_main_dev_last = NULL;
2357         while (ll_main_dev_cur != NULL) {
2358                 if (ll_main_dev_cur->vdev == vdev) {
2359                         break;
2360                 } else {
2361                         ll_main_dev_last = ll_main_dev_cur;
2362                         ll_main_dev_cur = ll_main_dev_cur->next;
2363                 }
2364         }
2365
2366         /* Remove entries from the lcore and main ll. */
2367         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2368         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2369
2370         /* Set the dev_removal_flag on each lcore. */
2371         RTE_LCORE_FOREACH_SLAVE(lcore) {
2372                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2373         }
2374
2375         /*
2376          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2377          * they can no longer access the device removed from the linked lists and that the devices
2378          * are no longer in use.
2379          */
2380         RTE_LCORE_FOREACH_SLAVE(lcore) {
2381                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2382                         rte_pause();
2383                 }
2384         }
2385
2386         /* Add the entries back to the lcore and main free ll.*/
2387         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2388         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2389
2390         /* Decrement number of device on the lcore. */
2391         lcore_info[vdev->coreid].lcore_ll->device_num--;
2392
2393         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2394
2395         if (zero_copy) {
2396                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2397
2398                 /* Stop the RX queue. */
2399                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2400                         LOG_DEBUG(VHOST_CONFIG,
2401                                 "(%"PRIu64") In destroy_device: Failed to stop "
2402                                 "rx queue:%d\n",
2403                                 dev->device_fh,
2404                                 vdev->vmdq_rx_q);
2405                 }
2406
2407                 LOG_DEBUG(VHOST_CONFIG,
2408                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2409                         "mempool back to ring for RX queue: %d\n",
2410                         dev->device_fh, vdev->vmdq_rx_q);
2411
2412                 mbuf_destroy_zcp(vpool);
2413
2414                 /* Stop the TX queue. */
2415                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2416                         LOG_DEBUG(VHOST_CONFIG,
2417                                 "(%"PRIu64") In destroy_device: Failed to "
2418                                 "stop tx queue:%d\n",
2419                                 dev->device_fh, vdev->vmdq_rx_q);
2420                 }
2421
2422                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2423
2424                 LOG_DEBUG(VHOST_CONFIG,
2425                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2426                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2427                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2428                         dev->device_fh);
2429
2430                 mbuf_destroy_zcp(vpool);
2431                 rte_free(vdev->regions_hpa);
2432         }
2433         rte_free(vdev);
2434
2435 }
2436
2437 /*
2438  * Calculate the number of physically contiguous regions within one particular
2439  * region whose vhost virtual address is contiguous. The particular region
2440  * starts at vva_start, with the size given by the 'size' argument.
2441  */
2442 static uint32_t
2443 check_hpa_regions(uint64_t vva_start, uint64_t size)
2444 {
2445         uint32_t i, nregions = 0, page_size = getpagesize();
2446         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2447         if (vva_start % page_size) {
2448                 LOG_DEBUG(VHOST_CONFIG,
2449                         "in check_continuous: vva start(%p) mod page_size(%d) "
2450                         "has remainder\n",
2451                         (void *)(uintptr_t)vva_start, page_size);
2452                 return 0;
2453         }
2454         if (size % page_size) {
2455                 LOG_DEBUG(VHOST_CONFIG,
2456                         "in check_continuous: "
2457                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2458                         size, page_size);
2459                 return 0;
2460         }
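        /*
         * Walk the region one page at a time, counting every boundary where the host
         * physical address stops being contiguous with the previous page.
         */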
2461         for (i = 0; i < size - page_size; i = i + page_size) {
2462                 cur_phys_addr
2463                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2464                 next_phys_addr = rte_mem_virt2phy(
2465                         (void *)(uintptr_t)(vva_start + i + page_size));
2466                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2467                         ++nregions;
2468                         LOG_DEBUG(VHOST_CONFIG,
2469                                 "in check_continuous: hva addr:(%p) is not "
2470                                 "continuous with hva addr:(%p), diff:%d\n",
2471                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2472                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2473                                 + page_size), page_size);
2474                         LOG_DEBUG(VHOST_CONFIG,
2475                                 "in check_continuous: hpa addr:(%p) is not "
2476                                 "continuous with hpa addr:(%p), "
2477                                 "diff:(%"PRIu64")\n",
2478                                 (void *)(uintptr_t)cur_phys_addr,
2479                                 (void *)(uintptr_t)next_phys_addr,
2480                                 (next_phys_addr-cur_phys_addr));
2481                 }
2482         }
2483         return nregions;
2484 }
2485
2486 /*
2487  * Divide each region whose vhost virtual address is continous into a few
2488  * Divide each region whose vhost virtual address is contiguous into a few
2489  * sub-regions, making sure that the physical addresses within each sub-region are
2490  * contiguous, and fill the offset (to GPA), size and other information of each
2491  */
2492 static uint32_t
2493 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2494 {
2495         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2496         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2497
2498         if (mem_region_hpa == NULL)
2499                 return 0;
2500
2501         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2502                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2503                         virtio_memory->regions[regionidx].address_offset;
2504                 mem_region_hpa[regionidx_hpa].guest_phys_address
2505                         = virtio_memory->regions[regionidx].guest_phys_address;
2506                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2507                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2508                         mem_region_hpa[regionidx_hpa].guest_phys_address;
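                /*
                 * host_phys_addr_offset is the value added to a guest physical address to
                 * obtain the host physical address within this sub-region.
                 */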
2509                 LOG_DEBUG(VHOST_CONFIG,
2510                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2511                         regionidx_hpa,
2512                         (void *)(uintptr_t)
2513                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2514                 LOG_DEBUG(VHOST_CONFIG,
2515                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2516                         regionidx_hpa,
2517                         (void *)(uintptr_t)
2518                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
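                /*
                 * Scan the region page by page: whenever the host physical
                 * address stops being contiguous, close the current
                 * sub-region and start a new one at the following page.
                 */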
2519                 for (i = 0, k = 0;
2520                         i < virtio_memory->regions[regionidx].memory_size -
2521                                 page_size;
2522                         i += page_size) {
2523                         cur_phys_addr = rte_mem_virt2phy(
2524                                         (void *)(uintptr_t)(vva_start + i));
2525                         next_phys_addr = rte_mem_virt2phy(
2526                                         (void *)(uintptr_t)(vva_start +
2527                                         i + page_size));
2528                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2529                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2530                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2531                                         k + page_size;
2532                                 mem_region_hpa[regionidx_hpa].memory_size
2533                                         = k + page_size;
2534                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2535                                         "phys addr end  [%d]:(%p)\n",
2536                                         regionidx_hpa,
2537                                         (void *)(uintptr_t)
2538                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2539                                 LOG_DEBUG(VHOST_CONFIG,
2540                                         "in fill_hpa_regions: guest phys addr "
2541                                         "size [%d]:(%p)\n",
2542                                         regionidx_hpa,
2543                                         (void *)(uintptr_t)
2544                                         (mem_region_hpa[regionidx_hpa].memory_size));
2545                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2546                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2547                                 ++regionidx_hpa;
2548                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2549                                         next_phys_addr -
2550                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2551                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2552                                         " phys addr start[%d]:(%p)\n",
2553                                         regionidx_hpa,
2554                                         (void *)(uintptr_t)
2555                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2556                                 LOG_DEBUG(VHOST_CONFIG,
2557                                         "in fill_hpa_regions: host  phys addr "
2558                                         "start[%d]:(%p)\n",
2559                                         regionidx_hpa,
2560                                         (void *)(uintptr_t)
2561                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2562                                 k = 0;
2563                         } else {
2564                                 k += page_size;
2565                         }
2566                 }
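                /* Close out the last sub-region of this guest memory region. */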
2567                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2568                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2569                         + k + page_size;
2570                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2571                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2572                         "[%d]:(%p)\n", regionidx_hpa,
2573                         (void *)(uintptr_t)
2574                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2575                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2576                         "[%d]:(%p)\n", regionidx_hpa,
2577                         (void *)(uintptr_t)
2578                         (mem_region_hpa[regionidx_hpa].memory_size));
2579                 ++regionidx_hpa;
2580         }
2581         return regionidx_hpa;
2582 }
2583
2584 /*
2585  * A new device is added to a data core. First the device is added to the main
2586  * linked list and then allocated to a specific data core.
2587  */
2588 static int
2589 new_device(struct virtio_net *dev)
2590 {
2591         struct virtio_net_data_ll *ll_dev;
2592         int lcore, core_add = 0;
2593         uint32_t device_num_min = num_devices;
2594         struct vhost_dev *vdev;
2595         uint32_t regionidx;
2596
2597         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2598         if (vdev == NULL) {
2599                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2600                         dev->device_fh);
2601                 return -1;
2602         }
2603         vdev->dev = dev;
2604         dev->priv = vdev;
2605
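        /*
         * For zero copy, pre-compute the guest-physical to host-physical
         * address mapping, split into sub-regions that are physically
         * contiguous on the host (see fill_hpa_memory_regions above).
         */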
2606         if (zero_copy) {
2607                 vdev->nregions_hpa = dev->mem->nregions;
2608                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2609                         vdev->nregions_hpa
2610                                 += check_hpa_regions(
2611                                         dev->mem->regions[regionidx].guest_phys_address
2612                                         + dev->mem->regions[regionidx].address_offset,
2613                                         dev->mem->regions[regionidx].memory_size);
2614
2615                 }
2616
2617                 vdev->regions_hpa = rte_calloc("vhost hpa region",
2618                                                vdev->nregions_hpa,
2619                                                sizeof(struct virtio_memory_regions_hpa),
2620                                                RTE_CACHE_LINE_SIZE);
2621                 if (vdev->regions_hpa == NULL) {
2622                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2623                         rte_free(vdev);
2624                         return -1;
2625                 }
2626
2627
2628                 if (fill_hpa_memory_regions(
2629                         vdev->regions_hpa, dev->mem
2630                         ) != vdev->nregions_hpa) {
2631
2632                         RTE_LOG(ERR, VHOST_CONFIG,
2633                                 "hpa memory regions number mismatch: "
2634                                 "[%d]\n", vdev->nregions_hpa);
2635                         rte_free(vdev->regions_hpa);
2636                         rte_free(vdev);
2637                         return -1;
2638                 }
2639         }
2640
2641
2642         /* Add device to main ll */
2643         ll_dev = get_data_ll_free_entry(&ll_root_free);
2644         if (ll_dev == NULL) {
2645                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2646                         "of %d devices per core has been reached\n",
2647                         dev->device_fh, num_devices);
2648                 if (vdev->regions_hpa)
2649                         rte_free(vdev->regions_hpa);
2650                 rte_free(vdev);
2651                 return -1;
2652         }
2653         ll_dev->vdev = vdev;
2654         add_data_ll_entry(&ll_root_used, ll_dev);
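        /* Give the device the base RX queue of the VMDq pool indexed by its device fh. */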
2655         vdev->vmdq_rx_q
2656                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2657
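        /*
         * Zero copy setup for this device: attach the RX mbufs held in the
         * per-queue ring back to the mempool, then start the HW TX and RX
         * queues dedicated to this device.
         */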
2658         if (zero_copy) {
2659                 uint32_t index = vdev->vmdq_rx_q;
2660                 uint32_t count_in_ring, i;
2661                 struct mbuf_table *tx_q;
2662
2663                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2664
2665                 LOG_DEBUG(VHOST_CONFIG,
2666                         "(%"PRIu64") in new_device: mbuf count in mempool "
2667                         "before attach is: %d\n",
2668                         dev->device_fh,
2669                         rte_mempool_count(vpool_array[index].pool));
2670                 LOG_DEBUG(VHOST_CONFIG,
2671                         "(%"PRIu64") in new_device: mbuf count in ring "
2672                         "before attach is: %d\n",
2673                         dev->device_fh, count_in_ring);
2674
2675                 /*
2676                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2677                  */
2678                 for (i = 0; i < count_in_ring; i++)
2679                         attach_rxmbuf_zcp(dev);
2680
2681                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2682                         "mempool after attach is: %d\n",
2683                         dev->device_fh,
2684                         rte_mempool_count(vpool_array[index].pool));
2685                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2686                         "ring after attach is: %d\n",
2687                         dev->device_fh,
2688                         rte_ring_count(vpool_array[index].ring));
2689
2690                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2691                 tx_q->txq_id = vdev->vmdq_rx_q;
2692
2693                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2694                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2695
2696                         LOG_DEBUG(VHOST_CONFIG,
2697                                 "(%"PRIu64") In new_device: Failed to start "
2698                                 "tx queue:%d\n",
2699                                 dev->device_fh, vdev->vmdq_rx_q);
2700
2701                         mbuf_destroy_zcp(vpool);
2702                         rte_free(vdev->regions_hpa);
2703                         rte_free(vdev);
2704                         return -1;
2705                 }
2706
2707                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2708                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2709
2710                         LOG_DEBUG(VHOST_CONFIG,
2711                                 "(%"PRIu64") In new_device: Failed to start "
2712                                 "rx queue:%d\n",
2713                                 dev->device_fh, vdev->vmdq_rx_q);
2714
2715                         /* Stop the TX queue. */
2716                         if (rte_eth_dev_tx_queue_stop(ports[0],
2717                                 vdev->vmdq_rx_q) != 0) {
2718                                 LOG_DEBUG(VHOST_CONFIG,
2719                                         "(%"PRIu64") In new_device: Failed to "
2720                                         "stop tx queue:%d\n",
2721                                         dev->device_fh, vdev->vmdq_rx_q);
2722                         }
2723
2724                         mbuf_destroy_zcp(vpool);
2725                         rte_free(vdev->regions_hpa);
2726                         rte_free(vdev);
2727                         return -1;
2728                 }
2729
2730         }
2731
2732         /* Reset the ready flag. */
2733         vdev->ready = DEVICE_MAC_LEARNING;
2734         vdev->remove = 0;
2735
2736         /* Find a suitable lcore to add the device. */
2737         RTE_LCORE_FOREACH_SLAVE(lcore) {
2738                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2739                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2740                         core_add = lcore;
2741                 }
2742         }
2743         /* Add device to lcore ll */
2744         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2745         if (ll_dev == NULL) {
2746                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2747                 vdev->ready = DEVICE_SAFE_REMOVE;
2748                 destroy_device(dev);
2749                 rte_free(vdev->regions_hpa);
2750                 rte_free(vdev);
2751                 return -1;
2752         }
2753         ll_dev->vdev = vdev;
2754         vdev->coreid = core_add;
2755
2756         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2757
2758         /* Initialize device stats */
2759         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2760
2761         /* Disable guest notifications; the data cores poll the virtqueues instead. */
2762         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2763         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2764         lcore_info[vdev->coreid].lcore_ll->device_num++;
2765         dev->flags |= VIRTIO_DEV_RUNNING;
2766
2767         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2768
2769         return 0;
2770 }
2771
2772 /*
2773  * These callbacks allow devices to be added to the data core when configuration
2774  * has fully completed.
2775  */
2776 static const struct virtio_net_device_ops virtio_net_device_ops =
2777 {
2778         .new_device =  new_device,
2779         .destroy_device = destroy_device,
2780 };
2781
2782 /*
2783  * This is a thread that wakes up periodically to print stats if the user has
2784  * enabled them.
2785  */
2786 static void
2787 print_stats(void)
2788 {
2789         struct virtio_net_data_ll *dev_ll;
2790         uint64_t tx_dropped, rx_dropped;
2791         uint64_t tx, tx_total, rx, rx_total;
2792         uint32_t device_fh;
2793         const char clr[] = { 27, '[', '2', 'J', '\0' };
2794         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2795
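        /* Redraw the statistics screen every enable_stats seconds. */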
2796         while(1) {
2797                 sleep(enable_stats);
2798
2799                 /* Clear screen and move to top left */
2800                 printf("%s%s", clr, top_left);
2801
2802                 printf("\nDevice statistics ====================================");
2803
2804                 dev_ll = ll_root_used;
2805                 while (dev_ll != NULL) {
2806                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2807                         tx_total = dev_statistics[device_fh].tx_total;
2808                         tx = dev_statistics[device_fh].tx;
2809                         tx_dropped = tx_total - tx;
2810                         if (zero_copy == 0) {
2811                                 rx_total = rte_atomic64_read(
2812                                         &dev_statistics[device_fh].rx_total_atomic);
2813                                 rx = rte_atomic64_read(
2814                                         &dev_statistics[device_fh].rx_atomic);
2815                         } else {
2816                                 rx_total = dev_statistics[device_fh].rx_total;
2817                                 rx = dev_statistics[device_fh].rx;
2818                         }
2819                         rx_dropped = rx_total - rx;
2820
2821                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2822                                         "\nTX total:            %"PRIu64""
2823                                         "\nTX dropped:          %"PRIu64""
2824                                         "\nTX successful:               %"PRIu64""
2825                                         "\nRX total:            %"PRIu64""
2826                                         "\nRX dropped:          %"PRIu64""
2827                                         "\nRX successful:               %"PRIu64"",
2828                                         device_fh,
2829                                         tx_total,
2830                                         tx_dropped,
2831                                         tx,
2832                                         rx_total,
2833                                         rx_dropped,
2834                                         rx);
2835
2836                         dev_ll = dev_ll->next;
2837                 }
2838                 printf("\n======================================================\n");
2839         }
2840 }
2841
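/*
 * Create the mbuf pool and companion ring used by one zero copy queue and
 * store them in vpool_array[index]; exits the application on failure.
 */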
2842 static void
2843 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2844         char *ring_name, uint32_t nb_mbuf)
2845 {
2846         vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2847                 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2848         if (vpool_array[index].pool != NULL) {
2849                 vpool_array[index].ring
2850                         = rte_ring_create(ring_name,
2851                                 rte_align32pow2(nb_mbuf + 1),
2852                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2853                 if (likely(vpool_array[index].ring != NULL)) {
2854                         LOG_DEBUG(VHOST_CONFIG,
2855                                 "in setup_mempool_tbl: mbuf count in "
2856                                 "mempool is: %d\n",
2857                                 rte_mempool_count(vpool_array[index].pool));
2858                         LOG_DEBUG(VHOST_CONFIG,
2859                                 "in setup_mempool_tbl: mbuf count in "
2860                                 "ring   is: %d\n",
2861                                 rte_ring_count(vpool_array[index].ring));
2862                 } else {
2863                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2864                                 ring_name);
2865                 }
2866
2867                 /* Headroom needs to be taken into account. */
2868                 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2869         } else {
2870                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2871         }
2872 }
2873
2874
2875 /*
2876  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2877  * device is also registered here to handle the IOCTLs.
2878  */
2879 int
2880 main(int argc, char *argv[])
2881 {
2882         struct rte_mempool *mbuf_pool = NULL;
2883         unsigned lcore_id, core_id = 0;
2884         unsigned nb_ports, valid_num_ports;
2885         int ret;
2886         uint8_t portid;
2887         uint16_t queue_id;
2888         static pthread_t tid;
2889
2890         /* init EAL */
2891         ret = rte_eal_init(argc, argv);
2892         if (ret < 0)
2893                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2894         argc -= ret;
2895         argv += ret;
2896
2897         /* parse app arguments */
2898         ret = us_vhost_parse_args(argc, argv);
2899         if (ret < 0)
2900                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2901
2902         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2903                 if (rte_lcore_is_enabled(lcore_id))
2904                         lcore_ids[core_id++] = lcore_id;
2905
2906         if (rte_lcore_count() > RTE_MAX_LCORE)
2907                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2908
2909         /* set the number of switching cores available */
2910         num_switching_cores = rte_lcore_count()-1;
2911
2912         /* Get the number of physical ports. */
2913         nb_ports = rte_eth_dev_count();
2914         if (nb_ports > RTE_MAX_ETHPORTS)
2915                 nb_ports = RTE_MAX_ETHPORTS;
2916
2917         /*
2918          * Update the global var NUM_PORTS and global array PORTS,
2919          * and get the value of VALID_NUM_PORTS according to the number of system ports.
2920          */
2921         valid_num_ports = check_ports_num(nb_ports);
2922
2923         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2924                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2925                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2926                 return -1;
2927         }
2928
2929         if (zero_copy == 0) {
2930                 /* Create the mbuf pool. */
2931                 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
2932                         NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
2933                         0, MBUF_DATA_SIZE, rte_socket_id());
2934                 if (mbuf_pool == NULL)
2935                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2936
2937                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2938                         vpool_array[queue_id].pool = mbuf_pool;
2939
2940                 if (vm2vm_mode == VM2VM_HARDWARE) {
2941                         /* Enable VT loopback so the L2 switch handles VM-to-VM forwarding. */
2942                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2943                         LOG_DEBUG(VHOST_CONFIG,
2944                                 "Enable loop back for L2 switch in vmdq.\n");
2945                 }
2946         } else {
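                /*
                 * Zero copy: give every RX and TX queue its own mempool and
                 * ring, sized from the descriptor count plus per-core burst
                 * and cache slack.
                 */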
2947                 uint32_t nb_mbuf;
2948                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2949                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2950
2951                 nb_mbuf = num_rx_descriptor
2952                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2953                         + num_switching_cores * MAX_PKT_BURST;
2954
2955                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2956                         snprintf(pool_name, sizeof(pool_name),
2957                                 "rxmbuf_pool_%u", queue_id);
2958                         snprintf(ring_name, sizeof(ring_name),
2959                                 "rxmbuf_ring_%u", queue_id);
2960                         setup_mempool_tbl(rte_socket_id(), queue_id,
2961                                 pool_name, ring_name, nb_mbuf);
2962                 }
2963
2964                 nb_mbuf = num_tx_descriptor
2965                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2966                                 + num_switching_cores * MAX_PKT_BURST;
2967
2968                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2969                         snprintf(pool_name, sizeof(pool_name),
2970                                 "txmbuf_pool_%u", queue_id);
2971                         snprintf(ring_name, sizeof(ring_name),
2972                                 "txmbuf_ring_%u", queue_id);
2973                         setup_mempool_tbl(rte_socket_id(),
2974                                 (queue_id + MAX_QUEUES),
2975                                 pool_name, ring_name, nb_mbuf);
2976                 }
2977
2978                 if (vm2vm_mode == VM2VM_HARDWARE) {
2979                         /* Enable VT loopback so the L2 switch handles VM-to-VM forwarding. */
2980                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2981                         LOG_DEBUG(VHOST_CONFIG,
2982                                 "Enable loop back for L2 switch in vmdq.\n");
2983                 }
2984         }
2985         /* Set log level. */
2986         rte_set_log_level(LOG_LEVEL);
2987
2988         /* initialize all ports */
2989         for (portid = 0; portid < nb_ports; portid++) {
2990                 /* skip ports that are not enabled */
2991                 if ((enabled_port_mask & (1 << portid)) == 0) {
2992                         RTE_LOG(INFO, VHOST_PORT,
2993                                 "Skipping disabled port %d\n", portid);
2994                         continue;
2995                 }
2996                 if (port_init(portid) != 0)
2997                         rte_exit(EXIT_FAILURE,
2998                                 "Cannot initialize network ports\n");
2999         }
3000
3001         /* Initialise all linked lists. */
3002         if (init_data_ll() == -1)
3003                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3004
3005         /* Initialize device stats */
3006         memset(&dev_statistics, 0, sizeof(dev_statistics));
3007
3008         /* Enable stats if the user option is set. */
3009         if (enable_stats)
3010                 pthread_create(&tid, NULL, (void *(*)(void *))print_stats, NULL);
3011
3012         /* Launch all data cores. */
3013         if (zero_copy == 0) {
3014                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3015                         rte_eal_remote_launch(switch_worker,
3016                                 mbuf_pool, lcore_id);
3017                 }
3018         } else {
3019                 uint32_t count_in_mempool, index, i;
3020                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3021                         /* For all RX and TX queues. */
3022                         count_in_mempool
3023                                 = rte_mempool_count(vpool_array[index].pool);
3024
3025                         /*
3026                          * Transfer all unattached mbufs from vpool.pool
3027                          * to vpool.ring.
3028                          */
3029                         for (i = 0; i < count_in_mempool; i++) {
3030                                 struct rte_mbuf *mbuf
3031                                         = __rte_mbuf_raw_alloc(
3032                                                 vpool_array[index].pool);
3033                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3034                                                 (void *)mbuf);
3035                         }
3036
3037                         LOG_DEBUG(VHOST_CONFIG,
3038                                 "in main: mbuf count in mempool initially "
3039                                 "is: %d\n", count_in_mempool);
3040                         LOG_DEBUG(VHOST_CONFIG,
3041                                 "in main: mbuf count in ring initially is:"
3042                                 " %d\n",
3043                                 rte_ring_count(vpool_array[index].ring));
3044                 }
3045
3046                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3047                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3048                                 lcore_id);
3049         }
3050
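        /* Disable the mergeable RX buffers feature unless it was requested. */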
3051         if (mergeable == 0)
3052                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3053
3054         /* Register CUSE device to handle IOCTLs. */
3055         ret = rte_vhost_driver_register((char *)&dev_basename);
3056         if (ret != 0)
3057                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3058
3059         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3060
3061         /* Start CUSE session. */
3062         rte_vhost_driver_session_start();
3063         return 0;
3064
3065 }
3066