examples/vhost: add vlan strip command line option
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 512
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +             \
65                                                         (num_switching_cores*MAX_PKT_BURST) +                   \
66                                                         (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67                                                         (num_switching_cores*MBUF_CACHE_SIZE))
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
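/*
 * For example, with MAX_QUEUES at 512 and the default of 1024 RX descriptors,
 * the first term alone reserves 512 * 1024 mbufs per port; the per-core burst,
 * TX descriptor and cache terms add a comparatively small reserve on top.
 */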
71
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and
75  * vhost uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79         + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81
82 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
84
85 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
87
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX                       1
93 #define DEVICE_SAFE_REMOVE      2
94
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102
103 /*
104  * These two macros need refining for the legacy and DPDK based front ends:
105  * take the max vring avail descriptors/entries from the guest minus
106  * MAX_PKT_BURST, then round to a power of 2.
107  */
108 /*
109  * The legacy front end has 128 descriptors:
110  * half for virtio headers, the other half for mbufs.
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117                 + sizeof(struct rte_mbuf)))
118
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
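/*
 * Note that POWEROF2(0) also evaluates to true, since ((0 - 1) & 0) == 0,
 * so a zero descriptor count would pass the power-of-2 check used by the
 * rx-desc-num/tx-desc-num option parsing below.
 */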
121
122 #define INVALID_PORT_ID 0xFF
123
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
138
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
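/*
 * For example, with a 64-byte cache line and the 16-byte struct vring_desc
 * (8-byte addr, 4-byte len, 2-byte flags, 2-byte next), this works out to
 * 4 descriptors per cache line.
 */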
141
142 /* mask of enabled ports */
143 static uint32_t enabled_port_mask = 0;
144
145 /* Promiscuous mode */
146 static uint32_t promiscuous;
147
148 /*Number of switching cores enabled*/
149 static uint32_t num_switching_cores = 0;
150
151 /* number of devices/queues to support*/
152 static uint32_t num_queues = 0;
153 static uint32_t num_devices;
154
155 /*
156  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
157  * descriptors. Disabled by default.
158  */
159 static uint32_t zero_copy;
160 static int mergeable;
161
162 /* Do VLAN strip on the host, enabled by default */
163 static uint32_t vlan_strip = 1;
164
165 /* Number of RX/TX descriptors to use; only applied when zero copy is enabled */
166 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
167 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
168
169 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
170 #define MAX_RING_DESC 4096
171
172 struct vpool {
173         struct rte_mempool *pool;
174         struct rte_ring *ring;
175         uint32_t buf_size;
176 } vpool_array[MAX_QUEUES+MAX_QUEUES];
177
178 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
179 typedef enum {
180         VM2VM_DISABLED = 0,
181         VM2VM_SOFTWARE = 1,
182         VM2VM_HARDWARE = 2,
183         VM2VM_LAST
184 } vm2vm_type;
185 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
186
187 /* The type of host physical address translated from guest physical address. */
188 typedef enum {
189         PHYS_ADDR_CONTINUOUS = 0,
190         PHYS_ADDR_CROSS_SUBREG = 1,
191         PHYS_ADDR_INVALID = 2,
192         PHYS_ADDR_LAST
193 } hpa_type;
194
195 /* Enable stats. */
196 static uint32_t enable_stats = 0;
197 /* Enable retries on RX. */
198 static uint32_t enable_retry = 1;
199 /* Specify timeout (in microseconds) between retries on RX. */
200 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
201 /* Specify the number of retries on RX. */
202 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
203
204 /* Character device basename. Can be set by user. */
205 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
206
207 /* Empty vmdq configuration structure. Filled in programmatically */
208 static struct rte_eth_conf vmdq_conf_default = {
209         .rxmode = {
210                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
211                 .split_hdr_size = 0,
212                 .header_split   = 0, /**< Header Split disabled */
213                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
214                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
215                 /*
216                  * This is necessary for 1G NICs such as the I350:
217                  * it fixes a bug where IPv4 forwarding in the guest could
218                  * not forward packets from one virtio dev to another.
219                  */
220                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
221                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
222                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
223         },
224
225         .txmode = {
226                 .mq_mode = ETH_MQ_TX_NONE,
227         },
228         .rx_adv_conf = {
229                 /*
230                  * should be overridden separately in code with
231                  * appropriate values
232                  */
233                 .vmdq_rx_conf = {
234                         .nb_queue_pools = ETH_8_POOLS,
235                         .enable_default_pool = 0,
236                         .default_pool = 0,
237                         .nb_pool_maps = 0,
238                         .pool_map = {{0, 0},},
239                 },
240         },
241 };
242
243 static unsigned lcore_ids[RTE_MAX_LCORE];
244 static uint8_t ports[RTE_MAX_ETHPORTS];
245 static unsigned num_ports = 0; /**< The number of ports specified in command line */
246 static uint16_t num_pf_queues, num_vmdq_queues;
247 static uint16_t vmdq_pool_base, vmdq_queue_base;
248 static uint16_t queues_per_pool;
249
250 static const uint16_t external_pkt_default_vlan_tag = 2000;
251 const uint16_t vlan_tags[] = {
252         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
253         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
254         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
255         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
256         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
257         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
258         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
259         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
260 };
261
262 /* ethernet addresses of ports */
263 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
264
265 /* heads for the main used and free linked lists for the data path. */
266 static struct virtio_net_data_ll *ll_root_used = NULL;
267 static struct virtio_net_data_ll *ll_root_free = NULL;
268
269 /* Array of data core structures containing information on individual core linked lists. */
270 static struct lcore_info lcore_info[RTE_MAX_LCORE];
271
272 /* Used for queueing bursts of TX packets. */
273 struct mbuf_table {
274         unsigned len;
275         unsigned txq_id;
276         struct rte_mbuf *m_table[MAX_PKT_BURST];
277 };
278
279 /* TX queue for each data core. */
280 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
281
282 /* TX queue for each virtio device for zero copy. */
283 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
284
285 /* Vlan header struct used to insert vlan tags on TX. */
286 struct vlan_ethhdr {
287         unsigned char   h_dest[ETH_ALEN];
288         unsigned char   h_source[ETH_ALEN];
289         __be16          h_vlan_proto;
290         __be16          h_vlan_TCI;
291         __be16          h_vlan_encapsulated_proto;
292 };
293
294 /* IPv4 Header */
295 struct ipv4_hdr {
296         uint8_t  version_ihl;           /**< version and header length */
297         uint8_t  type_of_service;       /**< type of service */
298         uint16_t total_length;          /**< length of packet */
299         uint16_t packet_id;             /**< packet ID */
300         uint16_t fragment_offset;       /**< fragmentation offset */
301         uint8_t  time_to_live;          /**< time to live */
302         uint8_t  next_proto_id;         /**< protocol ID */
303         uint16_t hdr_checksum;          /**< header checksum */
304         uint32_t src_addr;              /**< source address */
305         uint32_t dst_addr;              /**< destination address */
306 } __attribute__((__packed__));
307
308 /* Header lengths. */
309 #define VLAN_HLEN       4
310 #define VLAN_ETH_HLEN   18
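/* VLAN_ETH_HLEN is the standard 14-byte Ethernet header plus the 4-byte
 * 802.1Q tag (VLAN_HLEN), matching struct vlan_ethhdr above. */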
311
312 /* Per-device statistics struct */
313 struct device_statistics {
314         uint64_t tx_total;
315         rte_atomic64_t rx_total_atomic;
316         uint64_t rx_total;
317         uint64_t tx;
318         rte_atomic64_t rx_atomic;
319         uint64_t rx;
320 } __rte_cache_aligned;
321 struct device_statistics dev_statistics[MAX_DEVICES];
322
323 /*
324  * Builds up the correct configuration for VMDQ VLAN pool map
325  * according to the pool & queue limits.
326  */
327 static inline int
328 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
329 {
330         struct rte_eth_vmdq_rx_conf conf;
331         struct rte_eth_vmdq_rx_conf *def_conf =
332                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
333         unsigned i;
334
335         memset(&conf, 0, sizeof(conf));
336         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
337         conf.nb_pool_maps = num_devices;
338         conf.enable_loop_back = def_conf->enable_loop_back;
339         conf.rx_mode = def_conf->rx_mode;
340
341         for (i = 0; i < conf.nb_pool_maps; i++) {
342                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
343                 conf.pool_map[i].pools = (1UL << i);
344         }
345
346         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
347         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
348                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
349         return 0;
350 }
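/*
 * As an illustration, with num_devices == 8 the loop above produces a
 * one-to-one VLAN-to-pool mapping:
 *
 *   pool_map[0] = { .vlan_id = 1000, .pools = 0x01 }
 *   ...
 *   pool_map[7] = { .vlan_id = 1007, .pools = 0x80 }
 *
 * so each VMDQ pool, and hence each virtio device, receives exactly one
 * VLAN tag from the vlan_tags table.
 */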
351
352 /*
353  * Validate the device number against the max pool number obtained from
354  * dev_info. If the device number is invalid, print an error message and
355  * return -1. Each device must have its own pool.
356  */
357 static inline int
358 validate_num_devices(uint32_t max_nb_devices)
359 {
360         if (num_devices > max_nb_devices) {
361                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
362                 return -1;
363         }
364         return 0;
365 }
366
367 /*
368  * Initialises a given port using global settings and with the rx buffers
369  * coming from the mbuf_pool passed as parameter
370  */
371 static inline int
372 port_init(uint8_t port)
373 {
374         struct rte_eth_dev_info dev_info;
375         struct rte_eth_conf port_conf;
376         struct rte_eth_rxconf *rxconf;
377         struct rte_eth_txconf *txconf;
378         int16_t rx_rings, tx_rings;
379         uint16_t rx_ring_size, tx_ring_size;
380         int retval;
381         uint16_t q;
382
383         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
384         rte_eth_dev_info_get(port, &dev_info);
385
386         if (dev_info.max_rx_queues > MAX_QUEUES) {
387                 rte_exit(EXIT_FAILURE,
388                         "please define MAX_QUEUES no less than %u in %s\n",
389                         dev_info.max_rx_queues, __FILE__);
390         }
391
392         rxconf = &dev_info.default_rxconf;
393         txconf = &dev_info.default_txconf;
394         rxconf->rx_drop_en = 1;
395
396         /* Enable vlan offload */
397         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
398
399         /*
400          * Zero copy defers queue RX/TX start to the time when guest
401          * finishes its startup and packet buffers from that guest are
402          * available.
403          */
404         if (zero_copy) {
405                 rxconf->rx_deferred_start = 1;
406                 rxconf->rx_drop_en = 0;
407                 txconf->tx_deferred_start = 1;
408         }
409
410         /* Configure the number of supported virtio devices based on VMDQ limits */
411         num_devices = dev_info.max_vmdq_pools;
412
413         if (zero_copy) {
414                 rx_ring_size = num_rx_descriptor;
415                 tx_ring_size = num_tx_descriptor;
416                 tx_rings = dev_info.max_tx_queues;
417         } else {
418                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
419                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
420                 tx_rings = (uint16_t)rte_lcore_count();
421         }
422
423         retval = validate_num_devices(MAX_DEVICES);
424         if (retval < 0)
425                 return retval;
426
427         /* Get port configuration. */
428         retval = get_eth_conf(&port_conf, num_devices);
429         if (retval < 0)
430                 return retval;
431         /* NIC queues are divided into pf queues and vmdq queues.  */
432         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
433         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
434         num_vmdq_queues = num_devices * queues_per_pool;
435         num_queues = num_pf_queues + num_vmdq_queues;
436         vmdq_queue_base = dev_info.vmdq_queue_base;
437         vmdq_pool_base  = dev_info.vmdq_pool_base;
438         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
439                 num_pf_queues, num_devices, queues_per_pool);
440
441         if (port >= rte_eth_dev_count()) return -1;
442
443         rx_rings = (uint16_t)dev_info.max_rx_queues;
444         /* Configure ethernet device. */
445         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
446         if (retval != 0)
447                 return retval;
448
449         /* Setup the queues. */
450         for (q = 0; q < rx_rings; q ++) {
451                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452                                                 rte_eth_dev_socket_id(port),
453                                                 rxconf,
454                                                 vpool_array[q].pool);
455                 if (retval < 0)
456                         return retval;
457         }
458         for (q = 0; q < tx_rings; q ++) {
459                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
460                                                 rte_eth_dev_socket_id(port),
461                                                 txconf);
462                 if (retval < 0)
463                         return retval;
464         }
465
466         /* Start the device. */
467         retval  = rte_eth_dev_start(port);
468         if (retval < 0) {
469                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
470                 return retval;
471         }
472
473         if (promiscuous)
474                 rte_eth_promiscuous_enable(port);
475
476         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
477         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
478         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
479                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
480                         (unsigned)port,
481                         vmdq_ports_eth_addr[port].addr_bytes[0],
482                         vmdq_ports_eth_addr[port].addr_bytes[1],
483                         vmdq_ports_eth_addr[port].addr_bytes[2],
484                         vmdq_ports_eth_addr[port].addr_bytes[3],
485                         vmdq_ports_eth_addr[port].addr_bytes[4],
486                         vmdq_ports_eth_addr[port].addr_bytes[5]);
487
488         return 0;
489 }
490
491 /*
492  * Set character device basename.
493  */
494 static int
495 us_vhost_parse_basename(const char *q_arg)
496 {
497         /* The basename must fit within the buffer, including the NUL. */
498
499         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
500                 return -1;
501         else
502                 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
503
504         return 0;
505 }
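/*
 * For example, "--dev-basename my-vhost" makes the vhost library register
 * its character device under that name; with the vhost-cuse backend this
 * typically appears as /dev/my-vhost, though the exact location depends on
 * the vhost driver registration.
 */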
506
507 /*
508  * Parse the portmask provided at run time.
509  */
510 static int
511 parse_portmask(const char *portmask)
512 {
513         char *end = NULL;
514         unsigned long pm;
515
516         errno = 0;
517
518         /* parse hexadecimal string */
519         pm = strtoul(portmask, &end, 16);
520         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
521                 return -1;
522
523         if (pm == 0)
524                 return -1;
525
526         return pm;
527
528 }
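/*
 * The portmask is parsed as hex with one bit per port: "-p 0x1" enables
 * only port 0 and "-p 3" would enable ports 0 and 1, although this
 * application accepts at most MAX_SUP_PORTS (1) enabled port.
 */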
529
530 /*
531  * Parse num options at run time.
532  */
533 static int
534 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
535 {
536         char *end = NULL;
537         unsigned long num;
538
539         errno = 0;
540
541         /* parse unsigned int string */
542         num = strtoul(q_arg, &end, 10);
543         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
544                 return -1;
545
546         if (num > max_valid_value)
547                 return -1;
548
549         return num;
550
551 }
552
553 /*
554  * Display usage
555  */
556 static void
557 us_vhost_usage(const char *prgname)
558 {
559         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
560         "               --vm2vm [0|1|2]\n"
561         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
562         "               --dev-basename <name>\n"
563         "               --nb-devices ND\n"
564         "               -p PORTMASK: Set mask for ports to be used by application\n"
565         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
566         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
567         "               --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
568         "               --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
569         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
570         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
571         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
572         "               --dev-basename: The basename to be used for the character device.\n"
573         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
574                         "zero copy\n"
575         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
576                         "used only when zero copy is enabled.\n"
577         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
578                         "used only when zero copy is enabled.\n",
579                prgname);
580 }
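/*
 * Illustrative invocation (the EAL coremask/channel arguments are
 * placeholders that depend on the target system):
 *
 *   ./vhost-switch -c f -n 4 -- -p 0x1 --vlan-strip 0
 *
 * This disables RX VLAN strip on the host so guests receive the tag intact,
 * while "--vlan-strip 1" (the default) lets the NIC strip it.
 */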
581
582 /*
583  * Parse the arguments given in the command line of the application.
584  */
585 static int
586 us_vhost_parse_args(int argc, char **argv)
587 {
588         int opt, ret;
589         int option_index;
590         unsigned i;
591         const char *prgname = argv[0];
592         static struct option long_option[] = {
593                 {"vm2vm", required_argument, NULL, 0},
594                 {"rx-retry", required_argument, NULL, 0},
595                 {"rx-retry-delay", required_argument, NULL, 0},
596                 {"rx-retry-num", required_argument, NULL, 0},
597                 {"mergeable", required_argument, NULL, 0},
598                 {"vlan-strip", required_argument, NULL, 0},
599                 {"stats", required_argument, NULL, 0},
600                 {"dev-basename", required_argument, NULL, 0},
601                 {"zero-copy", required_argument, NULL, 0},
602                 {"rx-desc-num", required_argument, NULL, 0},
603                 {"tx-desc-num", required_argument, NULL, 0},
604                 {NULL, 0, 0, 0},
605         };
606
607         /* Parse command line */
608         while ((opt = getopt_long(argc, argv, "p:P",
609                         long_option, &option_index)) != EOF) {
610                 switch (opt) {
611                 /* Portmask */
612                 case 'p':
613                         enabled_port_mask = parse_portmask(optarg);
614                         /* parse_portmask() returns -1 on error */
615                         if (enabled_port_mask == (uint32_t)-1) {
615                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
616                                 us_vhost_usage(prgname);
617                                 return -1;
618                         }
619                         break;
620
621                 case 'P':
622                         promiscuous = 1;
623                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
624                                 ETH_VMDQ_ACCEPT_BROADCAST |
625                                 ETH_VMDQ_ACCEPT_MULTICAST;
626                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
627
628                         break;
629
630                 case 0:
631                         /* Enable/disable vm2vm comms. */
632                         if (!strncmp(long_option[option_index].name, "vm2vm",
633                                 MAX_LONG_OPT_SZ)) {
634                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
635                                 if (ret == -1) {
636                                         RTE_LOG(INFO, VHOST_CONFIG,
637                                                 "Invalid argument for "
638                                                 "vm2vm [0|1|2]\n");
639                                         us_vhost_usage(prgname);
640                                         return -1;
641                                 } else {
642                                         vm2vm_mode = (vm2vm_type)ret;
643                                 }
644                         }
645
646                         /* Enable/disable retries on RX. */
647                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
648                                 ret = parse_num_opt(optarg, 1);
649                                 if (ret == -1) {
650                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
651                                         us_vhost_usage(prgname);
652                                         return -1;
653                                 } else {
654                                         enable_retry = ret;
655                                 }
656                         }
657
658                         /* Specify the retry delay time (in microseconds) on RX. */
659                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
660                                 ret = parse_num_opt(optarg, INT32_MAX);
661                                 if (ret == -1) {
662                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
663                                         us_vhost_usage(prgname);
664                                         return -1;
665                                 } else {
666                                         burst_rx_delay_time = ret;
667                                 }
668                         }
669
670                         /* Specify the retries number on RX. */
671                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
672                                 ret = parse_num_opt(optarg, INT32_MAX);
673                                 if (ret == -1) {
674                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
675                                         us_vhost_usage(prgname);
676                                         return -1;
677                                 } else {
678                                         burst_rx_retry_num = ret;
679                                 }
680                         }
681
682                         /* Enable/disable RX mergeable buffers. */
683                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
684                                 ret = parse_num_opt(optarg, 1);
685                                 if (ret == -1) {
686                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
687                                         us_vhost_usage(prgname);
688                                         return -1;
689                                 } else {
690                                         mergeable = !!ret;
691                                         if (ret) {
692                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
693                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
694                                                         = JUMBO_FRAME_MAX_SIZE;
695                                         }
696                                 }
697                         }
698
699                         /* Enable/disable RX VLAN strip on host. */
700                         if (!strncmp(long_option[option_index].name,
701                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
702                                 ret = parse_num_opt(optarg, 1);
703                                 if (ret == -1) {
704                                         RTE_LOG(INFO, VHOST_CONFIG,
705                                                 "Invalid argument for VLAN strip [0|1]\n");
706                                         us_vhost_usage(prgname);
707                                         return -1;
708                                 } else {
709                                         vlan_strip = !!ret;
710                                         vmdq_conf_default.rxmode.hw_vlan_strip =
711                                                 vlan_strip;
712                                 }
713                         }
714
715                         /* Enable/disable stats. */
716                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
717                                 ret = parse_num_opt(optarg, INT32_MAX);
718                                 if (ret == -1) {
719                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
720                                         us_vhost_usage(prgname);
721                                         return -1;
722                                 } else {
723                                         enable_stats = ret;
724                                 }
725                         }
726
727                         /* Set character device basename. */
728                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
729                                 if (us_vhost_parse_basename(optarg) == -1) {
730                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
731                                         us_vhost_usage(prgname);
732                                         return -1;
733                                 }
734                         }
735
736                         /* Enable/disable rx/tx zero copy. */
737                         if (!strncmp(long_option[option_index].name,
738                                 "zero-copy", MAX_LONG_OPT_SZ)) {
739                                 ret = parse_num_opt(optarg, 1);
740                                 if (ret == -1) {
741                                         RTE_LOG(INFO, VHOST_CONFIG,
742                                                 "Invalid argument"
743                                                 " for zero-copy [0|1]\n");
744                                         us_vhost_usage(prgname);
745                                         return -1;
746                                 } else
747                                         zero_copy = ret;
748
749                                 if (zero_copy) {
750 #ifdef RTE_MBUF_REFCNT
751                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
752                                         "zero copy vhost APP, please "
753                                         "disable RTE_MBUF_REFCNT\n"
754                                         "in config file and then rebuild DPDK "
755                                         "core lib!\n"
756                                         "Otherwise please disable zero copy "
757                                         "flag in command line!\n");
758                                         return -1;
759 #endif
760                                 }
761                         }
762
763                         /* Specify the descriptor number on RX. */
764                         if (!strncmp(long_option[option_index].name,
765                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
766                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
767                                 if ((ret == -1) || (!POWEROF2(ret))) {
768                                         RTE_LOG(INFO, VHOST_CONFIG,
769                                         "Invalid argument for rx-desc-num[0-N],"
770                                         "power of 2 required.\n");
771                                         us_vhost_usage(prgname);
772                                         return -1;
773                                 } else {
774                                         num_rx_descriptor = ret;
775                                 }
776                         }
777
778                         /* Specify the descriptor number on TX. */
779                         if (!strncmp(long_option[option_index].name,
780                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
781                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
782                                 if ((ret == -1) || (!POWEROF2(ret))) {
783                                         RTE_LOG(INFO, VHOST_CONFIG,
784                                         "Invalid argument for tx-desc-num [0-N],"
785                                         "power of 2 required.\n");
786                                         us_vhost_usage(prgname);
787                                         return -1;
788                                 } else {
789                                         num_tx_descriptor = ret;
790                                 }
791                         }
792
793                         break;
794
795                         /* Invalid option - print options. */
796                 default:
797                         us_vhost_usage(prgname);
798                         return -1;
799                 }
800         }
801
802         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
803                 if (enabled_port_mask & (1 << i))
804                         ports[num_ports++] = (uint8_t)i;
805         }
806
807         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
808                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
809                         "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
810                 return -1;
811         }
812
813         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
814                 RTE_LOG(INFO, VHOST_PORT,
815                         "Vhost zero copy doesn't support software vm2vm,"
816                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
817                 return -1;
818         }
819
820         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
821                 RTE_LOG(INFO, VHOST_PORT,
822                         "Vhost zero copy doesn't support jumbo frame,"
823                         "please specify '--mergeable 0' to disable the "
824                         "mergeable feature.\n");
825                 return -1;
826         }
827
828         return 0;
829 }
830
831 /*
832  * Update the global var num_ports and the ports array according to the
833  * number of system ports, and return the number of valid ports.
834  */
835 static unsigned check_ports_num(unsigned nb_ports)
836 {
837         unsigned valid_num_ports = num_ports;
838         unsigned portid;
839
840         if (num_ports > nb_ports) {
841                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
842                         num_ports, nb_ports);
843                 num_ports = nb_ports;
844         }
845
846         for (portid = 0; portid < num_ports; portid ++) {
847                 if (ports[portid] >= nb_ports) {
848                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
849                                 ports[portid], (nb_ports - 1));
850                         ports[portid] = INVALID_PORT_ID;
851                         valid_num_ports--;
852                 }
853         }
854         return valid_num_ports;
855 }
856
857 /*
858  * Macro to print out packet contents. Wrapped in debug define so that the
859  * data path is not affected when debug is disabled.
860  */
861 #ifdef DEBUG
862 #define PRINT_PACKET(device, addr, size, header) do {                  \
863         char *pkt_addr = (char *)(addr);                                \
864         unsigned int index;                                             \
865         char packet[MAX_PRINT_BUFF];                                    \
866                                                                         \
867         if ((header))                                                   \
868                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
869         else                                                            \
870                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
871         for (index = 0; index < (size); index++) {                      \
872                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
873                         "%02hhx ", pkt_addr[index]);                    \
874         }                                                               \
875         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
876                                                                         \
877         LOG_DEBUG(VHOST_DATA, "%s", packet);                            \
878 } while (0)
879 #else
880 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
881 #endif
882
883 /*
884  * Function to convert guest physical addresses to vhost physical addresses.
885  * This is used to convert virtio buffer addresses.
886  */
887 static inline uint64_t __attribute__((always_inline))
888 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
889         uint32_t buf_len, hpa_type *addr_type)
890 {
891         struct virtio_memory_regions_hpa *region;
892         uint32_t regionidx;
893         uint64_t vhost_pa = 0;
894
895         *addr_type = PHYS_ADDR_INVALID;
896
897         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
898                 region = &vdev->regions_hpa[regionidx];
899                 if ((guest_pa >= region->guest_phys_address) &&
900                         (guest_pa <= region->guest_phys_address_end)) {
901                         vhost_pa = region->host_phys_addr_offset + guest_pa;
902                         if (likely((guest_pa + buf_len - 1)
903                                 <= region->guest_phys_address_end))
904                                 *addr_type = PHYS_ADDR_CONTINUOUS;
905                         else
906                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
907                         break;
908                 }
909         }
910
911         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
912                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
913                 (void *)(uintptr_t)vhost_pa);
914
915         return vhost_pa;
916 }
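/*
 * Note that when no region contains guest_pa, gpa_to_hpa() returns 0 and
 * leaves *addr_type as PHYS_ADDR_INVALID, so callers must check addr_type
 * before using the returned address.
 */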
917
918 /*
919  * Compares a packet destination MAC address to a device MAC address.
920  */
921 static inline int __attribute__((always_inline))
922 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
923 {
924         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
925 }
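/*
 * MAC_ADDR_CMP (0xFFFFFFFFFFFF) masks the XOR of the two 64-bit loads down
 * to the low 48 bits, i.e. the six MAC bytes on a little-endian host; the
 * two bytes read beyond each struct ether_addr are discarded by the mask.
 */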
926
927 /*
928  * This function learns the MAC address of the device and registers it,
929  * along with a VLAN tag, with a VMDQ pool.
930  */
931 static int
932 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
933 {
934         struct ether_hdr *pkt_hdr;
935         struct virtio_net_data_ll *dev_ll;
936         struct virtio_net *dev = vdev->dev;
937         int i, ret;
938
939         /* Learn MAC address of guest device from packet */
940         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
941
942         dev_ll = ll_root_used;
943
944         while (dev_ll != NULL) {
945                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
946                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
947                         return -1;
948                 }
949                 dev_ll = dev_ll->next;
950         }
951
952         for (i = 0; i < ETHER_ADDR_LEN; i++)
953                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
954
955         /* vlan_tag currently uses the device_id. */
956         vdev->vlan_tag = vlan_tags[dev->device_fh];
957
958         /* Print out VMDQ registration info. */
959         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
960                 dev->device_fh,
961                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
962                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
963                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
964                 vdev->vlan_tag);
965
966         /* Register the MAC address. */
967         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
968                                 (uint32_t)dev->device_fh + vmdq_pool_base);
969         if (ret)
970                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
971                                         dev->device_fh);
972
973         /* Enable stripping of the vlan tag as we handle routing. */
974         if (vlan_strip)
975                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
976                         (uint16_t)vdev->vmdq_rx_q, 1);
977
978         /* Set device as ready for RX. */
979         vdev->ready = DEVICE_RX;
980
981         return 0;
982 }
983
984 /*
985  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
986  * queue before disabling RX on the device.
987  */
988 static inline void
989 unlink_vmdq(struct vhost_dev *vdev)
990 {
991         unsigned i = 0;
992         unsigned rx_count;
993         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
994
995         if (vdev->ready == DEVICE_RX) {
996                 /*clear MAC and VLAN settings*/
997                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
998                 for (i = 0; i < 6; i++)
999                         vdev->mac_address.addr_bytes[i] = 0;
1000
1001                 vdev->vlan_tag = 0;
1002
1003                 /*Clear out the receive buffers*/
1004                 rx_count = rte_eth_rx_burst(ports[0],
1005                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1006
1007                 while (rx_count) {
1008                         for (i = 0; i < rx_count; i++)
1009                                 rte_pktmbuf_free(pkts_burst[i]);
1010
1011                         rx_count = rte_eth_rx_burst(ports[0],
1012                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1013                 }
1014
1015                 vdev->ready = DEVICE_MAC_LEARNING;
1016         }
1017 }
1018
1019 /*
1020  * Check if the packet destination MAC address is for a local device. If so
1021  * then put the packet on that device's RX queue. If not then return.
1022  */
1023 static inline int __attribute__((always_inline))
1024 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1025 {
1026         struct virtio_net_data_ll *dev_ll;
1027         struct ether_hdr *pkt_hdr;
1028         uint64_t ret = 0;
1029         struct virtio_net *dev = vdev->dev;
1030         struct virtio_net *tdev; /* destination virtio device */
1031
1032         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1033
1034         /*get the used devices list*/
1035         dev_ll = ll_root_used;
1036
1037         while (dev_ll != NULL) {
1038                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1039                                           &dev_ll->vdev->mac_address)) {
1040
1041                         /* Drop the packet if the TX packet is destined for the TX device. */
1042                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1043                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1044                                                         dev->device_fh);
1045                                 return 0;
1046                         }
1047                         tdev = dev_ll->vdev->dev;
1048
1049
1050                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1051
1052                         if (unlikely(dev_ll->vdev->remove)) {
1053                                 /*drop the packet if the device is marked for removal*/
1054                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1055                         } else {
1056                                 /*send the packet to the local virtio device*/
1057                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1058                                 if (enable_stats) {
1059                                         rte_atomic64_add(
1060                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1061                                         1);
1062                                         rte_atomic64_add(
1063                                         &dev_statistics[tdev->device_fh].rx_atomic,
1064                                         ret);
1065                                         dev_statistics[tdev->device_fh].tx_total++;
1066                                         dev_statistics[tdev->device_fh].tx += ret;
1067                                 }
1068                         }
1069
1070                         return 0;
1071                 }
1072                 dev_ll = dev_ll->next;
1073         }
1074
1075         return -1;
1076 }
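/*
 * virtio_tx_local() returns 0 when the packet was consumed locally (either
 * enqueued to the destination virtio device or deliberately dropped) and -1
 * when the destination MAC is not local, in which case the caller sends the
 * packet out of the physical port instead.
 */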
1077
1078 /*
1079  * Check if the destination MAC of a packet is one local VM,
1080  * and get its vlan tag, and offset if it is.
1081  */
1082 static inline int __attribute__((always_inline))
1083 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1084         uint32_t *offset, uint16_t *vlan_tag)
1085 {
1086         struct virtio_net_data_ll *dev_ll = ll_root_used;
1087         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1088
1089         while (dev_ll != NULL) {
1090                 if ((dev_ll->vdev->ready == DEVICE_RX)
1091                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1092                 &dev_ll->vdev->mac_address)) {
1093                         /*
1094                          * Drop the packet if the TX packet is
1095                          * destined for the TX device.
1096                          */
1097                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1098                                 LOG_DEBUG(VHOST_DATA,
1099                                 "(%"PRIu64") TX: Source and destination"
1100                                 " MAC addresses are the same. Dropping "
1101                                 "packet.\n",
1102                                 dev_ll->vdev->dev->device_fh);
1103                                 return -1;
1104                         }
1105
1106                         /*
1107                          * HW vlan strip will reduce the packet length
1108                          * by the length of the vlan tag, so the packet
1109                          * length needs to be restored by adding it back.
1110                          */
1111                         *offset = VLAN_HLEN;
1112                         *vlan_tag =
1113                         (uint16_t)
1114                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1115
1116                         LOG_DEBUG(VHOST_DATA,
1117                         "(%"PRIu64") TX: pkt to local VM device id:"
1118                         "(%"PRIu64") vlan tag: %d.\n",
1119                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1120                         *vlan_tag);
1121
1122                         break;
1123                 }
1124                 dev_ll = dev_ll->next;
1125         }
1126         return 0;
1127 }
1128
1129 /*
1130  * This function routes the TX packet to the correct interface. This may be a local device
1131  * or the physical port.
1132  */
1133 static inline void __attribute__((always_inline))
1134 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1135 {
1136         struct mbuf_table *tx_q;
1137         struct rte_mbuf **m_table;
1138         unsigned len, ret, offset = 0;
1139         const uint16_t lcore_id = rte_lcore_id();
1140         struct virtio_net *dev = vdev->dev;
1141         struct ether_hdr *nh;
1142
1143         /*check if destination is local VM*/
1144         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1145                 rte_pktmbuf_free(m);
1146                 return;
1147         }
1148
1149         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1150                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1151                         rte_pktmbuf_free(m);
1152                         return;
1153                 }
1154         }
1155
1156         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1157
1158         /*Add packet to the port tx queue*/
1159         tx_q = &lcore_tx_queue[lcore_id];
1160         len = tx_q->len;
1161
1162         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1163         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1164                 /* Guest has inserted the vlan tag. */
1165                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1166                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1167                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1168                         (vh->vlan_tci != vlan_tag_be))
1169                         vh->vlan_tci = vlan_tag_be;
1170         } else {
1171                 m->ol_flags = PKT_TX_VLAN_PKT;
1172
1173                 /*
1174                  * Find the right seg to adjust the data len when offset is
1175                  * bigger than tail room size.
1176                  */
1177                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1178                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1179                                 m->data_len += offset;
1180                         else {
1181                                 struct rte_mbuf *seg = m;
1182
1183                                 while ((seg->next != NULL) &&
1184                                         (offset > rte_pktmbuf_tailroom(seg)))
1185                                         seg = seg->next;
1186
1187                                 seg->data_len += offset;
1188                         }
1189                         m->pkt_len += offset;
1190                 }
1191
1192                 m->vlan_tci = vlan_tag;
1193         }
1194
1195         tx_q->m_table[len] = m;
1196         len++;
1197         if (enable_stats) {
1198                 dev_statistics[dev->device_fh].tx_total++;
1199                 dev_statistics[dev->device_fh].tx++;
1200         }
1201
1202         if (unlikely(len == MAX_PKT_BURST)) {
1203                 m_table = (struct rte_mbuf **)tx_q->m_table;
1204                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1205                 /* Free any buffers not handled by TX and update the port stats. */
1206                 if (unlikely(ret < len)) {
1207                         do {
1208                                 rte_pktmbuf_free(m_table[ret]);
1209                         } while (++ret < len);
1210                 }
1211
1212                 len = 0;
1213         }
1214
1215         tx_q->len = len;
1216         return;
1217 }
1218 /*
1219  * This function is called by each data core. It handles all RX/TX registered with the
1220  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1221  * with all devices in the main linked list.
1222  */
1223 static int
1224 switch_worker(void *arg)
1225 {
1226         struct rte_mempool *mbuf_pool = arg;
1227         struct virtio_net *dev = NULL;
1228         struct vhost_dev *vdev = NULL;
1229         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1230         struct virtio_net_data_ll *dev_ll;
1231         struct mbuf_table *tx_q;
1232         volatile struct lcore_ll_info *lcore_ll;
1233         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1234         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1235         unsigned ret, i;
1236         const uint16_t lcore_id = rte_lcore_id();
1237         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1238         uint16_t rx_count = 0;
1239         uint16_t tx_count;
1240         uint32_t retry = 0;
1241
1242         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1243         lcore_ll = lcore_info[lcore_id].lcore_ll;
1244         prev_tsc = 0;
1245
1246         tx_q = &lcore_tx_queue[lcore_id];
1247         for (i = 0; i < num_cores; i ++) {
1248                 if (lcore_ids[i] == lcore_id) {
1249                         tx_q->txq_id = i;
1250                         break;
1251                 }
1252         }
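             /*
              * Each switching core thus transmits on its own TX queue (txq_id
              * is the core's index among the enabled switching cores), so no
              * locking is needed around rte_eth_tx_burst() below.
              */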
1253
1254         while(1) {
1255                 cur_tsc = rte_rdtsc();
1256                 /*
1257                  * TX burst queue drain
1258                  */
1259                 diff_tsc = cur_tsc - prev_tsc;
1260                 if (unlikely(diff_tsc > drain_tsc)) {
1261
1262                         if (tx_q->len) {
1263                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1264
1265                                 /*Tx any packets in the queue*/
1266                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1267                                                                            (struct rte_mbuf **)tx_q->m_table,
1268                                                                            (uint16_t)tx_q->len);
1269                                 if (unlikely(ret < tx_q->len)) {
1270                                         do {
1271                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1272                                         } while (++ret < tx_q->len);
1273                                 }
1274
1275                                 tx_q->len = 0;
1276                         }
1277
1278                         prev_tsc = cur_tsc;
1279
1280                 }
1281
1282                 rte_prefetch0(lcore_ll->ll_root_used);
1283                 /*
1284                  * Inform the configuration core that we have exited the linked list and that no devices are
1285                  * in use if requested.
1286                  */
1287                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1288                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1289
1290                 /*
1291                  * Process devices
1292                  */
1293                 dev_ll = lcore_ll->ll_root_used;
1294
1295                 while (dev_ll != NULL) {
1296                         /* Get the vhost device handles */
1297                         vdev = dev_ll->vdev;
1298                         dev = vdev->dev;
1299
1300                         if (unlikely(vdev->remove)) {
1301                                 dev_ll = dev_ll->next;
1302                                 unlink_vmdq(vdev);
1303                                 vdev->ready = DEVICE_SAFE_REMOVE;
1304                                 continue;
1305                         }
1306                         if (likely(vdev->ready == DEVICE_RX)) {
1307                                 /*Handle guest RX*/
1308                                 rx_count = rte_eth_rx_burst(ports[0],
1309                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1310
1311                                 if (rx_count) {
1312                                         /*
1313                                         * If retry is enabled and the queue is full, wait and retry to avoid packet loss.
1314                                         * Note that MAX_PKT_BURST must be smaller than the virtio queue size.
1315                                         */
1316                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1317                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1318                                                         rte_delay_us(burst_rx_delay_time);
1319                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1320                                                                 break;
1321                                                 }
1322                                         }
1323                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1324                                         if (enable_stats) {
1325                                                 rte_atomic64_add(
1326                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1327                                                 rx_count);
1328                                                 rte_atomic64_add(
1329                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1330                                         }
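                                             /*
                                              * rte_vhost_enqueue_burst() copies
                                              * the packets into guest buffers,
                                              * so every host mbuf can be freed
                                              * here no matter how many packets
                                              * were accepted.
                                              */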
1331                                         while (likely(rx_count)) {
1332                                                 rx_count--;
1333                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1334                                         }
1335
1336                                 }
1337                         }
1338
1339                         if (likely(!vdev->remove)) {
1340                                 /* Handle guest TX*/
1341                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1342                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1343                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1344                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1345                                                 while (tx_count)
1346                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1347                                         }
1348                                 }
1349                                 while (tx_count)
1350                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1351                         }
1352
1353                         /*move to the next device in the list*/
1354                         dev_ll = dev_ll->next;
1355                 }
1356         }
1357
1358         return 0;
1359 }
1360
1361 /*
1362  * This function gets the number of available ring entries for zero copy RX.
1363  * Only one thread will call this function for a particular virtio device,
1364  * so it is deliberately not thread-safe.
1365  */
1366 static inline uint32_t __attribute__((always_inline))
1367 get_available_ring_num_zcp(struct virtio_net *dev)
1368 {
1369         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1370         uint16_t avail_idx;
1371
1372         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
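             /*
              * avail->idx is a free-running 16-bit counter, so this unsigned
              * subtraction gives the correct entry count even across
              * wrap-around.
              */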
1373         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1374 }
1375
1376 /*
1377  * This function gets available ring indexes for zero copy RX, retrying up
1378  * to 'burst_rx_retry_num' times until it has enough of them.
1379  * Only one thread will call this function for a particular virtio device,
1380  * so it is deliberately not thread-safe.
1381  */
1382 static inline uint32_t __attribute__((always_inline))
1383 get_available_ring_index_zcp(struct virtio_net *dev,
1384         uint16_t *res_base_idx, uint32_t count)
1385 {
1386         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1387         uint16_t avail_idx;
1388         uint32_t retry = 0;
1389         uint16_t free_entries;
1390
1391         *res_base_idx = vq->last_used_idx_res;
1392         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1393         free_entries = (avail_idx - *res_base_idx);
1394
1395         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1396                         "avail idx: %d, "
1397                         "res base idx:%d, free entries:%d\n",
1398                         dev->device_fh, avail_idx, *res_base_idx,
1399                         free_entries);
1400
1401         /*
1402          * If retry is enabled and the queue is full then we wait
1403          * and retry to avoid packet loss.
1404          */
1405         if (enable_retry && unlikely(count > free_entries)) {
1406                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1407                         rte_delay_us(burst_rx_delay_time);
1408                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1409                         free_entries = (avail_idx - *res_base_idx);
1410                         if (count <= free_entries)
1411                                 break;
1412                 }
1413         }
1414
1415         /*check that we have enough buffers*/
1416         if (unlikely(count > free_entries))
1417                 count = free_entries;
1418
1419         if (unlikely(count == 0)) {
1420                 LOG_DEBUG(VHOST_DATA,
1421                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1422                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1423                         dev->device_fh, avail_idx,
1424                         *res_base_idx, free_entries);
1425                 return 0;
1426         }
1427
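             /*
              * Reserve the next 'count' entries; the caller consumes them
              * starting at *res_base_idx. Only one thread works on this
              * device, so no atomic update is needed.
              */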
1428         vq->last_used_idx_res = *res_base_idx + count;
1429
1430         return count;
1431 }
1432
1433 /*
1434  * This function puts a descriptor back on the used list.
1435  */
1436 static inline void __attribute__((always_inline))
1437 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1438 {
1439         uint16_t res_cur_idx = vq->last_used_idx;
1440         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1441         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
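             /*
              * Make sure the used ring entry is fully written before the
              * index update below publishes it to the guest.
              */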
1442         rte_compiler_barrier();
1443         *(volatile uint16_t *)&vq->used->idx += 1;
1444         vq->last_used_idx += 1;
1445
1446         /* Kick the guest if necessary. */
1447         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1448                 eventfd_write((int)vq->kickfd, 1);
1449 }
1450
1451 /*
1452  * This function gets an available descriptor from the virtio vring and an
1453  * unattached mbuf from vpool->ring, and then attaches them together. It must
1454  * adjust the offsets of buff_addr and phys_addr according to the PMD
1455  * implementation, otherwise the frame data may land at the wrong spot in the mbuf.
1456  */
1457 static inline void __attribute__((always_inline))
1458 attach_rxmbuf_zcp(struct virtio_net *dev)
1459 {
1460         uint16_t res_base_idx, desc_idx;
1461         uint64_t buff_addr, phys_addr;
1462         struct vhost_virtqueue *vq;
1463         struct vring_desc *desc;
1464         struct rte_mbuf *mbuf = NULL;
1465         struct vpool *vpool;
1466         hpa_type addr_type;
1467         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1468
1469         vpool = &vpool_array[vdev->vmdq_rx_q];
1470         vq = dev->virtqueue[VIRTIO_RXQ];
1471
1472         do {
1473                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1474                                 1) != 1))
1475                         return;
1476                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1477
1478                 desc = &vq->desc[desc_idx];
1479                 if (desc->flags & VRING_DESC_F_NEXT) {
1480                         desc = &vq->desc[desc->next];
1481                         buff_addr = gpa_to_vva(dev, desc->addr);
1482                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1483                                         &addr_type);
1484                 } else {
1485                         buff_addr = gpa_to_vva(dev,
1486                                         desc->addr + vq->vhost_hlen);
1487                         phys_addr = gpa_to_hpa(vdev,
1488                                         desc->addr + vq->vhost_hlen,
1489                                         desc->len, &addr_type);
1490                 }
1491
1492                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1493                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1494                                 " address found when attaching RX frame buffer"
1495                                 " address!\n", dev->device_fh);
1496                         put_desc_to_used_list_zcp(vq, desc_idx);
1497                         continue;
1498                 }
1499
1500                 /*
1501                  * Check if the frame buffer address from guest crosses
1502                  * sub-region or not.
1503                  */
1504                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1505                         RTE_LOG(ERR, VHOST_DATA,
1506                                 "(%"PRIu64") Frame buffer address cross "
1507                                 "sub-regioin found when attaching RX frame "
1508                                 "buffer address!\n",
1509                                 dev->device_fh);
1510                         put_desc_to_used_list_zcp(vq, desc_idx);
1511                         continue;
1512                 }
1513         } while (unlikely(phys_addr == 0));
1514
1515         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1516         if (unlikely(mbuf == NULL)) {
1517                 LOG_DEBUG(VHOST_DATA,
1518                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1519                         "ring_sc_dequeue fail.\n",
1520                         dev->device_fh);
1521                 put_desc_to_used_list_zcp(vq, desc_idx);
1522                 return;
1523         }
1524
1525         if (unlikely(vpool->buf_size > desc->len)) {
1526                 LOG_DEBUG(VHOST_DATA,
1527                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1528                         "length(%d) of descriptor idx: %d less than room "
1529                         "size required: %d\n",
1530                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1531                 put_desc_to_used_list_zcp(vq, desc_idx);
1532                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1533                 return;
1534         }
1535
1536         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1537         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1538         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1539         mbuf->data_len = desc->len;
1540         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
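             /*
              * The descriptor index is stashed in the mbuf headroom above so
              * the RX completion path (virtio_dev_rx_zcp) can recover which
              * guest descriptor this frame was placed in.
              */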
1541
1542         LOG_DEBUG(VHOST_DATA,
1543                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1544                 "descriptor idx:%d\n",
1545                 dev->device_fh, res_base_idx, desc_idx);
1546
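             /*
              * "Raw-freeing" the mbuf returns it, with the guest buffer now
              * attached, to vpool->pool, i.e. the mempool backing this RX
              * queue, so the PMD can DMA incoming frames straight into guest
              * memory.
              */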
1547         __rte_mbuf_raw_free(mbuf);
1548
1549         return;
1550 }
1551
1552 /*
1553  * Detach an attached packet mbuf -
1554  *  - restore original mbuf address and length values.
1555  *  - reset pktmbuf data and data_len to their default values.
1556  *  All other fields of the given packet mbuf will be left intact.
1557  *
1558  * @param m
1559  *   The attached packet mbuf.
1560  */
1561 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1562 {
1563         const struct rte_mempool *mp = m->pool;
1564         void *buf = RTE_MBUF_TO_BADDR(m);
1565         uint32_t buf_ofs;
1566         uint32_t buf_len = mp->elt_size - sizeof(*m);
1567         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
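             /*
              * The data buffer sits directly after the mbuf header within the
              * mempool element, so virt2phy(m) + sizeof(*m) restores the
              * original buf_physaddr.
              */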
1568
1569         m->buf_addr = buf;
1570         m->buf_len = (uint16_t)buf_len;
1571
1572         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1573                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1574         m->data_off = buf_ofs;
1575
1576         m->data_len = 0;
1577 }
1578
1579 /*
1580  * This function is called after packets have been transmitted. It fetches
1581  * each mbuf from vpool->pool, detaches it and puts it back into vpool->ring.
1582  * It also updates the used index and kicks the guest if necessary.
1583  */
1584 static inline uint32_t __attribute__((always_inline))
1585 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1586 {
1587         struct rte_mbuf *mbuf;
1588         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1589         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1590         uint32_t index = 0;
1591         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1592
1593         LOG_DEBUG(VHOST_DATA,
1594                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1595                 "clean is: %d\n",
1596                 dev->device_fh, mbuf_count);
1597         LOG_DEBUG(VHOST_DATA,
1598                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1599                 "clean  is : %d\n",
1600                 dev->device_fh, rte_ring_count(vpool->ring));
1601
1602         for (index = 0; index < mbuf_count; index++) {
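                     /*
                      * Mbufs freed by the PMD after TX completion return to
                      * vpool->pool; pull each one out, detach the guest buffer
                      * and complete the guest descriptor recorded in its
                      * headroom.
                      */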
1603                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1604                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1605                         pktmbuf_detach_zcp(mbuf);
1606                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1607
1608                 /* Update used index buffer information. */
1609                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1610                 vq->used->ring[used_idx].len = 0;
1611
1612                 used_idx = (used_idx + 1) & (vq->size - 1);
1613         }
1614
1615         LOG_DEBUG(VHOST_DATA,
1616                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1617                 "clean is: %d\n",
1618                 dev->device_fh, rte_mempool_count(vpool->pool));
1619         LOG_DEBUG(VHOST_DATA,
1620                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1621                 "clean  is : %d\n",
1622                 dev->device_fh, rte_ring_count(vpool->ring));
1623         LOG_DEBUG(VHOST_DATA,
1624                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1625                 "vq->last_used_idx:%d\n",
1626                 dev->device_fh, vq->last_used_idx);
1627
1628         vq->last_used_idx += mbuf_count;
1629
1630         LOG_DEBUG(VHOST_DATA,
1631                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1632                 "vq->last_used_idx:%d\n",
1633                 dev->device_fh, vq->last_used_idx);
1634
1635         rte_compiler_barrier();
1636
1637         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1638
1639         /* Kick guest if required. */
1640         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1641                 eventfd_write((int)vq->kickfd, 1);
1642
1643         return 0;
1644 }
1645
1646 /*
1647  * This function is called when a virtio device is destroyed. It fetches each
1648  * mbuf from vpool->pool, detaches it and puts it back into vpool->ring.
1649  */
1650 static void mbuf_destroy_zcp(struct vpool *vpool)
1651 {
1652         struct rte_mbuf *mbuf = NULL;
1653         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1654
1655         LOG_DEBUG(VHOST_CONFIG,
1656                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1657                 "mbuf_destroy_zcp is: %d\n",
1658                 mbuf_count);
1659         LOG_DEBUG(VHOST_CONFIG,
1660                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1661                 "mbuf_destroy_zcp  is : %d\n",
1662                 rte_ring_count(vpool->ring));
1663
1664         for (index = 0; index < mbuf_count; index++) {
1665                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1666                 if (likely(mbuf != NULL)) {
1667                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1668                                 pktmbuf_detach_zcp(mbuf);
1669                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1670                 }
1671         }
1672
1673         LOG_DEBUG(VHOST_CONFIG,
1674                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1675                 "mbuf_destroy_zcp is: %d\n",
1676                 rte_mempool_count(vpool->pool));
1677         LOG_DEBUG(VHOST_CONFIG,
1678                 "in mbuf_destroy_zcp: mbuf count in ring after "
1679                 "mbuf_destroy_zcp is : %d\n",
1680                 rte_ring_count(vpool->ring));
1681 }
1682
1683 /*
1684  * This function completes zero copy RX: it updates the used ring and copies a virtio header for each packet.
1685  */
1686 static inline uint32_t __attribute__((always_inline))
1687 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1688         uint32_t count)
1689 {
1690         struct vhost_virtqueue *vq;
1691         struct vring_desc *desc;
1692         struct rte_mbuf *buff;
1693         /* The virtio_hdr is initialised to 0. */
1694         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1695                 = {{0, 0, 0, 0, 0, 0}, 0};
1696         uint64_t buff_hdr_addr = 0;
1697         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1698         uint32_t head_idx, packet_success = 0;
1699         uint16_t res_cur_idx;
1700
1701         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1702
1703         if (count == 0)
1704                 return 0;
1705
1706         vq = dev->virtqueue[VIRTIO_RXQ];
1707         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1708
1709         res_cur_idx = vq->last_used_idx;
1710         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1711                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1712
1713         /* Retrieve all of the head indexes first to avoid caching issues. */
1714         for (head_idx = 0; head_idx < count; head_idx++)
1715                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
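             /*
              * Each head index was stashed in the mbuf headroom by
              * attach_rxmbuf_zcp() when the guest buffer was attached.
              */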
1716
1717         /*Prefetch descriptor index. */
1718         rte_prefetch0(&vq->desc[head[packet_success]]);
1719
1720         while (packet_success != count) {
1721                 /* Get descriptor from available ring */
1722                 desc = &vq->desc[head[packet_success]];
1723
1724                 buff = pkts[packet_success];
1725                 LOG_DEBUG(VHOST_DATA,
1726                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1727                         "pkt[%d] descriptor idx: %d\n",
1728                         dev->device_fh, packet_success,
1729                         MBUF_HEADROOM_UINT32(buff));
1730
1731                 PRINT_PACKET(dev,
1732                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1733                         + RTE_PKTMBUF_HEADROOM),
1734                         rte_pktmbuf_data_len(buff), 0);
1735
1736                 /* Buffer address translation for virtio header. */
1737                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1738                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1739
1740                 /*
1741                  * If the descriptors are chained the header and data are
1742                  * placed in separate buffers.
1743                  */
1744                 if (desc->flags & VRING_DESC_F_NEXT) {
1745                         desc->len = vq->vhost_hlen;
1746                         desc = &vq->desc[desc->next];
1747                         desc->len = rte_pktmbuf_data_len(buff);
1748                 } else {
1749                         desc->len = packet_len;
1750                 }
1751
1752                 /* Update used ring with desc information */
1753                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1754                         = head[packet_success];
1755                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1756                         = packet_len;
1757                 res_cur_idx++;
1758                 packet_success++;
1759
1760                 /* A header is required per buffer. */
1761                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1762                         (const void *)&virtio_hdr, vq->vhost_hlen);
1763
1764                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1765
1766                 if (likely(packet_success < count)) {
1767                         /* Prefetch descriptor index. */
1768                         rte_prefetch0(&vq->desc[head[packet_success]]);
1769                 }
1770         }
1771
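             /*
              * Ensure all used ring updates are visible before publishing the
              * new used index to the guest.
              */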
1772         rte_compiler_barrier();
1773
1774         LOG_DEBUG(VHOST_DATA,
1775                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1776                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1777                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1778
1779         *(volatile uint16_t *)&vq->used->idx += count;
1780         vq->last_used_idx += count;
1781
1782         LOG_DEBUG(VHOST_DATA,
1783                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1784                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1785                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1786
1787         /* Kick the guest if necessary. */
1788         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1789                 eventfd_write((int)vq->kickfd, 1);
1790
1791         return count;
1792 }
1793
1794 /*
1795  * This function routes the TX packet to the correct interface.
1796  * This may be a local device or the physical port.
1797  */
1798 static inline void __attribute__((always_inline))
1799 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1800         uint32_t desc_idx, uint8_t need_copy)
1801 {
1802         struct mbuf_table *tx_q;
1803         struct rte_mbuf **m_table;
1804         struct rte_mbuf *mbuf = NULL;
1805         unsigned len, ret, offset = 0;
1806         struct vpool *vpool;
1807         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1808         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1809
1810         /*Add packet to the port tx queue*/
1811         tx_q = &tx_queue_zcp[vmdq_rx_q];
1812         len = tx_q->len;
1813
1814         /* Allocate an mbuf and populate the structure. */
1815         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1816         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1817         if (unlikely(mbuf == NULL)) {
1818                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1819                 RTE_LOG(ERR, VHOST_DATA,
1820                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1821                         dev->device_fh);
1822                 put_desc_to_used_list_zcp(vq, desc_idx);
1823                 return;
1824         }
1825
1826         if (vm2vm_mode == VM2VM_HARDWARE) {
1827                 /* Avoid using a VLAN tag from any VM for an external packet,
1828                  * such as vlan_tags[dev->device_fh]; otherwise it conflicts with
1829                  * pool selection: the MAC address marks it as an external packet
1830                  * that should go out to the network, while the VLAN tag marks it
1831                  * as a VM2VM packet to be forwarded to another VM. The hardware
1832                  * cannot resolve such an ambiguity, so the packet would be lost.
1833                  */
1834                 vlan_tag = external_pkt_default_vlan_tag;
1835                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1836                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1837                         __rte_mbuf_raw_free(mbuf);
1838                         return;
1839                 }
1840         }
1841
1842         mbuf->nb_segs = m->nb_segs;
1843         mbuf->next = m->next;
1844         mbuf->data_len = m->data_len + offset;
1845         mbuf->pkt_len = mbuf->data_len;
1846         if (unlikely(need_copy)) {
1847                 /* Copy the packet contents to the mbuf. */
1848                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1849                         rte_pktmbuf_mtod(m, void *),
1850                         m->data_len);
1851         } else {
1852                 mbuf->data_off = m->data_off;
1853                 mbuf->buf_physaddr = m->buf_physaddr;
1854                 mbuf->buf_addr = m->buf_addr;
1855         }
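             /*
              * In the zero copy case the fresh mbuf simply aliases the guest
              * buffer: only metadata is copied, the frame payload stays in
              * guest memory.
              */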
1856         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1857         mbuf->vlan_tci = vlan_tag;
1858         mbuf->l2_len = sizeof(struct ether_hdr);
1859         mbuf->l3_len = sizeof(struct ipv4_hdr);
1860         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1861
1862         tx_q->m_table[len] = mbuf;
1863         len++;
1864
1865         LOG_DEBUG(VHOST_DATA,
1866                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1867                 dev->device_fh,
1868                 mbuf->nb_segs,
1869                 (mbuf->next == NULL) ? "null" : "non-null");
1870
1871         if (enable_stats) {
1872                 dev_statistics[dev->device_fh].tx_total++;
1873                 dev_statistics[dev->device_fh].tx++;
1874         }
1875
1876         if (unlikely(len == MAX_PKT_BURST)) {
1877                 m_table = (struct rte_mbuf **)tx_q->m_table;
1878                 ret = rte_eth_tx_burst(ports[0],
1879                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1880
1881                 /*
1882                  * Free any buffers not handled by TX and update
1883                  * the port stats.
1884                  */
1885                 if (unlikely(ret < len)) {
1886                         do {
1887                                 rte_pktmbuf_free(m_table[ret]);
1888                         } while (++ret < len);
1889                 }
1890
1891                 len = 0;
1892                 txmbuf_clean_zcp(dev, vpool);
1893         }
1894
1895         tx_q->len = len;
1896
1897         return;
1898 }
1899
1900 /*
1901  * This function transmits all available packets in the virtio TX queue for
1902  * one virtio-net device. For the first packet it learns the MAC address and
1903  * sets up VMDQ.
1904  */
1905 static inline void __attribute__((always_inline))
1906 virtio_dev_tx_zcp(struct virtio_net *dev)
1907 {
1908         struct rte_mbuf m;
1909         struct vhost_virtqueue *vq;
1910         struct vring_desc *desc;
1911         uint64_t buff_addr = 0, phys_addr;
1912         uint32_t head[MAX_PKT_BURST];
1913         uint32_t i;
1914         uint16_t free_entries, packet_success = 0;
1915         uint16_t avail_idx;
1916         uint8_t need_copy = 0;
1917         hpa_type addr_type;
1918         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1919
1920         vq = dev->virtqueue[VIRTIO_TXQ];
1921         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1922
1923         /* If there are no available buffers then return. */
1924         if (vq->last_used_idx_res == avail_idx)
1925                 return;
1926
1927         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1928
1929         /* Prefetch available ring to retrieve head indexes. */
1930         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1931
1932         /* Get the number of free entries in the ring */
1933         free_entries = (avail_idx - vq->last_used_idx_res);
1934
1935         /* Limit to MAX_PKT_BURST. */
1936         free_entries
1937                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1938
1939         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1940                 dev->device_fh, free_entries);
1941
1942         /* Retrieve all of the head indexes first to avoid caching issues. */
1943         for (i = 0; i < free_entries; i++)
1944                 head[i]
1945                         = vq->avail->ring[(vq->last_used_idx_res + i)
1946                         & (vq->size - 1)];
1947
1948         vq->last_used_idx_res += free_entries;
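             /*
              * All head indexes retrieved above are now reserved; each
              * descriptor is handed to virtio_tx_route_zcp() below and
              * completed later from the TX cleanup path.
              */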
1949
1950         /* Prefetch descriptor index. */
1951         rte_prefetch0(&vq->desc[head[packet_success]]);
1952         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1953
1954         while (packet_success < free_entries) {
1955                 desc = &vq->desc[head[packet_success]];
1956
1957                 /* Discard first buffer as it is the virtio header */
1958                 desc = &vq->desc[desc->next];
1959
1960                 /* Buffer address translation. */
1961                 buff_addr = gpa_to_vva(dev, desc->addr);
1962                 /* Check an extra VLAN_HLEN bytes, since inserting a VLAN tag grows the frame */
1963                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1964                         &addr_type);
1965
1966                 if (likely(packet_success < (free_entries - 1)))
1967                         /* Prefetch descriptor index. */
1968                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1969
1970                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1971                         RTE_LOG(ERR, VHOST_DATA,
1972                                 "(%"PRIu64") Invalid frame buffer address found"
1973                                 "when TX packets!\n",
1974                                 dev->device_fh);
1975                         packet_success++;
1976                         continue;
1977                 }
1978
1979                 /* Prefetch buffer address. */
1980                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1981
1982                 /*
1983                  * Setup dummy mbuf. This is copied to a real mbuf if
1984                  * transmitted out the physical port.
1985                  */
1986                 m.data_len = desc->len;
1987                 m.nb_segs = 1;
1988                 m.next = NULL;
1989                 m.data_off = 0;
1990                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1991                 m.buf_physaddr = phys_addr;
1992
1993                 /*
1994                  * Check if the frame buffer address from guest crosses
1995                  * sub-region or not.
1996                  */
1997                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1998                         RTE_LOG(ERR, VHOST_DATA,
1999                                 "(%"PRIu64") Frame buffer address cross "
2000                                 "sub-regioin found when attaching TX frame "
2001                                 "buffer address!\n",
2002                                 dev->device_fh);
2003                         need_copy = 1;
2004                 } else
2005                         need_copy = 0;
2006
2007                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2008
2009                 /*
2010                  * If this is the first received packet we need to learn
2011                  * the MAC and setup VMDQ
2012                  */
2013                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2014                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2015                                 /*
2016                                  * Discard frame if device is scheduled for
2017                                  * removal or a duplicate MAC address is found.
2018                                  */
2019                                 packet_success += free_entries;
2020                                 vq->last_used_idx += packet_success;
2021                                 break;
2022                         }
2023                 }
2024
2025                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2026                 packet_success++;
2027         }
2028 }
2029
2030 /*
2031  * This function is called by each data core. It handles all RX/TX registered
2032  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2033  * addresses are compared with all devices in the main linked list.
2034  */
2035 static int
2036 switch_worker_zcp(__attribute__((unused)) void *arg)
2037 {
2038         struct virtio_net *dev = NULL;
2039         struct vhost_dev  *vdev = NULL;
2040         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2041         struct virtio_net_data_ll *dev_ll;
2042         struct mbuf_table *tx_q;
2043         volatile struct lcore_ll_info *lcore_ll;
2044         const uint64_t drain_tsc
2045                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2046                 * BURST_TX_DRAIN_US;
2047         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2048         unsigned ret;
2049         const uint16_t lcore_id = rte_lcore_id();
2050         uint16_t count_in_ring, rx_count = 0;
2051
2052         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2053
2054         lcore_ll = lcore_info[lcore_id].lcore_ll;
2055         prev_tsc = 0;
2056
2057         while (1) {
2058                 cur_tsc = rte_rdtsc();
2059
2060                 /* TX burst queue drain */
2061                 diff_tsc = cur_tsc - prev_tsc;
2062                 if (unlikely(diff_tsc > drain_tsc)) {
2063                         /*
2064                          * Get mbuf from vpool.pool and detach mbuf and
2065                          * put back into vpool.ring.
2066                          */
2067                         dev_ll = lcore_ll->ll_root_used;
2068                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2069                                 /* Get the vhost device handles */
2070                                 vdev = dev_ll->vdev;
2071                                 dev = vdev->dev;
2072
2073                                 if (likely(!vdev->remove)) {
2074                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2075                                         if (tx_q->len) {
2076                                                 LOG_DEBUG(VHOST_DATA,
2077                                                 "TX queue drained after timeout"
2078                                                 " with burst size %u\n",
2079                                                 tx_q->len);
2080
2081                                                 /*
2082                                                  * Tx any packets in the queue
2083                                                  */
2084                                                 ret = rte_eth_tx_burst(
2085                                                         ports[0],
2086                                                         (uint16_t)tx_q->txq_id,
2087                                                         (struct rte_mbuf **)
2088                                                         tx_q->m_table,
2089                                                         (uint16_t)tx_q->len);
2090                                                 if (unlikely(ret < tx_q->len)) {
2091                                                         do {
2092                                                                 rte_pktmbuf_free(
2093                                                                         tx_q->m_table[ret]);
2094                                                         } while (++ret < tx_q->len);
2095                                                 }
2096                                                 tx_q->len = 0;
2097
2098                                                 txmbuf_clean_zcp(dev,
2099                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2100                                         }
2101                                 }
2102                                 dev_ll = dev_ll->next;
2103                         }
2104                         prev_tsc = cur_tsc;
2105                 }
2106
2107                 rte_prefetch0(lcore_ll->ll_root_used);
2108
2109                 /*
2110                  * Inform the configuration core that we have exited the linked
2111                  * list and that no devices are in use if requested.
2112                  */
2113                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2114                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2115
2116                 /* Process devices */
2117                 dev_ll = lcore_ll->ll_root_used;
2118
2119                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2120                         vdev = dev_ll->vdev;
2121                         dev  = vdev->dev;
2122                         if (unlikely(vdev->remove)) {
2123                                 dev_ll = dev_ll->next;
2124                                 unlink_vmdq(vdev);
2125                                 vdev->ready = DEVICE_SAFE_REMOVE;
2126                                 continue;
2127                         }
2128
2129                         if (likely(vdev->ready == DEVICE_RX)) {
2130                                 uint32_t index = vdev->vmdq_rx_q;
2131                                 uint16_t i;
2132                                 count_in_ring =
2133                                         rte_ring_count(vpool_array[index].ring);
2134                                 uint16_t free_entries = (uint16_t)
2135                                         get_available_ring_num_zcp(dev);
2136
2137                                 /*
2138                                  * Attach all mbufs in vpool.ring and put back
2139                                  * into vpool.pool.
2140                                  */
2141                                 for (i = 0;
2142                                         i < RTE_MIN(free_entries,
2143                                         RTE_MIN(count_in_ring, MAX_PKT_BURST));
2144                                         i++)
2145                                         attach_rxmbuf_zcp(dev);
2146
2147                                 /* Handle guest RX */
2148                                 rx_count = rte_eth_rx_burst(ports[0],
2149                                         vdev->vmdq_rx_q, pkts_burst,
2150                                         MAX_PKT_BURST);
2151
2152                                 if (rx_count) {
2153                                         ret_count = virtio_dev_rx_zcp(dev,
2154                                                         pkts_burst, rx_count);
2155                                         if (enable_stats) {
2156                                                 dev_statistics[dev->device_fh].rx_total
2157                                                         += rx_count;
2158                                                 dev_statistics[dev->device_fh].rx
2159                                                         += ret_count;
2160                                         }
2161                                         while (likely(rx_count)) {
2162                                                 rx_count--;
2163                                                 pktmbuf_detach_zcp(
2164                                                         pkts_burst[rx_count]);
2165                                                 rte_ring_sp_enqueue(
2166                                                         vpool_array[index].ring,
2167                                                         (void *)pkts_burst[rx_count]);
2168                                         }
2169                                 }
2170                         }
2171
2172                         if (likely(!vdev->remove))
2173                                 /* Handle guest TX */
2174                                 virtio_dev_tx_zcp(dev);
2175
2176                         /* Move to the next device in the list */
2177                         dev_ll = dev_ll->next;
2178                 }
2179         }
2180
2181         return 0;
2182 }
2183
2184
2185 /*
2186  * Add an entry to a used linked list. A free entry must first be found
2187  * in the free linked list using get_data_ll_free_entry();
2188  */
2189 static void
2190 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2191         struct virtio_net_data_ll *ll_dev)
2192 {
2193         struct virtio_net_data_ll *ll = *ll_root_addr;
2194
2195         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2196         ll_dev->next = NULL;
2197         rte_compiler_barrier();
2198
2199         /* If ll == NULL then this is the first device. */
2200         if (ll) {
2201                 /* Increment to the tail of the linked list. */
2202                 while (ll->next != NULL)
2203                         ll = ll->next;
2204
2205                 ll->next = ll_dev;
2206         } else {
2207                 *ll_root_addr = ll_dev;
2208         }
2209 }
2210
2211 /*
2212  * Remove an entry from a used linked list. The entry must then be added to
2213  * the free linked list using put_data_ll_free_entry().
2214  */
2215 static void
2216 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2217         struct virtio_net_data_ll *ll_dev,
2218         struct virtio_net_data_ll *ll_dev_last)
2219 {
2220         struct virtio_net_data_ll *ll = *ll_root_addr;
2221
2222         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2223                 return;
2224
2225         if (ll_dev == ll)
2226                 *ll_root_addr = ll_dev->next;
2227         else
2228                 if (likely(ll_dev_last != NULL))
2229                         ll_dev_last->next = ll_dev->next;
2230                 else
2231                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2232 }
2233
2234 /*
2235  * Find and return an entry from the free linked list.
2236  */
2237 static struct virtio_net_data_ll *
2238 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2239 {
2240         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2241         struct virtio_net_data_ll *ll_dev;
2242
2243         if (ll_free == NULL)
2244                 return NULL;
2245
2246         ll_dev = ll_free;
2247         *ll_root_addr = ll_free->next;
2248
2249         return ll_dev;
2250 }
2251
2252 /*
2253  * Place an entry back on to the free linked list.
2254  */
2255 static void
2256 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2257         struct virtio_net_data_ll *ll_dev)
2258 {
2259         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2260
2261         if (ll_dev == NULL)
2262                 return;
2263
2264         ll_dev->next = ll_free;
2265         *ll_root_addr = ll_dev;
2266 }
2267
2268 /*
2269  * Creates a linked list of a given size.
2270  */
2271 static struct virtio_net_data_ll *
2272 alloc_data_ll(uint32_t size)
2273 {
2274         struct virtio_net_data_ll *ll_new;
2275         uint32_t i;
2276
2277         /* Malloc and then chain the linked list. */
2278         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2279         if (ll_new == NULL) {
2280                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2281                 return NULL;
2282         }
2283
2284         for (i = 0; i < size - 1; i++) {
2285                 ll_new[i].vdev = NULL;
2286                 ll_new[i].next = &ll_new[i+1];
2287         }
2288         ll_new[i].next = NULL;
2289
2290         return (ll_new);
2291 }
2292
2293 /*
2294  * Create the main linked list along with each individual core's linked list. A used and a free list
2295  * are created to manage entries.
2296  */
2297 static int
2298 init_data_ll (void)
2299 {
2300         int lcore;
2301
2302         RTE_LCORE_FOREACH_SLAVE(lcore) {
2303                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2304                 if (lcore_info[lcore].lcore_ll == NULL) {
2305                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2306                         return -1;
2307                 }
2308
2309                 lcore_info[lcore].lcore_ll->device_num = 0;
2310                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2311                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
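                     /*
                      * Size each core's free list as
                      * ceil(num_devices / num_switching_cores) so every device
                      * can be placed even when the division is uneven.
                      */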
2312                 if (num_devices % num_switching_cores)
2313                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2314                 else
2315                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2316         }
2317
2318         /* Allocate devices up to a maximum of MAX_DEVICES. */
2319         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2320
2321         return 0;
2322 }
2323
2324 /*
2325  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2326  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2327  * of dev->remove=1, which could otherwise cause an infinite loop in the rte_pause loop.
2328  */
2329 static void
2330 destroy_device (volatile struct virtio_net *dev)
2331 {
2332         struct virtio_net_data_ll *ll_lcore_dev_cur;
2333         struct virtio_net_data_ll *ll_main_dev_cur;
2334         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2335         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2336         struct vhost_dev *vdev;
2337         int lcore;
2338
2339         dev->flags &= ~VIRTIO_DEV_RUNNING;
2340
2341         vdev = (struct vhost_dev *)dev->priv;
2342         /*set the remove flag. */
2343         vdev->remove = 1;
2344         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2345                 rte_pause();
2346         }
2347
2348         /* Search for entry to be removed from lcore ll */
2349         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2350         while (ll_lcore_dev_cur != NULL) {
2351                 if (ll_lcore_dev_cur->vdev == vdev) {
2352                         break;
2353                 } else {
2354                         ll_lcore_dev_last = ll_lcore_dev_cur;
2355                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2356                 }
2357         }
2358
2359         if (ll_lcore_dev_cur == NULL) {
2360                 RTE_LOG(ERR, VHOST_CONFIG,
2361                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2362                         dev->device_fh);
2363                 return;
2364         }
2365
2366         /* Search for entry to be removed from main ll */
2367         ll_main_dev_cur = ll_root_used;
2368         ll_main_dev_last = NULL;
2369         while (ll_main_dev_cur != NULL) {
2370                 if (ll_main_dev_cur->vdev == vdev) {
2371                         break;
2372                 } else {
2373                         ll_main_dev_last = ll_main_dev_cur;
2374                         ll_main_dev_cur = ll_main_dev_cur->next;
2375                 }
2376         }
2377
2378         /* Remove entries from the lcore and main ll. */
2379         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2380         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2381
2382         /* Set the dev_removal_flag on each lcore. */
2383         RTE_LCORE_FOREACH_SLAVE(lcore) {
2384                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2385         }
2386
2387         /*
2388          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2389          * they can no longer access the device removed from the linked lists and that the devices
2390          * are no longer in use.
2391          */
2392         RTE_LCORE_FOREACH_SLAVE(lcore) {
2393                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2394                         rte_pause();
2395                 }
2396         }
2397
2398         /* Add the entries back to the lcore and main free ll.*/
2399         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2400         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2401
2402         /* Decrement number of device on the lcore. */
2403         lcore_info[vdev->coreid].lcore_ll->device_num--;
2404
2405         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2406
2407         if (zero_copy) {
2408                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2409
2410                 /* Stop the RX queue. */
2411                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2412                         LOG_DEBUG(VHOST_CONFIG,
2413                                 "(%"PRIu64") In destroy_device: Failed to stop "
2414                                 "rx queue:%d\n",
2415                                 dev->device_fh,
2416                                 vdev->vmdq_rx_q);
2417                 }
2418
2419                 LOG_DEBUG(VHOST_CONFIG,
2420                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2421                         "mempool back to ring for RX queue: %d\n",
2422                         dev->device_fh, vdev->vmdq_rx_q);
2423
2424                 mbuf_destroy_zcp(vpool);
2425
2426                 /* Stop the TX queue. */
2427                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2428                         LOG_DEBUG(VHOST_CONFIG,
2429                                 "(%"PRIu64") In destroy_device: Failed to "
2430                                 "stop tx queue:%d\n",
2431                                 dev->device_fh, vdev->vmdq_rx_q);
2432                 }
2433
2434                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2435
2436                 LOG_DEBUG(VHOST_CONFIG,
2437                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2438                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2439                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2440                         dev->device_fh);
2441
2442                 mbuf_destroy_zcp(vpool);
2443                 rte_free(vdev->regions_hpa);
2444         }
2445         rte_free(vdev);
2446
2447 }
2448
2449 /*
2450  * Calculate the number of physically contiguous sub-regions within one
2451  * particular region whose vhost virtual address range is contiguous. The
2452  * region starts at vva_start and spans 'size' bytes.
2453  */
2454 static uint32_t
2455 check_hpa_regions(uint64_t vva_start, uint64_t size)
2456 {
2457         uint32_t i, nregions = 0, page_size = getpagesize();
2458         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2459         if (vva_start % page_size) {
2460                 LOG_DEBUG(VHOST_CONFIG,
2461                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2462                         "has remainder\n",
2463                         (void *)(uintptr_t)vva_start, page_size);
2464                 return 0;
2465         }
2466         if (size % page_size) {
2467                 LOG_DEBUG(VHOST_CONFIG,
2468                         "in check_hpa_regions: "
2469                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2470                         size, page_size);
2471                 return 0;
2472         }
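             /*
              * Walk the region one page at a time; every spot where the next
              * page's physical address does not immediately follow the current
              * page starts a new physically contiguous sub-region.
              */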
2473         for (i = 0; i < size - page_size; i = i + page_size) {
2474                 cur_phys_addr
2475                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2476                 next_phys_addr = rte_mem_virt2phy(
2477                         (void *)(uintptr_t)(vva_start + i + page_size));
2478                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2479                         ++nregions;
2480                         LOG_DEBUG(VHOST_CONFIG,
2481                                 "in check_hpa_regions: hva addr:(%p) is not "
2482                                 "continuous with hva addr:(%p), diff:%d\n",
2483                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2484                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2485                                 + page_size), page_size);
2486                         LOG_DEBUG(VHOST_CONFIG,
2487                                 "in check_hpa_regions: hpa addr:(%p) is not "
2488                                 "continuous with hpa addr:(%p), "
2489                                 "diff:(%"PRIu64")\n",
2490                                 (void *)(uintptr_t)cur_phys_addr,
2491                                 (void *)(uintptr_t)next_phys_addr,
2492                                 (next_phys_addr-cur_phys_addr));
2493                 }
2494         }
2495         return nregions;
2496 }
2497
2498 /*
2499  * Divide each region whose vhost virtual address range is contiguous into
2500  * sub-regions, ensuring that the physical addresses within each sub-region
2501  * are contiguous, and fill the offset (to GPA), size and other information of each
2502  * sub-region into regions_hpa.
2503  */
2504 static uint32_t
2505 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2506 {
2507         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2508         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2509
2510         if (mem_region_hpa == NULL)
2511                 return 0;
2512
2513         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2514                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2515                         virtio_memory->regions[regionidx].address_offset;
2516                 mem_region_hpa[regionidx_hpa].guest_phys_address
2517                         = virtio_memory->regions[regionidx].guest_phys_address;
2518                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2519                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2520                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2521                 LOG_DEBUG(VHOST_CONFIG,
2522                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2523                         regionidx_hpa,
2524                         (void *)(uintptr_t)
2525                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2526                 LOG_DEBUG(VHOST_CONFIG,
2527                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2528                         regionidx_hpa,
2529                         (void *)(uintptr_t)
2530                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2531                 for (i = 0, k = 0;
2532                         i < virtio_memory->regions[regionidx].memory_size -
2533                                 page_size;
2534                         i += page_size) {
2535                         cur_phys_addr = rte_mem_virt2phy(
2536                                         (void *)(uintptr_t)(vva_start + i));
2537                         next_phys_addr = rte_mem_virt2phy(
2538                                         (void *)(uintptr_t)(vva_start +
2539                                         i + page_size));
2540                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2541                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2542                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2543                                         k + page_size;
2544                                 mem_region_hpa[regionidx_hpa].memory_size
2545                                         = k + page_size;
2546                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2547                                         "phys addr end  [%d]:(%p)\n",
2548                                         regionidx_hpa,
2549                                         (void *)(uintptr_t)
2550                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2551                                 LOG_DEBUG(VHOST_CONFIG,
2552                                         "in fill_hpa_regions: guest phys addr "
2553                                         "size [%d]:(%p)\n",
2554                                         regionidx_hpa,
2555                                         (void *)(uintptr_t)
2556                                         (mem_region_hpa[regionidx_hpa].memory_size));
2557                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2558                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2559                                 ++regionidx_hpa;
2560                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2561                                         next_phys_addr -
2562                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2563                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2564                                         " phys addr start[%d]:(%p)\n",
2565                                         regionidx_hpa,
2566                                         (void *)(uintptr_t)
2567                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2568                                 LOG_DEBUG(VHOST_CONFIG,
2569                                         "in fill_hpa_regions: host  phys addr "
2570                                         "start[%d]:(%p)\n",
2571                                         regionidx_hpa,
2572                                         (void *)(uintptr_t)
2573                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2574                                 k = 0;
2575                         } else {
2576                                 k += page_size;
2577                         }
2578                 }
2579                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2580                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2581                         + k + page_size;
2582                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2583                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2584                         "[%d]:(%p)\n", regionidx_hpa,
2585                         (void *)(uintptr_t)
2586                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2587                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2588                         "[%d]:(%p)\n", regionidx_hpa,
2589                         (void *)(uintptr_t)
2590                         (mem_region_hpa[regionidx_hpa].memory_size));
2591                 ++regionidx_hpa;
2592         }
2593         return regionidx_hpa;
2594 }
2595
2596 /*
2597  * A new device is added to a data core. First the device is added to the
2598  * main linked list and then allocated to a specific data core.
2599  */
2600 static int
2601 new_device(struct virtio_net *dev)
2602 {
2603         struct virtio_net_data_ll *ll_dev;
2604         int lcore, core_add = 0;
2605         uint32_t device_num_min = num_devices;
2606         struct vhost_dev *vdev;
2607         uint32_t regionidx;
2608
2609         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2610         if (vdev == NULL) {
2611                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2612                         dev->device_fh);
2613                 return -1;
2614         }
2615         vdev->dev = dev;
2616         dev->priv = vdev;
2617
2618         if (zero_copy) {
2619                 vdev->nregions_hpa = dev->mem->nregions;
2620                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2621                         vdev->nregions_hpa
2622                                 += check_hpa_regions(
2623                                         dev->mem->regions[regionidx].guest_phys_address
2624                                         + dev->mem->regions[regionidx].address_offset,
2625                                         dev->mem->regions[regionidx].memory_size);
2626
2627                 }
2628
2629                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2630                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2631                         RTE_CACHE_LINE_SIZE);
2632                 if (vdev->regions_hpa == NULL) {
2633                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2634                         rte_free(vdev);
2635                         return -1;
2636                 }
2637
2638
2639                 if (fill_hpa_memory_regions(
2640                         vdev->regions_hpa, dev->mem
2641                         ) != vdev->nregions_hpa) {
2642
2643                         RTE_LOG(ERR, VHOST_CONFIG,
2644                                 "hpa memory regions number mismatch: "
2645                                 "[%d]\n", vdev->nregions_hpa);
2646                         rte_free(vdev->regions_hpa);
2647                         rte_free(vdev);
2648                         return -1;
2649                 }
2650         }
2651
2652
2653         /* Add device to main ll */
2654         ll_dev = get_data_ll_free_entry(&ll_root_free);
2655         if (ll_dev == NULL) {
2656                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2657                         "of %d devices per core has been reached\n",
2658                         dev->device_fh, num_devices);
2659                 if (vdev->regions_hpa)
2660                         rte_free(vdev->regions_hpa);
2661                 rte_free(vdev);
2662                 return -1;
2663         }
2664         ll_dev->vdev = vdev;
2665         add_data_ll_entry(&ll_root_used, ll_dev);
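        /*
         * Each device takes the first RX queue of its own VMDq pool: pools
         * are handed out in device_fh order, offset by the base queue index
         * of the VMDq region (vmdq_queue_base).
         */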
2666         vdev->vmdq_rx_q
2667                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2668
2669         if (zero_copy) {
2670                 uint32_t index = vdev->vmdq_rx_q;
2671                 uint32_t count_in_ring, i;
2672                 struct mbuf_table *tx_q;
2673
2674                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2675
2676                 LOG_DEBUG(VHOST_CONFIG,
2677                         "(%"PRIu64") in new_device: mbuf count in mempool "
2678                         "before attach is: %d\n",
2679                         dev->device_fh,
2680                         rte_mempool_count(vpool_array[index].pool));
2681                 LOG_DEBUG(VHOST_CONFIG,
2682                         "(%"PRIu64") in new_device: mbuf count in ring "
2683                         "before attach is: %d\n",
2684                         dev->device_fh, count_in_ring);
2685
2686                 /*
2687                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2688                  */
2689                 for (i = 0; i < count_in_ring; i++)
2690                         attach_rxmbuf_zcp(dev);
2691
2692                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2693                         "mempool after attach is: %d\n",
2694                         dev->device_fh,
2695                         rte_mempool_count(vpool_array[index].pool));
2696                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2697                         "ring after attach is: %d\n",
2698                         dev->device_fh,
2699                         rte_ring_count(vpool_array[index].ring));
2700
2701                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2702                 tx_q->txq_id = vdev->vmdq_rx_q;
2703
2704                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2705                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2706
2707                         LOG_DEBUG(VHOST_CONFIG,
2708                                 "(%"PRIu64") In new_device: Failed to start "
2709                                 "tx queue:%d\n",
2710                                 dev->device_fh, vdev->vmdq_rx_q);
2711
2712                         mbuf_destroy_zcp(vpool);
2713                         rte_free(vdev->regions_hpa);
2714                         rte_free(vdev);
2715                         return -1;
2716                 }
2717
2718                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2719                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2720
2721                         LOG_DEBUG(VHOST_CONFIG,
2722                                 "(%"PRIu64") In new_device: Failed to start "
2723                                 "rx queue:%d\n",
2724                                 dev->device_fh, vdev->vmdq_rx_q);
2725
2726                         /* Stop the TX queue. */
2727                         if (rte_eth_dev_tx_queue_stop(ports[0],
2728                                 vdev->vmdq_rx_q) != 0) {
2729                                 LOG_DEBUG(VHOST_CONFIG,
2730                                         "(%"PRIu64") In new_device: Failed to "
2731                                         "stop tx queue:%d\n",
2732                                         dev->device_fh, vdev->vmdq_rx_q);
2733                         }
2734
2735                         mbuf_destroy_zcp(vpool);
2736                         rte_free(vdev->regions_hpa);
2737                         rte_free(vdev);
2738                         return -1;
2739                 }
2740
2741         }
2742
2743         /* Reset the ready flag. */
2744         vdev->ready = DEVICE_MAC_LEARNING;
2745         vdev->remove = 0;
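        /*
         * Note: DEVICE_MAC_LEARNING is the initial state; the switching code
         * elsewhere in this file is expected to bind the device to its VMDq
         * queue once a source MAC address has been learned from the device's
         * first transmitted packet.
         */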
2746
2747         /* Find a suitable lcore to add the device. */
2748         RTE_LCORE_FOREACH_SLAVE(lcore) {
2749                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2750                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2751                         core_add = lcore;
2752                 }
2753         }
2754         /* Add device to lcore ll */
2755         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2756         if (ll_dev == NULL) {
2757                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2758                 vdev->ready = DEVICE_SAFE_REMOVE;
2759                 destroy_device(dev);
2760                 if (vdev->regions_hpa)
2761                         rte_free(vdev->regions_hpa);
2762                 rte_free(vdev);
2763                 return -1;
2764         }
2765         ll_dev->vdev = vdev;
2766         vdev->coreid = core_add;
2767
2768         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2769
2770         /* Initialize device stats */
2771         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2772
2773         /* Disable notifications. */
2774         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2775         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2776         lcore_info[vdev->coreid].lcore_ll->device_num++;
2777         dev->flags |= VIRTIO_DEV_RUNNING;
2778
2779         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2780
2781         return 0;
2782 }
2783
2784 /*
2785  * These callbacks allow devices to be added to the data core when
2786  * configuration has fully completed.
2787  */
2788 static const struct virtio_net_device_ops virtio_net_device_ops =
2789 {
2790         .new_device = new_device,
2791         .destroy_device = destroy_device,
2792 };
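/*
 * The vhost library calls new_device() once a guest's virtio device becomes
 * ready for processing and destroy_device() when it is removed; both hooks
 * are registered with rte_vhost_driver_callback_register() in main() below.
 */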
2793
2794 /*
2795  * Runs in its own thread and wakes up every 'enable_stats' seconds to
2796  * print statistics, if the user has enabled them.
2797  */
2798 static void
2799 print_stats(void)
2800 {
2801         struct virtio_net_data_ll *dev_ll;
2802         uint64_t tx_dropped, rx_dropped;
2803         uint64_t tx, tx_total, rx, rx_total;
2804         uint32_t device_fh;
2805         const char clr[] = { 27, '[', '2', 'J', '\0' };
2806         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
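        /*
         * ANSI escape sequences: "ESC[2J" clears the screen and "ESC[1;1H"
         * moves the cursor to the top-left corner.
         */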
2807
2808         while(1) {
2809                 sleep(enable_stats);
2810
2811                 /* Clear screen and move to top left */
2812                 printf("%s%s", clr, top_left);
2813
2814                 printf("\nDevice statistics ====================================");
2815
2816                 dev_ll = ll_root_used;
2817                 while (dev_ll != NULL) {
2818                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2819                         tx_total = dev_statistics[device_fh].tx_total;
2820                         tx = dev_statistics[device_fh].tx;
2821                         tx_dropped = tx_total - tx;
2822                         if (zero_copy == 0) {
2823                                 rx_total = rte_atomic64_read(
2824                                         &dev_statistics[device_fh].rx_total_atomic);
2825                                 rx = rte_atomic64_read(
2826                                         &dev_statistics[device_fh].rx_atomic);
2827                         } else {
2828                                 rx_total = dev_statistics[device_fh].rx_total;
2829                                 rx = dev_statistics[device_fh].rx;
2830                         }
2831                         rx_dropped = rx_total - rx;
2832
2833                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2834                                         "\nTX total:            %"PRIu64""
2835                                         "\nTX dropped:          %"PRIu64""
2836                                         "\nTX successful:               %"PRIu64""
2837                                         "\nRX total:            %"PRIu64""
2838                                         "\nRX dropped:          %"PRIu64""
2839                                         "\nRX successful:               %"PRIu64"",
2840                                         device_fh,
2841                                         tx_total,
2842                                         tx_dropped,
2843                                         tx,
2844                                         rx_total,
2845                                         rx_dropped,
2846                                         rx);
2847
2848                         dev_ll = dev_ll->next;
2849                 }
2850                 printf("\n======================================================\n");
2851         }
2852 }
2853
2854 static void
2855 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2856         char *ring_name, uint32_t nb_mbuf)
2857 {
2858         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
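        /*
         * The room size is handed to rte_pktmbuf_pool_init() as its opaque
         * argument, which this DPDK version interprets as the mbuf data room
         * size: each zero-copy mbuf holds exactly one descriptor's worth of
         * frame data plus headroom.
         */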
2859         vpool_array[index].pool
2860                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2861                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2862                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2863                 rte_pktmbuf_init, NULL, socket, 0);
2864         if (vpool_array[index].pool != NULL) {
2865                 vpool_array[index].ring
2866                         = rte_ring_create(ring_name,
2867                                 rte_align32pow2(nb_mbuf + 1),
2868                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2869                 if (likely(vpool_array[index].ring != NULL)) {
2870                         LOG_DEBUG(VHOST_CONFIG,
2871                                 "in setup_mempool_tbl: mbuf count in "
2872                                 "mempool is: %d\n",
2873                                 rte_mempool_count(vpool_array[index].pool));
2874                         LOG_DEBUG(VHOST_CONFIG,
2875                                 "in setup_mempool_tbl: mbuf count in "
2876                                 "ring   is: %d\n",
2877                                 rte_ring_count(vpool_array[index].ring));
2878                 } else {
2879                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2880                                 ring_name);
2881                 }
2882
2883                 /* Take the mbuf headroom into account. */
2884                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2885         } else {
2886                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2887         }
2888 }
2889
2890
2891 /*
2892  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2893  * device is also registered here to handle the IOCTLs.
2894  */
2895 int
2896 main(int argc, char *argv[])
2897 {
2898         struct rte_mempool *mbuf_pool = NULL;
2899         unsigned lcore_id, core_id = 0;
2900         unsigned nb_ports, valid_num_ports;
2901         int ret;
2902         uint8_t portid;
2903         uint16_t queue_id;
2904         static pthread_t tid;
2905
2906         /* init EAL */
2907         ret = rte_eal_init(argc, argv);
2908         if (ret < 0)
2909                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2910         argc -= ret;
2911         argv += ret;
2912
2913         /* parse app arguments */
2914         ret = us_vhost_parse_args(argc, argv);
2915         if (ret < 0)
2916                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2917
2918         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2919                 if (rte_lcore_is_enabled(lcore_id))
2920                         lcore_ids[core_id++] = lcore_id;
2921
2922         if (rte_lcore_count() > RTE_MAX_LCORE)
2923                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2924
2925         /* Set the number of switching cores available. */
2926         num_switching_cores = rte_lcore_count() - 1;
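        /*
         * One lcore is reserved for the master thread, which does not switch
         * packets; it ends up running the CUSE session via
         * rte_vhost_driver_session_start() at the end of main().
         */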
2927
2928         /* Get the number of physical ports. */
2929         nb_ports = rte_eth_dev_count();
2930         if (nb_ports > RTE_MAX_ETHPORTS)
2931                 nb_ports = RTE_MAX_ETHPORTS;
2932
2933         /*
2934          * Update the global variable num_ports and the global array ports[],
2935          * and derive valid_num_ports from the number of ports in the system.
2936          */
2937         valid_num_ports = check_ports_num(nb_ports);
2938
2939         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2940                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2941                         "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
2942                 return -1;
2943         }
2944
2945         if (zero_copy == 0) {
2946                 /* Create the mbuf pool. */
2947                 mbuf_pool = rte_mempool_create(
2948                                 "MBUF_POOL",
2949                                 NUM_MBUFS_PER_PORT
2950                                 * valid_num_ports,
2951                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2952                                 sizeof(struct rte_pktmbuf_pool_private),
2953                                 rte_pktmbuf_pool_init, NULL,
2954                                 rte_pktmbuf_init, NULL,
2955                                 rte_socket_id(), 0);
2956                 if (mbuf_pool == NULL)
2957                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2958
2959                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2960                         vpool_array[queue_id].pool = mbuf_pool;
2961
2962                 if (vm2vm_mode == VM2VM_HARDWARE) {
2963                         /* Enable VT loopback so the NIC's L2 switch forwards VM-to-VM traffic. */
2964                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2965                         LOG_DEBUG(VHOST_CONFIG,
2966                                 "Enable loop back for L2 switch in vmdq.\n");
2967                 }
2968         } else {
2969                 uint32_t nb_mbuf;
2970                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2971                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2972
2973                 nb_mbuf = num_rx_descriptor
2974                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2975                         + num_switching_cores * MAX_PKT_BURST;
2976
2977                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2978                         snprintf(pool_name, sizeof(pool_name),
2979                                 "rxmbuf_pool_%u", queue_id);
2980                         snprintf(ring_name, sizeof(ring_name),
2981                                 "rxmbuf_ring_%u", queue_id);
2982                         setup_mempool_tbl(rte_socket_id(), queue_id,
2983                                 pool_name, ring_name, nb_mbuf);
2984                 }
2985
2986                 nb_mbuf = num_tx_descriptor
2987                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2988                                 + num_switching_cores * MAX_PKT_BURST;
2989
2990                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2991                         snprintf(pool_name, sizeof(pool_name),
2992                                 "txmbuf_pool_%u", queue_id);
2993                         snprintf(ring_name, sizeof(ring_name),
2994                                 "txmbuf_ring_%u", queue_id);
2995                         setup_mempool_tbl(rte_socket_id(),
2996                                 (queue_id + MAX_QUEUES),
2997                                 pool_name, ring_name, nb_mbuf);
2998                 }
2999
3000                 if (vm2vm_mode == VM2VM_HARDWARE) {
3001                         /* Enable VT loopback so the NIC's L2 switch forwards VM-to-VM traffic. */
3002                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3003                         LOG_DEBUG(VHOST_CONFIG,
3004                                 "Enable loop back for L2 switch in vmdq.\n");
3005                 }
3006         }
3007         /* Set log level. */
3008         rte_set_log_level(LOG_LEVEL);
3009
3010         /* initialize all ports */
3011         for (portid = 0; portid < nb_ports; portid++) {
3012                 /* skip ports that are not enabled */
3013                 if ((enabled_port_mask & (1 << portid)) == 0) {
3014                         RTE_LOG(INFO, VHOST_PORT,
3015                                 "Skipping disabled port %d\n", portid);
3016                         continue;
3017                 }
3018                 if (port_init(portid) != 0)
3019                         rte_exit(EXIT_FAILURE,
3020                                 "Cannot initialize network ports\n");
3021         }
3022
3023         /* Initialise all linked lists. */
3024         if (init_data_ll() == -1)
3025                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3026
3027         /* Initialize device stats */
3028         memset(&dev_statistics, 0, sizeof(dev_statistics));
3029
3030         /* Enable stats if the user option is set. */
3031         if (enable_stats)
3032                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3033
3034         /* Launch all data cores. */
3035         if (zero_copy == 0) {
3036                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3037                         rte_eal_remote_launch(switch_worker,
3038                                 mbuf_pool, lcore_id);
3039                 }
3040         } else {
3041                 uint32_t count_in_mempool, index, i;
3042                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3043                         /* For all RX and TX queues. */
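                        /*
                         * Slots [0, MAX_QUEUES) of vpool_array hold the RX
                         * pools and slots [MAX_QUEUES, 2*MAX_QUEUES) the TX
                         * pools created by setup_mempool_tbl() above.
                         */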
3044                         count_in_mempool
3045                                 = rte_mempool_count(vpool_array[index].pool);
3046
3047                         /*
3048                          * Transfer all un-attached mbufs from vpool.pool
3049                          * to vpool.ring.
3050                          */
3051                         for (i = 0; i < count_in_mempool; i++) {
3052                                 struct rte_mbuf *mbuf
3053                                         = __rte_mbuf_raw_alloc(
3054                                                 vpool_array[index].pool);
3055                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3056                                                 (void *)mbuf);
3057                         }
3058
3059                         LOG_DEBUG(VHOST_CONFIG,
3060                                 "in main: mbuf count in mempool initially "
3061                                 "is: %d\n", count_in_mempool);
3062                         LOG_DEBUG(VHOST_CONFIG,
3063                                 "in main: mbuf count in ring initially is:"
3064                                 " %d\n",
3065                                 rte_ring_count(vpool_array[index].ring));
3066                 }
3067
3068                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3069                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3070                                 lcore_id);
3071         }
3072
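        /*
         * Clearing VIRTIO_NET_F_MRG_RXBUF removes mergeable RX buffers from
         * the feature set offered to guests, so the feature is never
         * negotiated when mergeable support is disabled on the command line.
         */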
3073         if (mergeable == 0)
3074                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3075
3076         /* Register CUSE device to handle IOCTLs. */
3077         ret = rte_vhost_driver_register((char *)&dev_basename);
3078         if (ret != 0)
3079                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3080
3081         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3082
3083         /* Start CUSE session. */
3084         rte_vhost_driver_session_start();
3085         return 0;
3086
3087 }
3088