mbuf: introduce indirect attached flag
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 512
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +             \
65                                                         (num_switching_cores*MAX_PKT_BURST) +                   \
66                                                         (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67                                                         (num_switching_cores*MBUF_CACHE_SIZE))
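/*
 * Worked example (illustrative): with num_switching_cores == 4 this is
 * (512 * 1024) + (4 * 32) + (4 * 512) + (4 * 128)
 * = 524288 + 128 + 2048 + 512 = 526976 mbufs per port. The dominant term
 * covers every RX descriptor of every queue being full at once.
 */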
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
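/*
 * Illustrative sizing, assuming a two-cache-line (128-byte) struct rte_mbuf
 * and the default RTE_PKTMBUF_HEADROOM of 128:
 * MBUF_SIZE = 2048 + 128 + 128 = 2304 bytes per mbuf.
 */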
71
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79         + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81
82 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
83 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
84
85 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
86 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
87
88 #define JUMBO_FRAME_MAX_SIZE    0x2600
89
90 /* State of virtio device. */
91 #define DEVICE_MAC_LEARNING 0
92 #define DEVICE_RX                       1
93 #define DEVICE_SAFE_REMOVE      2
94
95 /* Config_core_flag status definitions. */
96 #define REQUEST_DEV_REMOVAL 1
97 #define ACK_DEV_REMOVAL 0
98
99 /* Configurable number of RX/TX ring descriptors */
100 #define RTE_TEST_RX_DESC_DEFAULT 1024
101 #define RTE_TEST_TX_DESC_DEFAULT 512
102
103 /*
104  * These two macros need refining for the legacy and DPDK-based front ends:
105  * max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
106  * then adjusted to a power of 2.
107  */
108 /*
109  * For the legacy front end: 128 descriptors, half for virtio headers,
110  * the other half for mbufs.
111  */
112 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
113 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
114
115 /* Get first 4 bytes in mbuf headroom. */
116 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
117                 + sizeof(struct rte_mbuf)))
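/*
 * Example (illustrative): MBUF_HEADROOM_UINT32(m) = idx; stores a 32-bit
 * value in the first four headroom bytes, which sit immediately after the
 * struct rte_mbuf header. The zero copy path uses this slot to carry
 * per-mbuf state such as a descriptor index.
 */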
118
119 /* true if x is a power of 2 */
120 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
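/*
 * Example: POWEROF2(64) -> (63 & 64) == 0 -> true;
 *          POWEROF2(48) -> (47 & 48) == 32 -> false.
 * Note that 0 also passes this test.
 */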
121
122 #define INVALID_PORT_ID 0xFF
123
124 /* Max number of devices. Limited by vmdq. */
125 #define MAX_DEVICES 64
126
127 /* Size of buffers used for snprintfs. */
128 #define MAX_PRINT_BUFF 6072
129
130 /* Maximum character device basename size. */
131 #define MAX_BASENAME_SZ 10
132
133 /* Maximum long option length for option parsing. */
134 #define MAX_LONG_OPT_SZ 64
135
136 /* Used to compare MAC addresses. */
137 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
138
139 /* Number of descriptors per cacheline. */
140 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
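/*
 * Example: with a 64-byte cache line and the 16-byte struct vring_desc
 * (8-byte addr, 4-byte len, 2-byte flags, 2-byte next) this evaluates to 4.
 */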
141
142 #define MBUF_EXT_MEM(mb)   (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
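/*
 * Illustrative note: RTE_MBUF_FROM_BADDR() maps a buffer address back to the
 * mbuf header that would own it. For a direct mbuf the buffer sits right
 * after its own header, so the round trip yields the mbuf itself; when it
 * does not, the mbuf is attached to another mbuf's buffer (external memory).
 * The patch titled above ("mbuf: introduce indirect attached flag")
 * presumably replaces this address arithmetic with an explicit flag.
 */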
143
144 /* mask of enabled ports */
145 static uint32_t enabled_port_mask = 0;
146
147 /* Promiscuous mode */
148 static uint32_t promiscuous;
149
150 /* Number of switching cores enabled */
151 static uint32_t num_switching_cores = 0;
152
153 /* Number of devices/queues to support */
154 static uint32_t num_queues = 0;
155 static uint32_t num_devices;
156
157 /*
158  * Enable zero copy: packet buffers are DMAed directly to/from the HW
159  * descriptors. Disabled by default.
160  */
161 static uint32_t zero_copy;
162 static int mergeable;
163
164 /* Do VLAN strip on the host, enabled by default */
165 static uint32_t vlan_strip = 1;
166
167 /* Number of descriptors to use */
168 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
169 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
170
171 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
172 #define MAX_RING_DESC 4096
173
174 struct vpool {
175         struct rte_mempool *pool;
176         struct rte_ring *ring;
177         uint32_t buf_size;
178 } vpool_array[MAX_QUEUES+MAX_QUEUES];
179
180 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
181 typedef enum {
182         VM2VM_DISABLED = 0,
183         VM2VM_SOFTWARE = 1,
184         VM2VM_HARDWARE = 2,
185         VM2VM_LAST
186 } vm2vm_type;
187 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
188
189 /* The type of host physical address translated from guest physical address. */
190 typedef enum {
191         PHYS_ADDR_CONTINUOUS = 0,
192         PHYS_ADDR_CROSS_SUBREG = 1,
193         PHYS_ADDR_INVALID = 2,
194         PHYS_ADDR_LAST
195 } hpa_type;
196
197 /* Enable stats. */
198 static uint32_t enable_stats = 0;
199 /* Enable retries on RX. */
200 static uint32_t enable_retry = 1;
201 /* Specify the timeout (in microseconds) between retries on RX. */
202 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
203 /* Specify the number of retries on RX. */
204 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
205
206 /* Character device basename. Can be set by user. */
207 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
208
209 /* Empty vmdq configuration structure. Filled in programmatically. */
210 static struct rte_eth_conf vmdq_conf_default = {
211         .rxmode = {
212                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
213                 .split_hdr_size = 0,
214                 .header_split   = 0, /**< Header Split disabled */
215                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
216                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
217                 /*
218                  * This is necessary for 1G NICs such as the I350;
219                  * it fixes a bug where IPv4 forwarding in the guest
220                  * could not forward packets from one virtio dev to another.
221                  */
222                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
223                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
224                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
225         },
226
227         .txmode = {
228                 .mq_mode = ETH_MQ_TX_NONE,
229         },
230         .rx_adv_conf = {
231                 /*
232                  * should be overridden separately in code with
233                  * appropriate values
234                  */
235                 .vmdq_rx_conf = {
236                         .nb_queue_pools = ETH_8_POOLS,
237                         .enable_default_pool = 0,
238                         .default_pool = 0,
239                         .nb_pool_maps = 0,
240                         .pool_map = {{0, 0},},
241                 },
242         },
243 };
244
245 static unsigned lcore_ids[RTE_MAX_LCORE];
246 static uint8_t ports[RTE_MAX_ETHPORTS];
247 static unsigned num_ports = 0; /**< The number of ports specified in command line */
248 static uint16_t num_pf_queues, num_vmdq_queues;
249 static uint16_t vmdq_pool_base, vmdq_queue_base;
250 static uint16_t queues_per_pool;
251
252 static const uint16_t external_pkt_default_vlan_tag = 2000;
253 const uint16_t vlan_tags[] = {
254         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
255         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
256         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
257         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
258         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
259         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
260         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
261         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
262 };
263
264 /* ethernet addresses of ports */
265 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
266
267 /* heads for the main used and free linked lists for the data path. */
268 static struct virtio_net_data_ll *ll_root_used = NULL;
269 static struct virtio_net_data_ll *ll_root_free = NULL;
270
271 /* Array of data core structures containing information on individual core linked lists. */
272 static struct lcore_info lcore_info[RTE_MAX_LCORE];
273
274 /* Used for queueing bursts of TX packets. */
275 struct mbuf_table {
276         unsigned len;
277         unsigned txq_id;
278         struct rte_mbuf *m_table[MAX_PKT_BURST];
279 };
280
281 /* TX queue for each data core. */
282 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
283
284 /* TX queue for each virtio device for zero copy. */
285 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
286
287 /* Vlan header struct used to insert vlan tags on TX. */
288 struct vlan_ethhdr {
289         unsigned char   h_dest[ETH_ALEN];
290         unsigned char   h_source[ETH_ALEN];
291         __be16          h_vlan_proto;
292         __be16          h_vlan_TCI;
293         __be16          h_vlan_encapsulated_proto;
294 };
295
296 /* IPv4 Header */
297 struct ipv4_hdr {
298         uint8_t  version_ihl;           /**< version and header length */
299         uint8_t  type_of_service;       /**< type of service */
300         uint16_t total_length;          /**< length of packet */
301         uint16_t packet_id;             /**< packet ID */
302         uint16_t fragment_offset;       /**< fragmentation offset */
303         uint8_t  time_to_live;          /**< time to live */
304         uint8_t  next_proto_id;         /**< protocol ID */
305         uint16_t hdr_checksum;          /**< header checksum */
306         uint32_t src_addr;              /**< source address */
307         uint32_t dst_addr;              /**< destination address */
308 } __attribute__((__packed__));
309
310 /* Header lengths. */
311 #define VLAN_HLEN       4
312 #define VLAN_ETH_HLEN   18
313
314 /* Per-device statistics struct */
315 struct device_statistics {
316         uint64_t tx_total;
317         rte_atomic64_t rx_total_atomic;
318         uint64_t rx_total;
319         uint64_t tx;
320         rte_atomic64_t rx_atomic;
321         uint64_t rx;
322 } __rte_cache_aligned;
323 struct device_statistics dev_statistics[MAX_DEVICES];
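/*
 * Note (inferred from the data path below): the rx counters are atomic
 * because any core forwarding to a device, e.g. the VM2VM path in
 * virtio_tx_local(), may update them concurrently; the tx counters are
 * only touched by the core that owns the transmitting device.
 */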
324
325 /*
326  * Builds up the correct configuration for VMDQ VLAN pool map
327  * according to the pool & queue limits.
328  */
329 static inline int
330 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
331 {
332         struct rte_eth_vmdq_rx_conf conf;
333         struct rte_eth_vmdq_rx_conf *def_conf =
334                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
335         unsigned i;
336
337         memset(&conf, 0, sizeof(conf));
338         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
339         conf.nb_pool_maps = num_devices;
340         conf.enable_loop_back = def_conf->enable_loop_back;
341         conf.rx_mode = def_conf->rx_mode;
342
343         for (i = 0; i < conf.nb_pool_maps; i++) {
344                 conf.pool_map[i].vlan_id = vlan_tags[i];
345                 conf.pool_map[i].pools = (1UL << i);
346         }
347
348         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
349         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
350                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
351         return 0;
352 }
353
354 /*
355  * Validate the device number against the max pool number obtained from
356  * dev_info. If the device number is invalid, log an error message and
357  * return -1. Each device must have its own pool.
358  */
359 static inline int
360 validate_num_devices(uint32_t max_nb_devices)
361 {
362         if (num_devices > max_nb_devices) {
363                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
364                 return -1;
365         }
366         return 0;
367 }
368
369 /*
370  * Initialises a given port using global settings and with the rx buffers
371  * coming from the mbuf_pool passed as parameter
372  */
373 static inline int
374 port_init(uint8_t port)
375 {
376         struct rte_eth_dev_info dev_info;
377         struct rte_eth_conf port_conf;
378         struct rte_eth_rxconf *rxconf;
379         struct rte_eth_txconf *txconf;
380         int16_t rx_rings, tx_rings;
381         uint16_t rx_ring_size, tx_ring_size;
382         int retval;
383         uint16_t q;
384
385         /* The max pool number from dev_info is used to validate the pool number specified on the command line */
386         rte_eth_dev_info_get(port, &dev_info);
387
388         if (dev_info.max_rx_queues > MAX_QUEUES) {
389                 rte_exit(EXIT_FAILURE,
390                         "please define MAX_QUEUES no less than %u in %s\n",
391                         dev_info.max_rx_queues, __FILE__);
392         }
393
394         rxconf = &dev_info.default_rxconf;
395         txconf = &dev_info.default_txconf;
396         rxconf->rx_drop_en = 1;
397
398         /* Enable vlan offload */
399         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
400
401         /*
402          * Zero copy defers queue RX/TX start to the time when guest
403          * finishes its startup and packet buffers from that guest are
404          * available.
405          */
406         if (zero_copy) {
407                 rxconf->rx_deferred_start = 1;
408                 rxconf->rx_drop_en = 0;
409                 txconf->tx_deferred_start = 1;
410         }
411
412         /* Configure the number of supported virtio devices based on VMDQ limits */
413         num_devices = dev_info.max_vmdq_pools;
414
415         if (zero_copy) {
416                 rx_ring_size = num_rx_descriptor;
417                 tx_ring_size = num_tx_descriptor;
418                 tx_rings = dev_info.max_tx_queues;
419         } else {
420                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
421                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
422                 tx_rings = (uint16_t)rte_lcore_count();
423         }
424
425         retval = validate_num_devices(MAX_DEVICES);
426         if (retval < 0)
427                 return retval;
428
429         /* Get port configuration. */
430         retval = get_eth_conf(&port_conf, num_devices);
431         if (retval < 0)
432                 return retval;
433         /* NIC queues are divided into pf queues and vmdq queues.  */
434         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
435         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
436         num_vmdq_queues = num_devices * queues_per_pool;
437         num_queues = num_pf_queues + num_vmdq_queues;
438         vmdq_queue_base = dev_info.vmdq_queue_base;
439         vmdq_pool_base  = dev_info.vmdq_pool_base;
440         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
441                 num_pf_queues, num_devices, queues_per_pool);
442
443         if (port >= rte_eth_dev_count()) return -1;
444
445         rx_rings = (uint16_t)dev_info.max_rx_queues;
446         /* Configure ethernet device. */
447         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
448         if (retval != 0)
449                 return retval;
450
451         /* Setup the queues. */
452         for (q = 0; q < rx_rings; q ++) {
453                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
454                                                 rte_eth_dev_socket_id(port),
455                                                 rxconf,
456                                                 vpool_array[q].pool);
457                 if (retval < 0)
458                         return retval;
459         }
460         for (q = 0; q < tx_rings; q ++) {
461                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
462                                                 rte_eth_dev_socket_id(port),
463                                                 txconf);
464                 if (retval < 0)
465                         return retval;
466         }
467
468         /* Start the device. */
469         retval  = rte_eth_dev_start(port);
470         if (retval < 0) {
471                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
472                 return retval;
473         }
474
475         if (promiscuous)
476                 rte_eth_promiscuous_enable(port);
477
478         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
479         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
480         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
481                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
482                         (unsigned)port,
483                         vmdq_ports_eth_addr[port].addr_bytes[0],
484                         vmdq_ports_eth_addr[port].addr_bytes[1],
485                         vmdq_ports_eth_addr[port].addr_bytes[2],
486                         vmdq_ports_eth_addr[port].addr_bytes[3],
487                         vmdq_ports_eth_addr[port].addr_bytes[4],
488                         vmdq_ports_eth_addr[port].addr_bytes[5]);
489
490         return 0;
491 }
492
493 /*
494  * Set character device basename.
495  */
496 static int
497 us_vhost_parse_basename(const char *q_arg)
498 {
499         /* Validate the length, then copy the basename. */
500
501         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
502                 return -1;
503         else
504                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
505
506         return 0;
507 }
508
509 /*
510  * Parse the portmask provided at run time.
511  */
512 static int
513 parse_portmask(const char *portmask)
514 {
515         char *end = NULL;
516         unsigned long pm;
517
518         errno = 0;
519
520         /* parse hexadecimal string */
521         pm = strtoul(portmask, &end, 16);
522         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
523                 return -1;
524
525         if (pm == 0)
526                 return -1;
527
528         return pm;
529
530 }
531
532 /*
533  * Parse num options at run time.
534  */
535 static int
536 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
537 {
538         char *end = NULL;
539         unsigned long num;
540
541         errno = 0;
542
543         /* parse unsigned int string */
544         num = strtoul(q_arg, &end, 10);
545         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
546                 return -1;
547
548         if (num > max_valid_value)
549                 return -1;
550
551         return num;
552
553 }
554
555 /*
556  * Display usage
557  */
558 static void
559 us_vhost_usage(const char *prgname)
560 {
561         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
562         "               --vm2vm [0|1|2]\n"
563         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
564         "               --dev-basename <name>\n"
565         "               --nb-devices ND\n"
566         "               -p PORTMASK: Set mask for ports to be used by application\n"
567         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
568         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
569         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if rx retries are enabled\n"
570         "               --rx-retry-num [0-N]: the number of retries on rx. This only takes effect if rx retries are enabled\n"
571         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
572         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
573         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
574         "               --dev-basename: The basename to be used for the character device.\n"
575         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
576                         "zero copy\n"
577         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
578                         "used only when zero copy is enabled.\n"
579         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
580                         "used only when zero copy is enabled.\n",
581                prgname);
582 }
583
584 /*
585  * Parse the arguments given in the command line of the application.
586  */
587 static int
588 us_vhost_parse_args(int argc, char **argv)
589 {
590         int opt, ret;
591         int option_index;
592         unsigned i;
593         const char *prgname = argv[0];
594         static struct option long_option[] = {
595                 {"vm2vm", required_argument, NULL, 0},
596                 {"rx-retry", required_argument, NULL, 0},
597                 {"rx-retry-delay", required_argument, NULL, 0},
598                 {"rx-retry-num", required_argument, NULL, 0},
599                 {"mergeable", required_argument, NULL, 0},
600                 {"vlan-strip", required_argument, NULL, 0},
601                 {"stats", required_argument, NULL, 0},
602                 {"dev-basename", required_argument, NULL, 0},
603                 {"zero-copy", required_argument, NULL, 0},
604                 {"rx-desc-num", required_argument, NULL, 0},
605                 {"tx-desc-num", required_argument, NULL, 0},
606                 {NULL, 0, 0, 0},
607         };
608
609         /* Parse command line */
610         while ((opt = getopt_long(argc, argv, "p:P",
611                         long_option, &option_index)) != EOF) {
612                 switch (opt) {
613                 /* Portmask */
614                 case 'p':
615                         enabled_port_mask = parse_portmask(optarg);
616                         if (enabled_port_mask == (uint32_t)-1) {
617                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
618                                 us_vhost_usage(prgname);
619                                 return -1;
620                         }
621                         break;
622
623                 case 'P':
624                         promiscuous = 1;
625                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
626                                 ETH_VMDQ_ACCEPT_BROADCAST |
627                                 ETH_VMDQ_ACCEPT_MULTICAST;
628                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
629
630                         break;
631
632                 case 0:
633                         /* Enable/disable vm2vm comms. */
634                         if (!strncmp(long_option[option_index].name, "vm2vm",
635                                 MAX_LONG_OPT_SZ)) {
636                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
637                                 if (ret == -1) {
638                                         RTE_LOG(INFO, VHOST_CONFIG,
639                                                 "Invalid argument for "
640                                                 "vm2vm [0|1|2]\n");
641                                         us_vhost_usage(prgname);
642                                         return -1;
643                                 } else {
644                                         vm2vm_mode = (vm2vm_type)ret;
645                                 }
646                         }
647
648                         /* Enable/disable retries on RX. */
649                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
650                                 ret = parse_num_opt(optarg, 1);
651                                 if (ret == -1) {
652                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
653                                         us_vhost_usage(prgname);
654                                         return -1;
655                                 } else {
656                                         enable_retry = ret;
657                                 }
658                         }
659
660                         /* Specify the retry delay time (in microseconds) on RX. */
661                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
662                                 ret = parse_num_opt(optarg, INT32_MAX);
663                                 if (ret == -1) {
664                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
665                                         us_vhost_usage(prgname);
666                                         return -1;
667                                 } else {
668                                         burst_rx_delay_time = ret;
669                                 }
670                         }
671
672                         /* Specify the retries number on RX. */
673                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
674                                 ret = parse_num_opt(optarg, INT32_MAX);
675                                 if (ret == -1) {
676                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
677                                         us_vhost_usage(prgname);
678                                         return -1;
679                                 } else {
680                                         burst_rx_retry_num = ret;
681                                 }
682                         }
683
684                         /* Enable/disable RX mergeable buffers. */
685                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
686                                 ret = parse_num_opt(optarg, 1);
687                                 if (ret == -1) {
688                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
689                                         us_vhost_usage(prgname);
690                                         return -1;
691                                 } else {
692                                         mergeable = !!ret;
693                                         if (ret) {
694                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
695                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
696                                                         = JUMBO_FRAME_MAX_SIZE;
697                                         }
698                                 }
699                         }
700
701                         /* Enable/disable RX VLAN strip on host. */
702                         if (!strncmp(long_option[option_index].name,
703                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
704                                 ret = parse_num_opt(optarg, 1);
705                                 if (ret == -1) {
706                                         RTE_LOG(INFO, VHOST_CONFIG,
707                                                 "Invalid argument for VLAN strip [0|1]\n");
708                                         us_vhost_usage(prgname);
709                                         return -1;
710                                 } else {
711                                         vlan_strip = !!ret;
712                                         vmdq_conf_default.rxmode.hw_vlan_strip =
713                                                 vlan_strip;
714                                 }
715                         }
716
717                         /* Enable/disable stats. */
718                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
719                                 ret = parse_num_opt(optarg, INT32_MAX);
720                                 if (ret == -1) {
721                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
722                                         us_vhost_usage(prgname);
723                                         return -1;
724                                 } else {
725                                         enable_stats = ret;
726                                 }
727                         }
728
729                         /* Set character device basename. */
730                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
731                                 if (us_vhost_parse_basename(optarg) == -1) {
732                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
733                                         us_vhost_usage(prgname);
734                                         return -1;
735                                 }
736                         }
737
738                         /* Enable/disable rx/tx zero copy. */
739                         if (!strncmp(long_option[option_index].name,
740                                 "zero-copy", MAX_LONG_OPT_SZ)) {
741                                 ret = parse_num_opt(optarg, 1);
742                                 if (ret == -1) {
743                                         RTE_LOG(INFO, VHOST_CONFIG,
744                                                 "Invalid argument"
745                                                 " for zero-copy [0|1]\n");
746                                         us_vhost_usage(prgname);
747                                         return -1;
748                                 } else
749                                         zero_copy = ret;
750
751                                 if (zero_copy) {
752 #ifdef RTE_MBUF_REFCNT
753                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
754                                         "zero copy vhost APP, please "
755                                         "disable RTE_MBUF_REFCNT\n"
756                                         "in config file and then rebuild DPDK "
757                                         "core lib!\n"
758                                         "Otherwise please disable zero copy "
759                                         "flag in command line!\n");
760                                         return -1;
761 #endif
762                                 }
763                         }
764
765                         /* Specify the descriptor number on RX. */
766                         if (!strncmp(long_option[option_index].name,
767                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
768                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
769                                 if ((ret == -1) || (!POWEROF2(ret))) {
770                                         RTE_LOG(INFO, VHOST_CONFIG,
771                                         "Invalid argument for rx-desc-num[0-N],"
772                                         "power of 2 required.\n");
773                                         us_vhost_usage(prgname);
774                                         return -1;
775                                 } else {
776                                         num_rx_descriptor = ret;
777                                 }
778                         }
779
780                         /* Specify the descriptor number on TX. */
781                         if (!strncmp(long_option[option_index].name,
782                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
783                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
784                                 if ((ret == -1) || (!POWEROF2(ret))) {
785                                         RTE_LOG(INFO, VHOST_CONFIG,
786                                         "Invalid argument for tx-desc-num [0-N],"
787                                         "power of 2 required.\n");
788                                         us_vhost_usage(prgname);
789                                         return -1;
790                                 } else {
791                                         num_tx_descriptor = ret;
792                                 }
793                         }
794
795                         break;
796
797                         /* Invalid option - print options. */
798                 default:
799                         us_vhost_usage(prgname);
800                         return -1;
801                 }
802         }
803
804         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
805                 if (enabled_port_mask & (1 << i))
806                         ports[num_ports++] = (uint8_t)i;
807         }
808
809         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
810                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
811                         "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
812                 return -1;
813         }
814
815         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
816                 RTE_LOG(INFO, VHOST_PORT,
817                         "Vhost zero copy doesn't support software vm2vm; "
818                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
819                 return -1;
820         }
821
822         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
823                 RTE_LOG(INFO, VHOST_PORT,
824                         "Vhost zero copy doesn't support jumbo frames; "
825                         "please specify '--mergeable 0' to disable the "
826                         "mergeable feature.\n");
827                 return -1;
828         }
829
830         return 0;
831 }
832
833 /*
834  * Update the global variable num_ports and the ports array according to the
835  * number of system ports, and return the number of valid ports.
836  */
837 static unsigned check_ports_num(unsigned nb_ports)
838 {
839         unsigned valid_num_ports = num_ports;
840         unsigned portid;
841
842         if (num_ports > nb_ports) {
843                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
844                         num_ports, nb_ports);
845                 num_ports = nb_ports;
846         }
847
848         for (portid = 0; portid < num_ports; portid ++) {
849                 if (ports[portid] >= nb_ports) {
850                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
851                                 ports[portid], (nb_ports - 1));
852                         ports[portid] = INVALID_PORT_ID;
853                         valid_num_ports--;
854                 }
855         }
856         return valid_num_ports;
857 }
858
859 /*
860  * Macro to print out packet contents. Wrapped in debug define so that the
861  * data path is not affected when debug is disabled.
862  */
863 #ifdef DEBUG
864 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
865         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
866         unsigned int index;                                                                                                                                                                                             \
867         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
868                                                                                                                                                                                                                                         \
869         if ((header))                                                                                                                                                                                                   \
870                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
871         else                                                                                                                                                                                                                    \
872                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
873         for (index = 0; index < (size); index++) {                                                                                                                                              \
874                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
875                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
876         }                                                                                                                                                                                                                               \
877         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
878                                                                                                                                                                                                                                         \
879         LOG_DEBUG(VHOST_DATA, "%s", packet);                                                                                                                                                                    \
880 } while(0)
881 #else
882 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
883 #endif
884
885 /*
886  * Function to convert guest physical addresses to vhost physical addresses.
887  * This is used to convert virtio buffer addresses.
888  */
889 static inline uint64_t __attribute__((always_inline))
890 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
891         uint32_t buf_len, hpa_type *addr_type)
892 {
893         struct virtio_memory_regions_hpa *region;
894         uint32_t regionidx;
895         uint64_t vhost_pa = 0;
896
897         *addr_type = PHYS_ADDR_INVALID;
898
899         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
900                 region = &vdev->regions_hpa[regionidx];
901                 if ((guest_pa >= region->guest_phys_address) &&
902                         (guest_pa <= region->guest_phys_address_end)) {
903                         vhost_pa = region->host_phys_addr_offset + guest_pa;
904                         if (likely((guest_pa + buf_len - 1)
905                                 <= region->guest_phys_address_end))
906                                 *addr_type = PHYS_ADDR_CONTINUOUS;
907                         else
908                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
909                         break;
910                 }
911         }
912
913         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
914                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
915                 (void *)(uintptr_t)vhost_pa);
916
917         return vhost_pa;
918 }
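/*
 * Typical usage (illustrative sketch, not verbatim from this file):
 * translate a guest buffer before use and handle buffers that span
 * host memory sub-regions:
 *
 *     hpa_type t;
 *     uint64_t hpa = gpa_to_hpa(vdev, desc->addr, desc->len, &t);
 *     if (t != PHYS_ADDR_CONTINUOUS)
 *             ;  // split the copy, or drop the buffer
 */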
919
920 /*
921  * Compares a packet destination MAC address to a device MAC address.
922  */
923 static inline int __attribute__((always_inline))
924 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
925 {
926         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
927 }
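/*
 * The comparison above loads 8 bytes from each address and masks off the two
 * bytes beyond the 6-byte MAC: MAC_ADDR_CMP keeps the low 48 bits, which on
 * a little-endian CPU are the first six bytes in memory. A portable sketch
 * of the same test (hypothetical helper, not part of this file):
 *
 *     static inline int
 *     ether_addr_cmp_portable(const struct ether_addr *ea,
 *                             const struct ether_addr *eb)
 *     {
 *             return memcmp(ea->addr_bytes, eb->addr_bytes,
 *                           ETHER_ADDR_LEN) == 0;
 *     }
 */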
928
929 /*
930  * This function learns the MAC address of the device and registers this along with a
931  * vlan tag to a VMDQ.
932  */
933 static int
934 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
935 {
936         struct ether_hdr *pkt_hdr;
937         struct virtio_net_data_ll *dev_ll;
938         struct virtio_net *dev = vdev->dev;
939         int i, ret;
940
941         /* Learn MAC address of guest device from packet */
942         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
943
944         dev_ll = ll_root_used;
945
946         while (dev_ll != NULL) {
947                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
948                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
949                         return -1;
950                 }
951                 dev_ll = dev_ll->next;
952         }
953
954         for (i = 0; i < ETHER_ADDR_LEN; i++)
955                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
956
957         /* vlan_tag currently uses the device_id. */
958         vdev->vlan_tag = vlan_tags[dev->device_fh];
959
960         /* Print out VMDQ registration info. */
961         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
962                 dev->device_fh,
963                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
964                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
965                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
966                 vdev->vlan_tag);
967
968         /* Register the MAC address. */
969         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
970                                 (uint32_t)dev->device_fh + vmdq_pool_base);
971         if (ret)
972                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
973                                         dev->device_fh);
974
975         /* Enable stripping of the vlan tag as we handle routing. */
976         if (vlan_strip)
977                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
978                         (uint16_t)vdev->vmdq_rx_q, 1);
979
980         /* Set device as ready for RX. */
981         vdev->ready = DEVICE_RX;
982
983         return 0;
984 }
985
986 /*
987  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
988  * queue before disabling RX on the device.
989  */
990 static inline void
991 unlink_vmdq(struct vhost_dev *vdev)
992 {
993         unsigned i = 0;
994         unsigned rx_count;
995         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
996
997         if (vdev->ready == DEVICE_RX) {
998                 /*clear MAC and VLAN settings*/
999                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1000                 for (i = 0; i < 6; i++)
1001                         vdev->mac_address.addr_bytes[i] = 0;
1002
1003                 vdev->vlan_tag = 0;
1004
1005                 /*Clear out the receive buffers*/
1006                 rx_count = rte_eth_rx_burst(ports[0],
1007                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1008
1009                 while (rx_count) {
1010                         for (i = 0; i < rx_count; i++)
1011                                 rte_pktmbuf_free(pkts_burst[i]);
1012
1013                         rx_count = rte_eth_rx_burst(ports[0],
1014                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1015                 }
1016
1017                 vdev->ready = DEVICE_MAC_LEARNING;
1018         }
1019 }
1020
1021 /*
1022  * Check if the packet destination MAC address is for a local device. If so, put
1023  * the packet on that device's RX queue. If not, return.
1024  */
1025 static inline int __attribute__((always_inline))
1026 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1027 {
1028         struct virtio_net_data_ll *dev_ll;
1029         struct ether_hdr *pkt_hdr;
1030         uint64_t ret = 0;
1031         struct virtio_net *dev = vdev->dev;
1032         struct virtio_net *tdev; /* destination virtio device */
1033
1034         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1035
1036         /*get the used devices list*/
1037         dev_ll = ll_root_used;
1038
1039         while (dev_ll != NULL) {
1040                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1041                                           &dev_ll->vdev->mac_address)) {
1042
1043                         /* Drop the packet if the TX packet is destined for the TX device. */
1044                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1045                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1046                                                         dev->device_fh);
1047                                 return 0;
1048                         }
1049                         tdev = dev_ll->vdev->dev;
1050
1051
1052                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1053
1054                         if (unlikely(dev_ll->vdev->remove)) {
1055                                 /*drop the packet if the device is marked for removal*/
1056                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1057                         } else {
1058                                 /*send the packet to the local virtio device*/
1059                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1060                                 if (enable_stats) {
1061                                         rte_atomic64_add(
1062                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1063                                         1);
1064                                         rte_atomic64_add(
1065                                         &dev_statistics[tdev->device_fh].rx_atomic,
1066                                         ret);
1067                                         dev_statistics[tdev->device_fh].tx_total++;
1068                                         dev_statistics[tdev->device_fh].tx += ret;
1069                                 }
1070                         }
1071
1072                         return 0;
1073                 }
1074                 dev_ll = dev_ll->next;
1075         }
1076
1077         return -1;
1078 }
1079
1080 /*
1081  * Check if the destination MAC of a packet is one local VM,
1082  * and get its vlan tag, and offset if it is.
1083  */
1084 static inline int __attribute__((always_inline))
1085 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1086         uint32_t *offset, uint16_t *vlan_tag)
1087 {
1088         struct virtio_net_data_ll *dev_ll = ll_root_used;
1089         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1090
1091         while (dev_ll != NULL) {
1092                 if ((dev_ll->vdev->ready == DEVICE_RX)
1093                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1094                 &dev_ll->vdev->mac_address)) {
1095                         /*
1096                          * Drop the packet if the TX packet is
1097                          * destined for the TX device.
1098                          */
1099                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1100                                 LOG_DEBUG(VHOST_DATA,
1101                                 "(%"PRIu64") TX: Source and destination"
1102                                 " MAC addresses are the same. Dropping "
1103                                 "packet.\n",
1104                                 dev_ll->vdev->dev->device_fh);
1105                                 return -1;
1106                         }
1107
1108                         /*
1109                          * HW VLAN strip reduces the packet length by the
1110                          * length of the VLAN tag, so restore the packet
1111                          * length by adding it back.
1112                          */
1113                         *offset = VLAN_HLEN;
1114                         *vlan_tag =
1115                         (uint16_t)
1116                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1117
1118                         LOG_DEBUG(VHOST_DATA,
1119                         "(%"PRIu64") TX: pkt to local VM device id:"
1120                         "(%"PRIu64") vlan tag: %d.\n",
1121                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1122                         (int)*vlan_tag); /* vlan_tag is a pointer; print its value */
1123
1124                         break;
1125                 }
1126                 dev_ll = dev_ll->next;
1127         }
1128         return 0;
1129 }
1130
1131 /*
1132  * This function routes the TX packet to the correct interface. This may be a local device
1133  * or the physical port.
1134  */
1135 static inline void __attribute__((always_inline))
1136 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1137 {
1138         struct mbuf_table *tx_q;
1139         struct rte_mbuf **m_table;
1140         unsigned len, ret, offset = 0;
1141         const uint16_t lcore_id = rte_lcore_id();
1142         struct virtio_net *dev = vdev->dev;
1143         struct ether_hdr *nh;
1144
1145         /*check if destination is local VM*/
1146         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1147                 rte_pktmbuf_free(m);
1148                 return;
1149         }
1150
1151         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1152                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1153                         rte_pktmbuf_free(m);
1154                         return;
1155                 }
1156         }
1157
1158         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1159
1160         /*Add packet to the port tx queue*/
1161         tx_q = &lcore_tx_queue[lcore_id];
1162         len = tx_q->len;
1163
1164         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1165         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1166                 /* Guest has inserted the vlan tag. */
1167                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1168                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1169                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1170                         (vh->vlan_tci != vlan_tag_be))
1171                         vh->vlan_tci = vlan_tag_be;
1172         } else {
1173                 m->ol_flags = PKT_TX_VLAN_PKT;
1174
1175                 /*
1176                  * Find the right seg to adjust the data len when offset is
1177                  * bigger than tail room size.
1178                  */
1179                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1180                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1181                                 m->data_len += offset;
1182                         else {
1183                                 struct rte_mbuf *seg = m;
1184
1185                                 while ((seg->next != NULL) &&
1186                                         (offset > rte_pktmbuf_tailroom(seg)))
1187                                         seg = seg->next;
1188
1189                                 seg->data_len += offset;
1190                         }
1191                         m->pkt_len += offset;
1192                 }
1193
1194                 m->vlan_tci = vlan_tag;
1195         }
1196
1197         tx_q->m_table[len] = m;
1198         len++;
1199         if (enable_stats) {
1200                 dev_statistics[dev->device_fh].tx_total++;
1201                 dev_statistics[dev->device_fh].tx++;
1202         }
1203
1204         if (unlikely(len == MAX_PKT_BURST)) {
1205                 m_table = (struct rte_mbuf **)tx_q->m_table;
1206                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1207                 /* Free any buffers not handled by TX and update the port stats. */
1208                 if (unlikely(ret < len)) {
1209                         do {
1210                                 rte_pktmbuf_free(m_table[ret]);
1211                         } while (++ret < len);
1212                 }
1213
1214                 len = 0;
1215         }
1216
1217         tx_q->len = len;
1218         return;
1219 }
1220 /*
1221  * This function is called by each data core. It handles all RX/TX registered with the
1222  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1223  * with all devices in the main linked list.
1224  */
1225 static int
1226 switch_worker(void *arg)
1227 {
1228         struct rte_mempool *mbuf_pool = arg;
1229         struct virtio_net *dev = NULL;
1230         struct vhost_dev *vdev = NULL;
1231         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1232         struct virtio_net_data_ll *dev_ll;
1233         struct mbuf_table *tx_q;
1234         volatile struct lcore_ll_info *lcore_ll;
1235         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1236         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1237         unsigned ret, i;
1238         const uint16_t lcore_id = rte_lcore_id();
1239         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1240         uint16_t rx_count = 0;
1241         uint16_t tx_count;
1242         uint32_t retry = 0;
1243
1244         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1245         lcore_ll = lcore_info[lcore_id].lcore_ll;
1246         prev_tsc = 0;
1247
1248         tx_q = &lcore_tx_queue[lcore_id];
1249         for (i = 0; i < num_cores; i ++) {
1250                 if (lcore_ids[i] == lcore_id) {
1251                         tx_q->txq_id = i;
1252                         break;
1253                 }
1254         }
1255
1256         while(1) {
1257                 cur_tsc = rte_rdtsc();
1258                 /*
1259                  * TX burst queue drain
1260                  */
1261                 diff_tsc = cur_tsc - prev_tsc;
1262                 if (unlikely(diff_tsc > drain_tsc)) {
1263
1264                         if (tx_q->len) {
1265                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1266
1267                                 /* Tx any packets in the queue */
1268                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1269                                                                            (struct rte_mbuf **)tx_q->m_table,
1270                                                                            (uint16_t)tx_q->len);
1271                                 if (unlikely(ret < tx_q->len)) {
1272                                         do {
1273                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1274                                         } while (++ret < tx_q->len);
1275                                 }
1276
1277                                 tx_q->len = 0;
1278                         }
1279
1280                         prev_tsc = cur_tsc;
1281
1282                 }
1283
1284                 rte_prefetch0(lcore_ll->ll_root_used);
1285                 /*
1286                  * Inform the configuration core that we have exited the linked list and that no devices are
1287                  * in use if requested.
1288                  */
1289                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1290                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1291
1292                 /*
1293                  * Process devices
1294                  */
1295                 dev_ll = lcore_ll->ll_root_used;
1296
1297                 while (dev_ll != NULL) {
1298                         /*get virtio device ID*/
1299                         vdev = dev_ll->vdev;
1300                         dev = vdev->dev;
1301
1302                         if (unlikely(vdev->remove)) {
1303                                 dev_ll = dev_ll->next;
1304                                 unlink_vmdq(vdev);
1305                                 vdev->ready = DEVICE_SAFE_REMOVE;
1306                                 continue;
1307                         }
1308                         if (likely(vdev->ready == DEVICE_RX)) {
1309                                 /*Handle guest RX*/
1310                                 rx_count = rte_eth_rx_burst(ports[0],
1311                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1312
1313                                 if (rx_count) {
1314                                         /*
1315                                         * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1316                                         * Here MAX_PKT_BURST must be less than the virtio queue size.
1317                                         */
1318                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1319                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1320                                                         rte_delay_us(burst_rx_delay_time);
1321                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1322                                                                 break;
1323                                                 }
1324                                         }
1325                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1326                                         if (enable_stats) {
1327                                                 rte_atomic64_add(
1328                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1329                                                 rx_count);
1330                                                 rte_atomic64_add(
1331                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1332                                         }
1333                                         while (likely(rx_count)) {
1334                                                 rx_count--;
1335                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1336                                         }
1337
1338                                 }
1339                         }
1340
1341                         if (likely(!vdev->remove)) {
1342                                 /* Handle guest TX*/
1343                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1344                                 /* If this is the first received packet we need to learn the MAC and set up VMDQ */
1345                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1346                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1347                                                 while (tx_count)
1348                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1349                                         }
1350                                 }
1351                                 while (tx_count)
1352                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1353                         }
1354
1355                         /*move to the next device in the list*/
1356                         dev_ll = dev_ll->next;
1357                 }
1358         }
1359
1360         return 0;
1361 }
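/*
 * Sketch of the drain period arithmetic used by switch_worker() above (a
 * hypothetical helper, not called anywhere): convert a microsecond period
 * into TSC cycles, rounding the cycles-per-microsecond figure up so that
 * short periods never truncate to zero.
 */
static inline uint64_t __attribute__((unused))
us_to_tsc_cycles_sketch(uint64_t us)
{
        return (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * us;
}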
1362
1363 /*
1364  * This function gets the number of available ring entries for zero copy rx.
1365  * Only one thread will call this function for a particular virtio device,
1366  * so it is designed as a non-thread-safe function.
1367  */
1368 static inline uint32_t __attribute__((always_inline))
1369 get_available_ring_num_zcp(struct virtio_net *dev)
1370 {
1371         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1372         uint16_t avail_idx;
1373
1374         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1375         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1376 }
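/*
 * Note on the arithmetic above: avail->idx and last_used_idx_res are free
 * running uint16_t counters, so the subtraction is performed modulo 65536
 * and stays correct across wrap-around. A hypothetical standalone check
 * (not used by this example):
 */
static inline uint16_t __attribute__((unused))
ring_free_entries_sketch(uint16_t avail_idx, uint16_t res_idx)
{
        /* e.g. avail_idx = 2, res_idx = 65534 -> (uint16_t)(2 - 65534) = 4 */
        return (uint16_t)(avail_idx - res_idx);
}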
1377
1378 /*
1379  * This function gets the available ring index for zero copy rx;
1380  * it will retry 'burst_rx_retry_num' times till it gets enough ring entries.
1381  * Only one thread will call this function for a particular virtio device,
1382  * so it is designed as a non-thread-safe function.
1383  */
1384 static inline uint32_t __attribute__((always_inline))
1385 get_available_ring_index_zcp(struct virtio_net *dev,
1386         uint16_t *res_base_idx, uint32_t count)
1387 {
1388         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1389         uint16_t avail_idx;
1390         uint32_t retry = 0;
1391         uint16_t free_entries;
1392
1393         *res_base_idx = vq->last_used_idx_res;
1394         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1395         free_entries = (avail_idx - *res_base_idx);
1396
1397         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1398                         "avail idx: %d, "
1399                         "res base idx:%d, free entries:%d\n",
1400                         dev->device_fh, avail_idx, *res_base_idx,
1401                         free_entries);
1402
1403         /*
1404          * If retry is enabled and the queue is full then we wait
1405          * and retry to avoid packet loss.
1406          */
1407         if (enable_retry && unlikely(count > free_entries)) {
1408                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1409                         rte_delay_us(burst_rx_delay_time);
1410                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1411                         free_entries = (avail_idx - *res_base_idx);
1412                         if (count <= free_entries)
1413                                 break;
1414                 }
1415         }
1416
1417         /*check that we have enough buffers*/
1418         if (unlikely(count > free_entries))
1419                 count = free_entries;
1420
1421         if (unlikely(count == 0)) {
1422                 LOG_DEBUG(VHOST_DATA,
1423                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1424                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1425                         dev->device_fh, avail_idx,
1426                         *res_base_idx, free_entries);
1427                 return 0;
1428         }
1429
1430         vq->last_used_idx_res = *res_base_idx + count;
1431
1432         return count;
1433 }
1434
1435 /*
1436  * This function puts a descriptor back on the used list.
1437  */
1438 static inline void __attribute__((always_inline))
1439 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1440 {
1441         uint16_t res_cur_idx = vq->last_used_idx;
1442         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1443         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1444         rte_compiler_barrier();
1445         *(volatile uint16_t *)&vq->used->idx += 1;
1446         vq->last_used_idx += 1;
1447
1448         /* Kick the guest if necessary. */
1449         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1450                 eventfd_write((int)vq->kickfd, 1);
1451 }
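/*
 * Sketch (a hypothetical helper, not called anywhere): the slot computation
 * used above relies on vq->size being a power of two, so that
 * "idx & (size - 1)" is equivalent to "idx % size" while avoiding a division.
 */
static inline uint32_t __attribute__((unused))
ring_slot_sketch(uint16_t idx, uint32_t size)
{
        /* e.g. idx = 260, size = 256 -> slot 4 */
        return idx & (size - 1);
}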
1452
1453 /*
1454  * This function gets an available descriptor from the virtio vring and an
1455  * un-attached mbuf from vpool->ring, and then attaches them together. It needs
1456  * to adjust the offset for buff_addr and phys_addr according to the PMD
1457  * implementation, otherwise the frame data may be put to the wrong location in the mbuf.
1458  */
1459 static inline void __attribute__((always_inline))
1460 attach_rxmbuf_zcp(struct virtio_net *dev)
1461 {
1462         uint16_t res_base_idx, desc_idx;
1463         uint64_t buff_addr, phys_addr;
1464         struct vhost_virtqueue *vq;
1465         struct vring_desc *desc;
1466         struct rte_mbuf *mbuf = NULL;
1467         struct vpool *vpool;
1468         hpa_type addr_type;
1469         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1470
1471         vpool = &vpool_array[vdev->vmdq_rx_q];
1472         vq = dev->virtqueue[VIRTIO_RXQ];
1473
1474         do {
1475                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1476                                 1) != 1))
1477                         return;
1478                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1479
1480                 desc = &vq->desc[desc_idx];
1481                 if (desc->flags & VRING_DESC_F_NEXT) {
1482                         desc = &vq->desc[desc->next];
1483                         buff_addr = gpa_to_vva(dev, desc->addr);
1484                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1485                                         &addr_type);
1486                 } else {
1487                         buff_addr = gpa_to_vva(dev,
1488                                         desc->addr + vq->vhost_hlen);
1489                         phys_addr = gpa_to_hpa(vdev,
1490                                         desc->addr + vq->vhost_hlen,
1491                                         desc->len, &addr_type);
1492                 }
1493
1494                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1495                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1496                                 " address found when attaching RX frame buffer"
1497                                 " address!\n", dev->device_fh);
1498                         put_desc_to_used_list_zcp(vq, desc_idx);
1499                         continue;
1500                 }
1501
1502                 /*
1503                  * Check if the frame buffer address from guest crosses
1504                  * a sub-region boundary or not.
1505                  */
1506                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1507                         RTE_LOG(ERR, VHOST_DATA,
1508                                 "(%"PRIu64") Frame buffer address cross "
1509                                 "sub-region found when attaching RX frame "
1510                                 "buffer address!\n",
1511                                 dev->device_fh);
1512                         put_desc_to_used_list_zcp(vq, desc_idx);
1513                         continue;
1514                 }
1515         } while (unlikely(phys_addr == 0));
1516
1517         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1518         if (unlikely(mbuf == NULL)) {
1519                 LOG_DEBUG(VHOST_DATA,
1520                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1521                         "ring_sc_dequeue fail.\n",
1522                         dev->device_fh);
1523                 put_desc_to_used_list_zcp(vq, desc_idx);
1524                 return;
1525         }
1526
1527         if (unlikely(vpool->buf_size > desc->len)) {
1528                 LOG_DEBUG(VHOST_DATA,
1529                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1530                         "length (%d) of descriptor idx %d is less than the room "
1531                         "size required: %d\n",
1532                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1533                 put_desc_to_used_list_zcp(vq, desc_idx);
1534                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1535                 return;
1536         }
1537
1538         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1539         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1540         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1541         mbuf->data_len = desc->len;
1542         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1543
1544         LOG_DEBUG(VHOST_DATA,
1545                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1546                 "descriptor idx:%d\n",
1547                 dev->device_fh, res_base_idx, desc_idx);
1548
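        /*
         * Note: the attached mbuf is returned straight to its mempool here;
         * in this zero copy design the vpool's mempool also serves as the RX
         * mempool of the VMDq queue, so the PMD can pick the guest-backed
         * mbuf up and receive directly into guest memory.
         */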
1549         __rte_mbuf_raw_free(mbuf);
1550
1551         return;
1552 }
1553
1554 /*
1555  * Detach an attached packet mbuf -
1556  *  - restore original mbuf address and length values.
1557  *  - reset pktmbuf data and data_len to their default values.
1558  *  All other fields of the given packet mbuf will be left intact.
1559  *
1560  * @param m
1561  *   The attached packet mbuf.
1562  */
1563 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1564 {
1565         const struct rte_mempool *mp = m->pool;
1566         void *buf = RTE_MBUF_TO_BADDR(m);
1567         uint32_t buf_ofs;
1568         uint32_t buf_len = mp->elt_size - sizeof(*m);
1569         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1570
1571         m->buf_addr = buf;
1572         m->buf_len = (uint16_t)buf_len;
1573
1574         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1575                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1576         m->data_off = buf_ofs;
1577
1578         m->data_len = 0;
1579 }
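/*
 * Sketch, assuming the mempool element layout this example configures (a
 * struct rte_mbuf header immediately followed by its data buffer, as the
 * MBUF_SIZE_ZCP definition implies): the buffer address that
 * pktmbuf_detach_zcp() restores via RTE_MBUF_TO_BADDR(). Hypothetical
 * helper, not used by this example.
 */
static inline void * __attribute__((unused))
default_buf_addr_sketch(struct rte_mbuf *m)
{
        return (char *)m + sizeof(*m);
}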
1580
1581 /*
1582  * This function is called after packets have been transmitted. It fetches mbufs
1583  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1584  * the used index and kicks the guest if necessary.
1585  */
1586 static inline uint32_t __attribute__((always_inline))
1587 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1588 {
1589         struct rte_mbuf *mbuf;
1590         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1591         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1592         uint32_t index = 0;
1593         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1594
1595         LOG_DEBUG(VHOST_DATA,
1596                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1597                 "clean is: %d\n",
1598                 dev->device_fh, mbuf_count);
1599         LOG_DEBUG(VHOST_DATA,
1600                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1601                 "clean is: %d\n",
1602                 dev->device_fh, rte_ring_count(vpool->ring));
1603
1604         for (index = 0; index < mbuf_count; index++) {
1605                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1606                 if (likely(MBUF_EXT_MEM(mbuf)))
1607                         pktmbuf_detach_zcp(mbuf);
1608                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1609
1610                 /* Update used index buffer information. */
1611                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1612                 vq->used->ring[used_idx].len = 0;
1613
1614                 used_idx = (used_idx + 1) & (vq->size - 1);
1615         }
1616
1617         LOG_DEBUG(VHOST_DATA,
1618                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1619                 "clean is: %d\n",
1620                 dev->device_fh, rte_mempool_count(vpool->pool));
1621         LOG_DEBUG(VHOST_DATA,
1622                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1623                 "clean is: %d\n",
1624                 dev->device_fh, rte_ring_count(vpool->ring));
1625         LOG_DEBUG(VHOST_DATA,
1626                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1627                 "vq->last_used_idx:%d\n",
1628                 dev->device_fh, vq->last_used_idx);
1629
1630         vq->last_used_idx += mbuf_count;
1631
1632         LOG_DEBUG(VHOST_DATA,
1633                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1634                 "vq->last_used_idx:%d\n",
1635                 dev->device_fh, vq->last_used_idx);
1636
1637         rte_compiler_barrier();
1638
1639         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1640
1641         /* Kick guest if required. */
1642         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1643                 eventfd_write((int)vq->kickfd, 1);
1644
1645         return 0;
1646 }
1647
1648 /*
1649  * This function is called when a virtio device is destroyed.
1650  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1651  */
1652 static void mbuf_destroy_zcp(struct vpool *vpool)
1653 {
1654         struct rte_mbuf *mbuf = NULL;
1655         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1656
1657         LOG_DEBUG(VHOST_CONFIG,
1658                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1659                 "mbuf_destroy_zcp is: %d\n",
1660                 mbuf_count);
1661         LOG_DEBUG(VHOST_CONFIG,
1662                 "in mbuf_destroy_zcp: mbuf count in ring before "
1663                 "mbuf_destroy_zcp is: %d\n",
1664                 rte_ring_count(vpool->ring));
1665
1666         for (index = 0; index < mbuf_count; index++) {
1667                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1668                 if (likely(mbuf != NULL)) {
1669                         if (likely(MBUF_EXT_MEM(mbuf)))
1670                                 pktmbuf_detach_zcp(mbuf);
1671                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1672                 }
1673         }
1674
1675         LOG_DEBUG(VHOST_CONFIG,
1676                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1677                 "mbuf_destroy_zcp is: %d\n",
1678                 rte_mempool_count(vpool->pool));
1679         LOG_DEBUG(VHOST_CONFIG,
1680                 "in mbuf_destroy_zcp: mbuf count in ring after "
1681                 "mbuf_destroy_zcp is : %d\n",
1682                 rte_ring_count(vpool->ring));
1683 }
1684
1685 /*
1686  * This function updates the used ring and index for zero copy rx.
1687  */
1688 static inline uint32_t __attribute__((always_inline))
1689 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1690         uint32_t count)
1691 {
1692         struct vhost_virtqueue *vq;
1693         struct vring_desc *desc;
1694         struct rte_mbuf *buff;
1695         /* The virtio_hdr is initialised to 0. */
1696         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1697                 = {{0, 0, 0, 0, 0, 0}, 0};
1698         uint64_t buff_hdr_addr = 0;
1699         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1700         uint32_t head_idx, packet_success = 0;
1701         uint16_t res_cur_idx;
1702
1703         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1704
1705         if (count == 0)
1706                 return 0;
1707
1708         vq = dev->virtqueue[VIRTIO_RXQ];
1709         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1710
1711         res_cur_idx = vq->last_used_idx;
1712         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1713                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1714
1715         /* Retrieve all of the head indexes first to avoid caching issues. */
1716         for (head_idx = 0; head_idx < count; head_idx++)
1717                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1718
1719         /*Prefetch descriptor index. */
1720         rte_prefetch0(&vq->desc[head[packet_success]]);
1721
1722         while (packet_success != count) {
1723                 /* Get descriptor from available ring */
1724                 desc = &vq->desc[head[packet_success]];
1725
1726                 buff = pkts[packet_success];
1727                 LOG_DEBUG(VHOST_DATA,
1728                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1729                         "pkt[%d] descriptor idx: %d\n",
1730                         dev->device_fh, packet_success,
1731                         MBUF_HEADROOM_UINT32(buff));
1732
1733                 PRINT_PACKET(dev,
1734                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1735                         + RTE_PKTMBUF_HEADROOM),
1736                         rte_pktmbuf_data_len(buff), 0);
1737
1738                 /* Buffer address translation for virtio header. */
1739                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1740                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1741
1742                 /*
1743                  * If the descriptors are chained the header and data are
1744                  * placed in separate buffers.
1745                  */
1746                 if (desc->flags & VRING_DESC_F_NEXT) {
1747                         desc->len = vq->vhost_hlen;
1748                         desc = &vq->desc[desc->next];
1749                         desc->len = rte_pktmbuf_data_len(buff);
1750                 } else {
1751                         desc->len = packet_len;
1752                 }
1753
1754                 /* Update used ring with desc information */
1755                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1756                         = head[packet_success];
1757                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1758                         = packet_len;
1759                 res_cur_idx++;
1760                 packet_success++;
1761
1762                 /* A header is required per buffer. */
1763                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1764                         (const void *)&virtio_hdr, vq->vhost_hlen);
1765
1766                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1767
1768                 if (likely(packet_success < count)) {
1769                         /* Prefetch descriptor index. */
1770                         rte_prefetch0(&vq->desc[head[packet_success]]);
1771                 }
1772         }
1773
1774         rte_compiler_barrier();
1775
1776         LOG_DEBUG(VHOST_DATA,
1777                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1778                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1779                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1780
1781         *(volatile uint16_t *)&vq->used->idx += count;
1782         vq->last_used_idx += count;
1783
1784         LOG_DEBUG(VHOST_DATA,
1785                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1786                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1787                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1788
1789         /* Kick the guest if necessary. */
1790         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1791                 eventfd_write((int)vq->kickfd, 1);
1792
1793         return count;
1794 }
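/*
 * Sketch of the used-ring publish pattern applied above (a hypothetical
 * helper, not called anywhere): the ring entries must be globally visible
 * before the index is bumped, hence the compiler barrier between the two
 * stores.
 */
static inline void __attribute__((unused))
publish_used_sketch(struct vhost_virtqueue *vq, uint16_t n)
{
        rte_compiler_barrier();
        *(volatile uint16_t *)&vq->used->idx += n;
        vq->last_used_idx += n;
}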
1795
1796 /*
1797  * This function routes the TX packet to the correct interface.
1798  * This may be a local device or the physical port.
1799  */
1800 static inline void __attribute__((always_inline))
1801 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1802         uint32_t desc_idx, uint8_t need_copy)
1803 {
1804         struct mbuf_table *tx_q;
1805         struct rte_mbuf **m_table;
1806         struct rte_mbuf *mbuf = NULL;
1807         unsigned len, ret, offset = 0;
1808         struct vpool *vpool;
1809         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1810         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1811
1812         /*Add packet to the port tx queue*/
1813         tx_q = &tx_queue_zcp[vmdq_rx_q];
1814         len = tx_q->len;
1815
1816         /* Allocate an mbuf and populate the structure. */
1817         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1818         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1819         if (unlikely(mbuf == NULL)) {
1820                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1821                 RTE_LOG(ERR, VHOST_DATA,
1822                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1823                         dev->device_fh);
1824                 put_desc_to_used_list_zcp(vq, desc_idx);
1825                 return;
1826         }
1827
1828         if (vm2vm_mode == VM2VM_HARDWARE) {
1829                 /* Avoid using a vlan tag from any vm for an external pkt, such as
1830                  * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1831                  * selection: the MAC address identifies it as an external pkt
1832                  * that should go to the network, while the vlan tag identifies it
1833                  * as a vm2vm pkt that should be forwarded to another vm. Hardware
1834                  * cannot resolve such an ambiguous situation, so the pkt would be lost.
1835                  */
1836                 vlan_tag = external_pkt_default_vlan_tag;
1837                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1838                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1839                         __rte_mbuf_raw_free(mbuf);
1840                         return;
1841                 }
1842         }
1843
1844         mbuf->nb_segs = m->nb_segs;
1845         mbuf->next = m->next;
1846         mbuf->data_len = m->data_len + offset;
1847         mbuf->pkt_len = mbuf->data_len;
1848         if (unlikely(need_copy)) {
1849                 /* Copy the packet contents to the mbuf. */
1850                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1851                         rte_pktmbuf_mtod(m, void *),
1852                         m->data_len);
1853         } else {
1854                 mbuf->data_off = m->data_off;
1855                 mbuf->buf_physaddr = m->buf_physaddr;
1856                 mbuf->buf_addr = m->buf_addr;
1857         }
1858         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1859         mbuf->vlan_tci = vlan_tag;
1860         mbuf->l2_len = sizeof(struct ether_hdr);
1861         mbuf->l3_len = sizeof(struct ipv4_hdr);
1862         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1863
1864         tx_q->m_table[len] = mbuf;
1865         len++;
1866
1867         LOG_DEBUG(VHOST_DATA,
1868                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1869                 dev->device_fh,
1870                 mbuf->nb_segs,
1871                 (mbuf->next == NULL) ? "null" : "non-null");
1872
1873         if (enable_stats) {
1874                 dev_statistics[dev->device_fh].tx_total++;
1875                 dev_statistics[dev->device_fh].tx++;
1876         }
1877
1878         if (unlikely(len == MAX_PKT_BURST)) {
1879                 m_table = (struct rte_mbuf **)tx_q->m_table;
1880                 ret = rte_eth_tx_burst(ports[0],
1881                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1882
1883                 /*
1884                  * Free any buffers not handled by TX and update
1885                  * the port stats.
1886                  */
1887                 if (unlikely(ret < len)) {
1888                         do {
1889                                 rte_pktmbuf_free(m_table[ret]);
1890                         } while (++ret < len);
1891                 }
1892
1893                 len = 0;
1894                 txmbuf_clean_zcp(dev, vpool);
1895         }
1896
1897         tx_q->len = len;
1898
1899         return;
1900 }
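/*
 * Sketch (a hypothetical helper, not used by this example) summarising the
 * mbuf fields virtio_tx_route_zcp() sets to request hardware VLAN insertion:
 * the PKT_TX_VLAN_PKT flag plus the tag in CPU byte order; the PMD inserts
 * the tag on transmit.
 */
static inline void __attribute__((unused))
request_hw_vlan_insert_sketch(struct rte_mbuf *m, uint16_t vlan_tag)
{
        m->ol_flags = PKT_TX_VLAN_PKT;
        m->vlan_tci = vlan_tag;
}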
1901
1902 /*
1903  * This function TXes all available packets in the virtio TX queue for one
1904  * virtio-net device. If it is the first packet, it learns the MAC address
1905  * and sets up VMDQ.
1906  */
1907 static inline void __attribute__((always_inline))
1908 virtio_dev_tx_zcp(struct virtio_net *dev)
1909 {
1910         struct rte_mbuf m;
1911         struct vhost_virtqueue *vq;
1912         struct vring_desc *desc;
1913         uint64_t buff_addr = 0, phys_addr;
1914         uint32_t head[MAX_PKT_BURST];
1915         uint32_t i;
1916         uint16_t free_entries, packet_success = 0;
1917         uint16_t avail_idx;
1918         uint8_t need_copy = 0;
1919         hpa_type addr_type;
1920         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1921
1922         vq = dev->virtqueue[VIRTIO_TXQ];
1923         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1924
1925         /* If there are no available buffers then return. */
1926         if (vq->last_used_idx_res == avail_idx)
1927                 return;
1928
1929         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1930
1931         /* Prefetch available ring to retrieve head indexes. */
1932         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1933
1934         /* Get the number of free entries in the ring */
1935         free_entries = (avail_idx - vq->last_used_idx_res);
1936
1937         /* Limit to MAX_PKT_BURST. */
1938         free_entries
1939                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1940
1941         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1942                 dev->device_fh, free_entries);
1943
1944         /* Retrieve all of the head indexes first to avoid caching issues. */
1945         for (i = 0; i < free_entries; i++)
1946                 head[i]
1947                         = vq->avail->ring[(vq->last_used_idx_res + i)
1948                         & (vq->size - 1)];
1949
1950         vq->last_used_idx_res += free_entries;
1951
1952         /* Prefetch descriptor index. */
1953         rte_prefetch0(&vq->desc[head[packet_success]]);
1954         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1955
1956         while (packet_success < free_entries) {
1957                 desc = &vq->desc[head[packet_success]];
1958
1959                 /* Discard first buffer as it is the virtio header */
1960                 desc = &vq->desc[desc->next];
1961
1962                 /* Buffer address translation. */
1963                 buff_addr = gpa_to_vva(dev, desc->addr);
1964                 /* Need to check an extra VLAN_HLEN bytes for inserting the VLAN tag */
1965                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1966                         &addr_type);
1967
1968                 if (likely(packet_success < (free_entries - 1)))
1969                         /* Prefetch descriptor index. */
1970                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1971
1972                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1973                         RTE_LOG(ERR, VHOST_DATA,
1974                                 "(%"PRIu64") Invalid frame buffer address found "
1975                                 "when TX packets!\n",
1976                                 dev->device_fh);
1977                         packet_success++;
1978                         continue;
1979                 }
1980
1981                 /* Prefetch buffer address. */
1982                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1983
1984                 /*
1985                  * Setup dummy mbuf. This is copied to a real mbuf if
1986                  * transmitted out the physical port.
1987                  */
1988                 m.data_len = desc->len;
1989                 m.nb_segs = 1;
1990                 m.next = NULL;
1991                 m.data_off = 0;
1992                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1993                 m.buf_physaddr = phys_addr;
1994
1995                 /*
1996                  * Check if the frame buffer address from guest crosses
1997                  * a sub-region boundary or not.
1998                  */
1999                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2000                         RTE_LOG(ERR, VHOST_DATA,
2001                                 "(%"PRIu64") Frame buffer address cross "
2002                                 "sub-region found when attaching TX frame "
2003                                 "buffer address!\n",
2004                                 dev->device_fh);
2005                         need_copy = 1;
2006                 } else
2007                         need_copy = 0;
2008
2009                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2010
2011                 /*
2012                  * If this is the first received packet we need to learn
2013                  * the MAC and set up VMDQ.
2014                  */
2015                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2016                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2017                                 /*
2018                                  * Discard frame if device is scheduled for
2019                                  * removal or a duplicate MAC address is found.
2020                                  */
2021                                 packet_success += free_entries;
2022                                 vq->last_used_idx += packet_success;
2023                                 break;
2024                         }
2025                 }
2026
2027                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2028                 packet_success++;
2029         }
2030 }
2031
2032 /*
2033  * This function is called by each data core. It handles all RX/TX registered
2034  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2035  * addresses are compared with all devices in the main linked list.
2036  */
2037 static int
2038 switch_worker_zcp(__attribute__((unused)) void *arg)
2039 {
2040         struct virtio_net *dev = NULL;
2041         struct vhost_dev  *vdev = NULL;
2042         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2043         struct virtio_net_data_ll *dev_ll;
2044         struct mbuf_table *tx_q;
2045         volatile struct lcore_ll_info *lcore_ll;
2046         const uint64_t drain_tsc
2047                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2048                 * BURST_TX_DRAIN_US;
2049         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2050         unsigned ret;
2051         const uint16_t lcore_id = rte_lcore_id();
2052         uint16_t count_in_ring, rx_count = 0;
2053
2054         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2055
2056         lcore_ll = lcore_info[lcore_id].lcore_ll;
2057         prev_tsc = 0;
2058
2059         while (1) {
2060                 cur_tsc = rte_rdtsc();
2061
2062                 /* TX burst queue drain */
2063                 diff_tsc = cur_tsc - prev_tsc;
2064                 if (unlikely(diff_tsc > drain_tsc)) {
2065                         /*
2066                          * Get mbuf from vpool.pool and detach mbuf and
2067                          * put back into vpool.ring.
2068                          */
2069                         dev_ll = lcore_ll->ll_root_used;
2070                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2071                                 /* Get virtio device ID */
2072                                 vdev = dev_ll->vdev;
2073                                 dev = vdev->dev;
2074
2075                                 if (likely(!vdev->remove)) {
2076                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2077                                         if (tx_q->len) {
2078                                                 LOG_DEBUG(VHOST_DATA,
2079                                                 "TX queue drained after timeout"
2080                                                 " with burst size %u\n",
2081                                                 tx_q->len);
2082
2083                                                 /*
2084                                                  * Tx any packets in the queue
2085                                                  */
2086                                                 ret = rte_eth_tx_burst(
2087                                                         ports[0],
2088                                                         (uint16_t)tx_q->txq_id,
2089                                                         (struct rte_mbuf **)
2090                                                         tx_q->m_table,
2091                                                         (uint16_t)tx_q->len);
2092                                                 if (unlikely(ret < tx_q->len)) {
2093                                                         do {
2094                                                                 rte_pktmbuf_free(
2095                                                                         tx_q->m_table[ret]);
2096                                                         } while (++ret < tx_q->len);
2097                                                 }
2098                                                 tx_q->len = 0;
2099
2100                                                 txmbuf_clean_zcp(dev,
2101                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2102                                         }
2103                                 }
2104                                 dev_ll = dev_ll->next;
2105                         }
2106                         prev_tsc = cur_tsc;
2107                 }
2108
2109                 rte_prefetch0(lcore_ll->ll_root_used);
2110
2111                 /*
2112                  * Inform the configuration core that we have exited the linked
2113                  * list and that no devices are in use if requested.
2114                  */
2115                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2116                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2117
2118                 /* Process devices */
2119                 dev_ll = lcore_ll->ll_root_used;
2120
2121                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2122                         vdev = dev_ll->vdev;
2123                         dev  = vdev->dev;
2124                         if (unlikely(vdev->remove)) {
2125                                 dev_ll = dev_ll->next;
2126                                 unlink_vmdq(vdev);
2127                                 vdev->ready = DEVICE_SAFE_REMOVE;
2128                                 continue;
2129                         }
2130
2131                         if (likely(vdev->ready == DEVICE_RX)) {
2132                                 uint32_t index = vdev->vmdq_rx_q;
2133                                 uint16_t i;
2134                                 count_in_ring
2135                                 = rte_ring_count(vpool_array[index].ring);
2136                                 uint16_t free_entries
2137                                 = (uint16_t)get_available_ring_num_zcp(dev);
2138
2139                                 /*
2140                                  * Attach all mbufs in vpool.ring and put back
2141                                  * into vpool.pool.
2142                                  */
2143                                 for (i = 0;
2144                                 i < RTE_MIN(free_entries,
2145                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2146                                 i++)
2147                                         attach_rxmbuf_zcp(dev);
2148
2149                                 /* Handle guest RX */
2150                                 rx_count = rte_eth_rx_burst(ports[0],
2151                                         vdev->vmdq_rx_q, pkts_burst,
2152                                         MAX_PKT_BURST);
2153
2154                                 if (rx_count) {
2155                                         ret_count = virtio_dev_rx_zcp(dev,
2156                                                         pkts_burst, rx_count);
2157                                         if (enable_stats) {
2158                                                 dev_statistics[dev->device_fh].rx_total
2159                                                         += rx_count;
2160                                                 dev_statistics[dev->device_fh].rx
2161                                                         += ret_count;
2162                                         }
2163                                         while (likely(rx_count)) {
2164                                                 rx_count--;
2165                                                 pktmbuf_detach_zcp(
2166                                                         pkts_burst[rx_count]);
2167                                                 rte_ring_sp_enqueue(
2168                                                         vpool_array[index].ring,
2169                                                         (void *)pkts_burst[rx_count]);
2170                                         }
2171                                 }
2172                         }
2173
2174                         if (likely(!vdev->remove))
2175                                 /* Handle guest TX */
2176                                 virtio_dev_tx_zcp(dev);
2177
2178                         /* Move to the next device in the list */
2179                         dev_ll = dev_ll->next;
2180                 }
2181         }
2182
2183         return 0;
2184 }
2185
2186
2187 /*
2188  * Add an entry to a used linked list. A free entry must first be found
2189  * in the free linked list using get_data_ll_free_entry();
2190  */
2191 static void
2192 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2193         struct virtio_net_data_ll *ll_dev)
2194 {
2195         struct virtio_net_data_ll *ll = *ll_root_addr;
2196
2197         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2198         ll_dev->next = NULL;
2199         rte_compiler_barrier();
2200
2201         /* If ll == NULL then this is the first device. */
2202         if (ll) {
2203                 /* Increment to the tail of the linked list. */
2204                 while ((ll->next != NULL) )
2205                         ll = ll->next;
2206
2207                 ll->next = ll_dev;
2208         } else {
2209                 *ll_root_addr = ll_dev;
2210         }
2211 }
2212
2213 /*
2214  * Remove an entry from a used linked list. The entry must then be added to
2215  * the free linked list using put_data_ll_free_entry().
2216  */
2217 static void
2218 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2219         struct virtio_net_data_ll *ll_dev,
2220         struct virtio_net_data_ll *ll_dev_last)
2221 {
2222         struct virtio_net_data_ll *ll = *ll_root_addr;
2223
2224         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2225                 return;
2226
2227         if (ll_dev == ll)
2228                 *ll_root_addr = ll_dev->next;
2229         else
2230                 if (likely(ll_dev_last != NULL))
2231                         ll_dev_last->next = ll_dev->next;
2232                 else
2233                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2234 }
2235
2236 /*
2237  * Find and return an entry from the free linked list.
2238  */
2239 static struct virtio_net_data_ll *
2240 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2241 {
2242         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2243         struct virtio_net_data_ll *ll_dev;
2244
2245         if (ll_free == NULL)
2246                 return NULL;
2247
2248         ll_dev = ll_free;
2249         *ll_root_addr = ll_free->next;
2250
2251         return ll_dev;
2252 }
2253
2254 /*
2255  * Place an entry back on to the free linked list.
2256  */
2257 static void
2258 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2259         struct virtio_net_data_ll *ll_dev)
2260 {
2261         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2262
2263         if (ll_dev == NULL)
2264                 return;
2265
2266         ll_dev->next = ll_free;
2267         *ll_root_addr = ll_dev;
2268 }
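/*
 * Hypothetical usage sketch (not called anywhere in this example) combining
 * the linked-list helpers above: claim an entry from a free list, bind it to
 * a device and append it to the matching used list.
 */
static inline int __attribute__((unused))
claim_ll_entry_sketch(struct virtio_net_data_ll **free_root,
        struct virtio_net_data_ll **used_root, struct vhost_dev *vdev)
{
        struct virtio_net_data_ll *entry = get_data_ll_free_entry(free_root);

        if (entry == NULL)
                return -1;

        entry->vdev = vdev;
        add_data_ll_entry(used_root, entry);
        return 0;
}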
2269
2270 /*
2271  * Creates a linked list of a given size.
2272  */
2273 static struct virtio_net_data_ll *
2274 alloc_data_ll(uint32_t size)
2275 {
2276         struct virtio_net_data_ll *ll_new;
2277         uint32_t i;
2278
2279         /* Malloc and then chain the linked list. */
2280         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2281         if (ll_new == NULL) {
2282                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2283                 return NULL;
2284         }
2285
2286         for (i = 0; i < size - 1; i++) {
2287                 ll_new[i].vdev = NULL;
2288                 ll_new[i].next = &ll_new[i+1];
2289         }
2290         ll_new[i].next = NULL;
2291
2292         return (ll_new);
2293 }
2294
2295 /*
2296  * Create the main linked list along with each individual core's linked list. A used and a free list
2297  * are created to manage entries.
2298  */
2299 static int
2300 init_data_ll (void)
2301 {
2302         int lcore;
2303
2304         RTE_LCORE_FOREACH_SLAVE(lcore) {
2305                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2306                 if (lcore_info[lcore].lcore_ll == NULL) {
2307                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2308                         return -1;
2309                 }
2310
2311                 lcore_info[lcore].lcore_ll->device_num = 0;
2312                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2313                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2314                 if (num_devices % num_switching_cores)
2315                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2316                 else
2317                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2318         }
2319
2320         /* Allocate devices up to a maximum of MAX_DEVICES. */
2321         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2322
2323         return 0;
2324 }
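/*
 * Sketch of the per-core sizing logic above (a hypothetical helper, not
 * called anywhere): the modulo test plus "+ 1" is simply a ceiling division
 * of devices over switching cores.
 */
static inline uint32_t __attribute__((unused))
entries_per_core_sketch(uint32_t devices, uint32_t cores)
{
        return (devices + cores - 1) / cores;
}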
2325
2326 /*
2327  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2328  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2329  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2330  */
2331 static void
2332 destroy_device (volatile struct virtio_net *dev)
2333 {
2334         struct virtio_net_data_ll *ll_lcore_dev_cur;
2335         struct virtio_net_data_ll *ll_main_dev_cur;
2336         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2337         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2338         struct vhost_dev *vdev;
2339         int lcore;
2340
2341         dev->flags &= ~VIRTIO_DEV_RUNNING;
2342
2343         vdev = (struct vhost_dev *)dev->priv;
2344         /* Set the remove flag. */
2345         vdev->remove = 1;
2346         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2347                 rte_pause();
2348         }
2349
2350         /* Search for entry to be removed from lcore ll */
2351         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2352         while (ll_lcore_dev_cur != NULL) {
2353                 if (ll_lcore_dev_cur->vdev == vdev) {
2354                         break;
2355                 } else {
2356                         ll_lcore_dev_last = ll_lcore_dev_cur;
2357                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2358                 }
2359         }
2360
2361         if (ll_lcore_dev_cur == NULL) {
2362                 RTE_LOG(ERR, VHOST_CONFIG,
2363                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2364                         dev->device_fh);
2365                 return;
2366         }
2367
2368         /* Search for entry to be removed from main ll */
2369         ll_main_dev_cur = ll_root_used;
2370         ll_main_dev_last = NULL;
2371         while (ll_main_dev_cur != NULL) {
2372                 if (ll_main_dev_cur->vdev == vdev) {
2373                         break;
2374                 } else {
2375                         ll_main_dev_last = ll_main_dev_cur;
2376                         ll_main_dev_cur = ll_main_dev_cur->next;
2377                 }
2378         }
2379
2380         /* Remove entries from the lcore and main ll. */
2381         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2382         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2383
2384         /* Set the dev_removal_flag on each lcore. */
2385         RTE_LCORE_FOREACH_SLAVE(lcore) {
2386                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2387         }
2388
2389         /*
2390          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2391          * they can no longer access the device removed from the linked lists and that the devices
2392          * are no longer in use.
2393          */
2394         RTE_LCORE_FOREACH_SLAVE(lcore) {
2395                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2396                         rte_pause();
2397                 }
2398         }
2399
2400         /* Add the entries back to the lcore and main free ll.*/
2401         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2402         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2403
2404         /* Decrement number of device on the lcore. */
2405         lcore_info[vdev->coreid].lcore_ll->device_num--;
2406
2407         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2408
2409         if (zero_copy) {
2410                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2411
2412                 /* Stop the RX queue. */
2413                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2414                         LOG_DEBUG(VHOST_CONFIG,
2415                                 "(%"PRIu64") In destroy_device: Failed to stop "
2416                                 "rx queue:%d\n",
2417                                 dev->device_fh,
2418                                 vdev->vmdq_rx_q);
2419                 }
2420
2421                 LOG_DEBUG(VHOST_CONFIG,
2422                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2423                         "mempool back to ring for RX queue: %d\n",
2424                         dev->device_fh, vdev->vmdq_rx_q);
2425
2426                 mbuf_destroy_zcp(vpool);
2427
2428                 /* Stop the TX queue. */
2429                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2430                         LOG_DEBUG(VHOST_CONFIG,
2431                                 "(%"PRIu64") In destroy_device: Failed to "
2432                                 "stop tx queue:%d\n",
2433                                 dev->device_fh, vdev->vmdq_rx_q);
2434                 }
2435
2436                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2437
2438                 LOG_DEBUG(VHOST_CONFIG,
2439                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2440                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2441                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2442                         dev->device_fh);
2443
2444                 mbuf_destroy_zcp(vpool);
2445                 rte_free(vdev->regions_hpa);
2446         }
2447         rte_free(vdev);
2448
2449 }
2450
2451 /*
2452  * Calculate the count of physically contiguous sub-regions for one particular
2453  * region whose vhost virtual address is contiguous. The particular region
2454  * starts from vva_start, with a size of 'size' given in the argument.
2455  */
2456 static uint32_t
2457 check_hpa_regions(uint64_t vva_start, uint64_t size)
2458 {
2459         uint32_t i, nregions = 0, page_size = getpagesize();
2460         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2461         if (vva_start % page_size) {
2462                 LOG_DEBUG(VHOST_CONFIG,
2463                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2464                         "has remainder\n",
2465                         (void *)(uintptr_t)vva_start, page_size);
2466                 return 0;
2467         }
2468         if (size % page_size) {
2469                 LOG_DEBUG(VHOST_CONFIG,
2470                         "in check_hpa_regions: "
2471                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2472                         size, page_size);
2473                 return 0;
2474         }
2475         for (i = 0; i < size - page_size; i = i + page_size) {
2476                 cur_phys_addr
2477                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2478                 next_phys_addr = rte_mem_virt2phy(
2479                         (void *)(uintptr_t)(vva_start + i + page_size));
2480                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2481                         ++nregions;
2482                         LOG_DEBUG(VHOST_CONFIG,
2483                                 "in check_hpa_regions: hva addr:(%p) is not "
2484                                 "continuous with hva addr:(%p), diff:%d\n",
2485                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2486                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2487                                 + page_size), page_size);
2488                         LOG_DEBUG(VHOST_CONFIG,
2489                                 "in check_hpa_regions: hpa addr:(%p) is not "
2490                                 "continuous with hpa addr:(%p), "
2491                                 "diff:(%"PRIu64")\n",
2492                                 (void *)(uintptr_t)cur_phys_addr,
2493                                 (void *)(uintptr_t)next_phys_addr,
2494                                 (next_phys_addr-cur_phys_addr));
2495                 }
2496         }
2497         return nregions;
2498 }
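/*
 * Sketch (a hypothetical helper, not used by this example) of the
 * contiguity test check_hpa_regions() applies per page pair: two virtually
 * adjacent pages are physically contiguous iff the first page's physical
 * address plus the page size equals the second page's physical address.
 */
static inline int __attribute__((unused))
pages_phys_contiguous_sketch(uint64_t vva, uint32_t page_size)
{
        uint64_t cur = rte_mem_virt2phy((void *)(uintptr_t)vva);
        uint64_t next = rte_mem_virt2phy((void *)(uintptr_t)(vva + page_size));

        return (cur + page_size) == next;
}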
2499
2500 /*
2501  * Divide each region whose vhost virtual address is contiguous into a few
2502  * sub-regions, making sure the physical addresses within each sub-region are
2503  * contiguous, and fill the offset (to GPA), size and other information of each
2504  * sub-region into regions_hpa.
2505  */
2506 static uint32_t
2507 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2508 {
2509         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2510         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2511
2512         if (mem_region_hpa == NULL)
2513                 return 0;
2514
2515         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2516                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2517                         virtio_memory->regions[regionidx].address_offset;
2518                 mem_region_hpa[regionidx_hpa].guest_phys_address
2519                         = virtio_memory->regions[regionidx].guest_phys_address;
2520                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2521                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2522                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2523                 LOG_DEBUG(VHOST_CONFIG,
2524                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2525                         regionidx_hpa,
2526                         (void *)(uintptr_t)
2527                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2528                 LOG_DEBUG(VHOST_CONFIG,
2529                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2530                         regionidx_hpa,
2531                         (void *)(uintptr_t)
2532                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
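                /*
                 * Scan the region page by page: 'k' accumulates the size of
                 * the physically contiguous run seen so far. On each
                 * discontinuity the current sub-region is closed (end address
                 * and size recorded) and the next one is opened at the same
                 * guest physical address with a new host offset.
                 */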
2533                 for (i = 0, k = 0;
2534                         i < virtio_memory->regions[regionidx].memory_size -
2535                                 page_size;
2536                         i += page_size) {
2537                         cur_phys_addr = rte_mem_virt2phy(
2538                                         (void *)(uintptr_t)(vva_start + i));
2539                         next_phys_addr = rte_mem_virt2phy(
2540                                         (void *)(uintptr_t)(vva_start +
2541                                         i + page_size));
2542                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2543                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2544                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2545                                         k + page_size;
2546                                 mem_region_hpa[regionidx_hpa].memory_size
2547                                         = k + page_size;
2548                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2549                                         "phys addr end  [%d]:(%p)\n",
2550                                         regionidx_hpa,
2551                                         (void *)(uintptr_t)
2552                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2553                                 LOG_DEBUG(VHOST_CONFIG,
2554                                         "in fill_hpa_regions: guest phys addr "
2555                                         "size [%d]:(%p)\n",
2556                                         regionidx_hpa,
2557                                         (void *)(uintptr_t)
2558                                         (mem_region_hpa[regionidx_hpa].memory_size));
2559                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2560                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2561                                 ++regionidx_hpa;
2562                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2563                                         next_phys_addr -
2564                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2565                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2566                                         " phys addr start[%d]:(%p)\n",
2567                                         regionidx_hpa,
2568                                         (void *)(uintptr_t)
2569                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2570                                 LOG_DEBUG(VHOST_CONFIG,
2571                                         "in fill_hpa_regions: host  phys addr "
2572                                         "start[%d]:(%p)\n",
2573                                         regionidx_hpa,
2574                                         (void *)(uintptr_t)
2575                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2576                                 k = 0;
2577                         } else {
2578                                 k += page_size;
2579                         }
2580                 }
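                /*
                 * Close out the last sub-region of this guest region; it
                 * always covers the final 'k + page_size' bytes.
                 */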
2581                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2582                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2583                         + k + page_size;
2584                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2585                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2586                         "[%d]:(%p)\n", regionidx_hpa,
2587                         (void *)(uintptr_t)
2588                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2589                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2590                         "[%d]:(%p)\n", regionidx_hpa,
2591                         (void *)(uintptr_t)
2592                         (mem_region_hpa[regionidx_hpa].memory_size));
2593                 ++regionidx_hpa;
2594         }
2595         return regionidx_hpa;
2596 }
2597
2598 /*
2599  * A new device is added to a data core. First the device is added to
2600  * the main linked list and then allocated to a specific data core.
2601  */
2602 static int
2603 new_device(struct virtio_net *dev)
2604 {
2605         struct virtio_net_data_ll *ll_dev;
2606         int lcore, core_add = 0;
2607         uint32_t device_num_min = num_devices;
2608         struct vhost_dev *vdev;
2609         uint32_t regionidx;
2610
2611         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2612         if (vdev == NULL) {
2613                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2614                         dev->device_fh);
2615                 return -1;
2616         }
2617         vdev->dev = dev;
2618         dev->priv = vdev;
2619
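        /*
         * For zero copy, re-describe the guest memory regions in host
         * physical address space: first count the physically contiguous
         * sub-regions, then allocate the table and let
         * fill_hpa_memory_regions() populate it; a mismatch between the
         * two counts is treated as a fatal setup error.
         */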
2620         if (zero_copy) {
2621                 vdev->nregions_hpa = dev->mem->nregions;
2622                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2623                         vdev->nregions_hpa
2624                                 += check_hpa_regions(
2625                                         dev->mem->regions[regionidx].guest_phys_address
2626                                         + dev->mem->regions[regionidx].address_offset,
2627                                         dev->mem->regions[regionidx].memory_size);
2628
2629                 }
2630
2631                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2632                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2633                         RTE_CACHE_LINE_SIZE);
2634                 if (vdev->regions_hpa == NULL) {
2635                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2636                         rte_free(vdev);
2637                         return -1;
2638                 }
2639
2641                 if (fill_hpa_memory_regions(
2642                         vdev->regions_hpa, dev->mem
2643                         ) != vdev->nregions_hpa) {
2644
2645                         RTE_LOG(ERR, VHOST_CONFIG,
2646                                 "hpa memory regions number mismatch: "
2647                                 "[%d]\n", vdev->nregions_hpa);
2648                         rte_free(vdev->regions_hpa);
2649                         rte_free(vdev);
2650                         return -1;
2651                 }
2652         }
2653
2655         /* Add device to main ll */
2656         ll_dev = get_data_ll_free_entry(&ll_root_free);
2657         if (ll_dev == NULL) {
2658                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2659                         "of %d devices per core has been reached\n",
2660                         dev->device_fh, num_devices);
2661                 if (vdev->regions_hpa)
2662                         rte_free(vdev->regions_hpa);
2663                 rte_free(vdev);
2664                 return -1;
2665         }
2666         ll_dev->vdev = vdev;
2667         add_data_ll_entry(&ll_root_used, ll_dev);
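        /*
         * Each device owns the first RX queue of its VMDq pool: pool number
         * (device_fh) times queues per pool, offset by the port's base VMDq
         * queue index.
         */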
2668         vdev->vmdq_rx_q
2669                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2670
2671         if (zero_copy) {
2672                 uint32_t index = vdev->vmdq_rx_q;
2673                 uint32_t count_in_ring, i;
2674                 struct mbuf_table *tx_q;
2675
2676                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2677
2678                 LOG_DEBUG(VHOST_CONFIG,
2679                         "(%"PRIu64") in new_device: mbuf count in mempool "
2680                         "before attach is: %d\n",
2681                         dev->device_fh,
2682                         rte_mempool_count(vpool_array[index].pool));
2683                 LOG_DEBUG(VHOST_CONFIG,
2684                         "(%"PRIu64") in new_device: mbuf count in ring "
2685                         "before attach is: %d\n",
2686                         dev->device_fh, count_in_ring);
2687
2688                 /*
2689                  * Attach all mbufs in vpool.ring and put them back into
2690                  * vpool.pool.
2691                  */
2691                 for (i = 0; i < count_in_ring; i++)
2692                         attach_rxmbuf_zcp(dev);
2693
2694                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2695                         "mempool after attach is: %d\n",
2696                         dev->device_fh,
2697                         rte_mempool_count(vpool_array[index].pool));
2698                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2699                         "ring after attach is: %d\n",
2700                         dev->device_fh,
2701                         rte_ring_count(vpool_array[index].ring));
2702
2703                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2704                 tx_q->txq_id = vdev->vmdq_rx_q;
2705
2706                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2707                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2708
2709                         LOG_DEBUG(VHOST_CONFIG,
2710                                 "(%"PRIu64") In new_device: Failed to start "
2711                                 "tx queue:%d\n",
2712                                 dev->device_fh, vdev->vmdq_rx_q);
2713
2714                         mbuf_destroy_zcp(vpool);
2715                         rte_free(vdev->regions_hpa);
2716                         rte_free(vdev);
2717                         return -1;
2718                 }
2719
2720                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2721                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2722
2723                         LOG_DEBUG(VHOST_CONFIG,
2724                                 "(%"PRIu64") In new_device: Failed to start "
2725                                 "rx queue:%d\n",
2726                                 dev->device_fh, vdev->vmdq_rx_q);
2727
2728                         /* Stop the TX queue. */
2729                         if (rte_eth_dev_tx_queue_stop(ports[0],
2730                                 vdev->vmdq_rx_q) != 0) {
2731                                 LOG_DEBUG(VHOST_CONFIG,
2732                                         "(%"PRIu64") In new_device: Failed to "
2733                                         "stop tx queue:%d\n",
2734                                         dev->device_fh, vdev->vmdq_rx_q);
2735                         }
2736
2737                         mbuf_destroy_zcp(vpool);
2738                         rte_free(vdev->regions_hpa);
2739                         rte_free(vdev);
2740                         return -1;
2741                 }
2742
2743         }
2744
2745         /* Reset the ready flag. */
2746         vdev->ready = DEVICE_MAC_LEARNING;
2747         vdev->remove = 0;
2748
2749         /* Find a suitable lcore to add the device. */
2750         RTE_LCORE_FOREACH_SLAVE(lcore) {
2751                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2752                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2753                         core_add = lcore;
2754                 }
2755         }
2756         /* Add device to lcore ll */
2757         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2758         if (ll_dev == NULL) {
2759                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2760                 vdev->ready = DEVICE_SAFE_REMOVE;
2761                 destroy_device(dev);
2762                 if (vdev->regions_hpa)
2763                         rte_free(vdev->regions_hpa);
2764                 rte_free(vdev);
2765                 return -1;
2766         }
2767         ll_dev->vdev = vdev;
2768         vdev->coreid = core_add;
2769
2770         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2771
2772         /* Initialize device stats */
2773         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2774
2775         /* Disable notifications. */
2776         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2777         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2778         lcore_info[vdev->coreid].lcore_ll->device_num++;
2779         dev->flags |= VIRTIO_DEV_RUNNING;
2780
2781         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2782
2783         return 0;
2784 }
2785
2786 /*
2787  * These callbacks allow devices to be added to the data core when
2788  * configuration has fully completed.
2789  */
2790 static const struct virtio_net_device_ops virtio_net_device_ops =
2791 {
2792         .new_device =  new_device,
2793         .destroy_device = destroy_device,
2794 };
2795
2796 /*
2797  * This thread wakes up periodically to print statistics if the user
2798  * has enabled them.
2799  */
2800 static void
2801 print_stats(void)
2802 {
2803         struct virtio_net_data_ll *dev_ll;
2804         uint64_t tx_dropped, rx_dropped;
2805         uint64_t tx, tx_total, rx, rx_total;
2806         uint32_t device_fh;
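        /*
         * VT100/ANSI escape sequences: clr ("ESC[2J") clears the screen and
         * top_left ("ESC[1;1H") moves the cursor to row 1, column 1 --
         * equivalent to printf("\033[2J\033[1;1H").
         */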
2807         const char clr[] = { 27, '[', '2', 'J', '\0' };
2808         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2809
2810         while (1) {
2811                 sleep(enable_stats);
2812
2813                 /* Clear screen and move to top left */
2814                 printf("%s%s", clr, top_left);
2815
2816                 printf("\nDevice statistics ====================================");
2817
2818                 dev_ll = ll_root_used;
2819                 while (dev_ll != NULL) {
2820                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2821                         tx_total = dev_statistics[device_fh].tx_total;
2822                         tx = dev_statistics[device_fh].tx;
2823                         tx_dropped = tx_total - tx;
2824                         if (zero_copy == 0) {
2825                                 rx_total = rte_atomic64_read(
2826                                         &dev_statistics[device_fh].rx_total_atomic);
2827                                 rx = rte_atomic64_read(
2828                                         &dev_statistics[device_fh].rx_atomic);
2829                         } else {
2830                                 rx_total = dev_statistics[device_fh].rx_total;
2831                                 rx = dev_statistics[device_fh].rx;
2832                         }
2833                         rx_dropped = rx_total - rx;
2834
2835                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2836                                         "\nTX total:            %"PRIu64""
2837                                         "\nTX dropped:          %"PRIu64""
2838                                         "\nTX successful:               %"PRIu64""
2839                                         "\nRX total:            %"PRIu64""
2840                                         "\nRX dropped:          %"PRIu64""
2841                                         "\nRX successful:               %"PRIu64"",
2842                                         device_fh,
2843                                         tx_total,
2844                                         tx_dropped,
2845                                         tx,
2846                                         rx_total,
2847                                         rx_dropped,
2848                                         rx);
2849
2850                         dev_ll = dev_ll->next;
2851                 }
2852                 printf("\n======================================================\n");
2853         }
2854 }
2855
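/*
 * Create one zero-copy mempool plus its companion ring for a given queue
 * index. The ring is sized to the next power of two above nb_mbuf so that
 * every mbuf of the pool can be parked in it, and buf_size records the
 * usable data room (the zero-copy descriptor length, excluding headroom).
 */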
2856 static void
2857 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2858         char *ring_name, uint32_t nb_mbuf)
2859 {
2860         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2861         vpool_array[index].pool
2862                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2863                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2864                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2865                 rte_pktmbuf_init, NULL, socket, 0);
2866         if (vpool_array[index].pool != NULL) {
2867                 vpool_array[index].ring
2868                         = rte_ring_create(ring_name,
2869                                 rte_align32pow2(nb_mbuf + 1),
2870                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2871                 if (likely(vpool_array[index].ring != NULL)) {
2872                         LOG_DEBUG(VHOST_CONFIG,
2873                                 "in setup_mempool_tbl: mbuf count in "
2874                                 "mempool is: %d\n",
2875                                 rte_mempool_count(vpool_array[index].pool));
2876                         LOG_DEBUG(VHOST_CONFIG,
2877                                 "in setup_mempool_tbl: mbuf count in "
2878                                 "ring   is: %d\n",
2879                                 rte_ring_count(vpool_array[index].ring));
2880                 } else {
2881                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2882                                 ring_name);
2883                 }
2884
2885                 /* Need to consider the headroom. */
2886                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2887         } else {
2888                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2889         }
2890 }
2891
2892
2893 /*
2894  * Main function: performs initialisation and calls the per-lcore functions.
2895  * The CUSE device is also registered here to handle the IOCTLs.
2896  */
2897 int
2898 main(int argc, char *argv[])
2899 {
2900         struct rte_mempool *mbuf_pool = NULL;
2901         unsigned lcore_id, core_id = 0;
2902         unsigned nb_ports, valid_num_ports;
2903         int ret;
2904         uint8_t portid;
2905         uint16_t queue_id;
2906         static pthread_t tid;
2907
2908         /* init EAL */
2909         ret = rte_eal_init(argc, argv);
2910         if (ret < 0)
2911                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2912         argc -= ret;
2913         argv += ret;
2914
2915         /* parse app arguments */
2916         ret = us_vhost_parse_args(argc, argv);
2917         if (ret < 0)
2918                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2919
2920         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2921                 if (rte_lcore_is_enabled(lcore_id))
2922                         lcore_ids[core_id++] = lcore_id;
2923
2924         if (rte_lcore_count() > RTE_MAX_LCORE)
2925                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2926
2927         /* Set the number of switching cores available. */
2928         num_switching_cores = rte_lcore_count() - 1;
2929
2930         /* Get the number of physical ports. */
2931         nb_ports = rte_eth_dev_count();
2932         if (nb_ports > RTE_MAX_ETHPORTS)
2933                 nb_ports = RTE_MAX_ETHPORTS;
2934
2935         /*
2936          * Update the global variable num_ports and the global array ports[],
2937          * and derive valid_num_ports from the number of ports on the system.
2938          */
2939         valid_num_ports = check_ports_num(nb_ports);
2940
2941         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2942                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2943                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2944                 return -1;
2945         }
2946
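        /*
         * Two allocation strategies: the regular path shares one big mbuf
         * pool across all queues, while the zero-copy path creates one
         * mempool/ring pair per RX queue and per TX queue so that mbufs can
         * be attached to guest buffers individually.
         */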
2947         if (zero_copy == 0) {
2948                 /* Create the mbuf pool. */
2949                 mbuf_pool = rte_mempool_create(
2950                                 "MBUF_POOL",
2951                                 NUM_MBUFS_PER_PORT
2952                                 * valid_num_ports,
2953                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2954                                 sizeof(struct rte_pktmbuf_pool_private),
2955                                 rte_pktmbuf_pool_init, NULL,
2956                                 rte_pktmbuf_init, NULL,
2957                                 rte_socket_id(), 0);
2958                 if (mbuf_pool == NULL)
2959                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2960
2961                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2962                         vpool_array[queue_id].pool = mbuf_pool;
2963
2964                 if (vm2vm_mode == VM2VM_HARDWARE) {
2965                         /* Enable VT loop back to let L2 switch to do it. */
2966                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2967                         LOG_DEBUG(VHOST_CONFIG,
2968                                 "Enable loop back for L2 switch in vmdq.\n");
2969                 }
2970         } else {
2971                 uint32_t nb_mbuf;
2972                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2973                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2974
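                /*
                 * Size each zero-copy RX pool to cover the NIC RX descriptors
                 * plus a per-core cache allowance and one packet burst per
                 * switching core (the TX pools below are sized analogously).
                 */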
2975                 nb_mbuf = num_rx_descriptor
2976                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2977                         + num_switching_cores * MAX_PKT_BURST;
2978
2979                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2980                         snprintf(pool_name, sizeof(pool_name),
2981                                 "rxmbuf_pool_%u", queue_id);
2982                         snprintf(ring_name, sizeof(ring_name),
2983                                 "rxmbuf_ring_%u", queue_id);
2984                         setup_mempool_tbl(rte_socket_id(), queue_id,
2985                                 pool_name, ring_name, nb_mbuf);
2986                 }
2987
2988                 nb_mbuf = num_tx_descriptor
2989                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2990                                 + num_switching_cores * MAX_PKT_BURST;
2991
2992                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2993                         snprintf(pool_name, sizeof(pool_name),
2994                                 "txmbuf_pool_%u", queue_id);
2995                         snprintf(ring_name, sizeof(ring_name),
2996                                 "txmbuf_ring_%u", queue_id);
2997                         setup_mempool_tbl(rte_socket_id(),
2998                                 (queue_id + MAX_QUEUES),
2999                                 pool_name, ring_name, nb_mbuf);
3000                 }
3001
3002                 if (vm2vm_mode == VM2VM_HARDWARE) {
3003                         /* Enable VT loop back to let L2 switch to do it. */
3004                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3005                         LOG_DEBUG(VHOST_CONFIG,
3006                                 "Enable loop back for L2 switch in vmdq.\n");
3007                 }
3008         }
3009         /* Set log level. */
3010         rte_set_log_level(LOG_LEVEL);
3011
3012         /* initialize all ports */
3013         for (portid = 0; portid < nb_ports; portid++) {
3014                 /* skip ports that are not enabled */
3015                 if ((enabled_port_mask & (1 << portid)) == 0) {
3016                         RTE_LOG(INFO, VHOST_PORT,
3017                                 "Skipping disabled port %d\n", portid);
3018                         continue;
3019                 }
3020                 if (port_init(portid) != 0)
3021                         rte_exit(EXIT_FAILURE,
3022                                 "Cannot initialize network ports\n");
3023         }
3024
3025         /* Initialise all linked lists. */
3026         if (init_data_ll() == -1)
3027                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3028
3029         /* Initialize device stats */
3030         memset(&dev_statistics, 0, sizeof(dev_statistics));
3031
3032         /* Enable stats if the user option is set. */
3033         if (enable_stats)
3034                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3035
3036         /* Launch all data cores. */
3037         if (zero_copy == 0) {
3038                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3039                         rte_eal_remote_launch(switch_worker,
3040                                 mbuf_pool, lcore_id);
3041                 }
3042         } else {
3043                 uint32_t count_in_mempool, index, i;
3044                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3045                         /* For all RX and TX queues. */
3046                         count_in_mempool
3047                                 = rte_mempool_count(vpool_array[index].pool);
3048
3049                         /*
3050                          * Transfer all un-attached mbufs from vpool.pool
3051                          * to vpool.ring.
3052                          */
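                        /*
                         * __rte_mbuf_raw_alloc() pulls mbufs straight from
                         * the pool without per-packet initialisation; they
                         * stay parked in the ring until attach_rxmbuf_zcp()
                         * later attaches guest buffers to them.
                         */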
3053                         for (i = 0; i < count_in_mempool; i++) {
3054                                 struct rte_mbuf *mbuf
3055                                         = __rte_mbuf_raw_alloc(
3056                                                 vpool_array[index].pool);
3057                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3058                                                 (void *)mbuf);
3059                         }
3060
3061                         LOG_DEBUG(VHOST_CONFIG,
3062                                 "in main: initial mbuf count in mempool "
3063                                 "is: %d\n", count_in_mempool);
3064                         LOG_DEBUG(VHOST_CONFIG,
3065                                 "in main: initial mbuf count in ring is:"
3066                                 " %d\n",
3067                                 rte_ring_count(vpool_array[index].ring));
3068                 }
3069
3070                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3071                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3072                                 lcore_id);
3073         }
3074
3075         if (mergeable == 0)
3076                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3077
3078         /* Register CUSE device to handle IOCTLs. */
3079         ret = rte_vhost_driver_register((char *)&dev_basename);
3080         if (ret != 0)
3081                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3082
3083         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3084
3085         /* Start CUSE session. */
3086         rte_vhost_driver_session_start();
3087         return 0;
3088 }
3090