examples/vhost: use burst enqueue and dequeue from lib
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 128
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +       \
65                             (num_switching_cores * MAX_PKT_BURST) +          \
66                             (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
67                             (num_switching_cores * MBUF_CACHE_SIZE))
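/*
 * Illustrative sizing sketch (hypothetical values, not from any particular
 * run): with MAX_QUEUES = 128, RTE_TEST_RX_DESC_DEFAULT = 1024 and
 * num_switching_cores = 2, this evaluates to
 * 128*1024 + 2*32 + 2*512 + 2*128 = 132416 mbufs per port.
 */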
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71
72 /*
73  * No frame data buffers need to be allocated from the host for the zero
74  * copy implementation: the guest allocates the frame data buffers and
75  * vhost uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79         + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100
101 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
102 #define MAX_MRG_PKT_BURST 16    /* Max burst size when using mergeable buffers. */
103 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
104
105 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
106 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
107
108 #define JUMBO_FRAME_MAX_SIZE    0x2600 /* 9728 bytes */
109
110 /* State of virtio device. */
111 #define DEVICE_MAC_LEARNING 0
112 #define DEVICE_RX                       1
113 #define DEVICE_SAFE_REMOVE      2
114
115 /* Config_core_flag status definitions. */
116 #define REQUEST_DEV_REMOVAL 1
117 #define ACK_DEV_REMOVAL 0
118
119 /* Configurable number of RX/TX ring descriptors */
120 #define RTE_TEST_RX_DESC_DEFAULT 1024
121 #define RTE_TEST_TX_DESC_DEFAULT 512
122
123 /*
124  * These two macros need refining for the legacy and DPDK-based front ends:
125  * take the max vring avail descriptors/entries from the guest, subtract
126  * MAX_PKT_BURST, and then adjust to a power of 2.
127  */
128 /*
129  * For the legacy front end, 128 descriptors:
130  * half for virtio headers, the other half for mbufs.
131  */
132 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
133 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
134
135 /* Get first 4 bytes in mbuf headroom. */
136 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
137                 + sizeof(struct rte_mbuf)))
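/*
 * Illustrative note (an assumption about intent, based on how zero-copy
 * paths typically use such a macro): the first 4 bytes of headroom can stash
 * a per-mbuf cookie such as a vring descriptor index, e.g.
 *   MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;  /* desc_idx hypothetical */
 */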
138
139 /* true if x is a power of 2 */
140 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
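/*
 * Illustrative check of the bit trick: a power of two has a single bit set,
 * so x & (x - 1) clears it to zero, e.g. 64 & 63 == 0, while 48 & 47 == 32.
 * Note that 0 also passes this test even though it is not a power of two.
 */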
141
142 #define INVALID_PORT_ID 0xFF
143
144 /* Max number of devices. Limited by vmdq. */
145 #define MAX_DEVICES 64
146
147 /* Size of buffers used for snprintfs. */
148 #define MAX_PRINT_BUFF 6072
149
150 /* Maximum character device basename size. */
151 #define MAX_BASENAME_SZ 10
152
153 /* Maximum long option length for option parsing. */
154 #define MAX_LONG_OPT_SZ 64
155
156 /* Used to compare MAC addresses. */
157 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
158
159 /* Number of descriptors per cacheline. */
160 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
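/*
 * Illustrative arithmetic (assuming a 64-byte cache line): struct vring_desc
 * is 16 bytes (u64 addr, u32 len, u16 flags, u16 next), so this evaluates to
 * 64 / 16 = 4 descriptors per cache line.
 */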
161
162 /* mask of enabled ports */
163 static uint32_t enabled_port_mask = 0;
164
165 /* Number of switching cores enabled. */
166 static uint32_t num_switching_cores = 0;
167
168 /* Number of devices/queues to support. */
169 static uint32_t num_queues = 0;
170 uint32_t num_devices = 0;
171
172 /*
173  * Enable zero copy: packet buffers are DMA'd directly to/from the guest's
174  * buffers via the hardware descriptors. Disabled by default.
175  */
176 static uint32_t zero_copy;
177
178 /* Number of descriptors to use. */
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181
182 /* Max ring descriptors; ixgbe, i40e and e1000 all support up to 4096. */
183 #define MAX_RING_DESC 4096
184
185 struct vpool {
186         struct rte_mempool *pool;
187         struct rte_ring *ring;
188         uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
190
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193         VM2VM_DISABLED = 0,
194         VM2VM_SOFTWARE = 1,
195         VM2VM_HARDWARE = 2,
196         VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202         PHYS_ADDR_CONTINUOUS = 0,
203         PHYS_ADDR_CROSS_SUBREG = 1,
204         PHYS_ADDR_INVALID = 2,
205         PHYS_ADDR_LAST
206 } hpa_type;
207
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in microseconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219
220
221 /* This can be set by the user so it is made available here. */
222 extern uint64_t VHOST_FEATURES;
223
224 /* Default configuration for rx and tx thresholds etc. */
225 static struct rte_eth_rxconf rx_conf_default = {
226         .rx_thresh = {
227                 .pthresh = RX_PTHRESH,
228                 .hthresh = RX_HTHRESH,
229                 .wthresh = RX_WTHRESH,
230         },
231         .rx_drop_en = 1,
232 };
233
234 /*
235  * These default values are optimized for use with the Intel(R) 82599 10 GbE
236  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
237  * network controllers and/or network drivers.
238  */
239 static struct rte_eth_txconf tx_conf_default = {
240         .tx_thresh = {
241                 .pthresh = TX_PTHRESH,
242                 .hthresh = TX_HTHRESH,
243                 .wthresh = TX_WTHRESH,
244         },
245         .tx_free_thresh = 0, /* Use PMD default values */
246         .tx_rs_thresh = 0, /* Use PMD default values */
247 };
248
249 /* Empty vmdq configuration structure. Filled in programmatically. */
250 static struct rte_eth_conf vmdq_conf_default = {
251         .rxmode = {
252                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
253                 .split_hdr_size = 0,
254                 .header_split   = 0, /**< Header Split disabled */
255                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
256                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
257                 /*
258                  * VLAN strip is necessary for 1G NICs such as the I350;
259                  * it fixes a bug where IPv4 forwarding in the guest could
260                  * not forward packets from one virtio device to another.
261                  */
262                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
263                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
264                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
265         },
266
267         .txmode = {
268                 .mq_mode = ETH_MQ_TX_NONE,
269         },
270         .rx_adv_conf = {
271                 /*
272                  * should be overridden separately in code with
273                  * appropriate values
274                  */
275                 .vmdq_rx_conf = {
276                         .nb_queue_pools = ETH_8_POOLS,
277                         .enable_default_pool = 0,
278                         .default_pool = 0,
279                         .nb_pool_maps = 0,
280                         .pool_map = {{0, 0},},
281                 },
282         },
283 };
284
285 static unsigned lcore_ids[RTE_MAX_LCORE];
286 static uint8_t ports[RTE_MAX_ETHPORTS];
287 static unsigned num_ports = 0; /**< The number of ports specified in command line */
288
289 static const uint16_t external_pkt_default_vlan_tag = 2000;
290 const uint16_t vlan_tags[] = {
291         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
292         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
293         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
294         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
295         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
296         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
297         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
298         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
299 };
300
301 /* ethernet addresses of ports */
302 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
303
304 /* heads for the main used and free linked lists for the data path. */
305 static struct virtio_net_data_ll *ll_root_used = NULL;
306 static struct virtio_net_data_ll *ll_root_free = NULL;
307
308 /* Array of data core structures containing information on individual core linked lists. */
309 static struct lcore_info lcore_info[RTE_MAX_LCORE];
310
311 /* Used for queueing bursts of TX packets. */
312 struct mbuf_table {
313         unsigned len;
314         unsigned txq_id;
315         struct rte_mbuf *m_table[MAX_PKT_BURST];
316 };
317
318 /* TX queue for each data core. */
319 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
320
321 /* TX queue for each virtio device for zero copy. */
322 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
323
324 /* Vlan header struct used to insert vlan tags on TX. */
325 struct vlan_ethhdr {
326         unsigned char   h_dest[ETH_ALEN];
327         unsigned char   h_source[ETH_ALEN];
328         __be16          h_vlan_proto;
329         __be16          h_vlan_TCI;
330         __be16          h_vlan_encapsulated_proto;
331 };
332
333 /* IPv4 Header */
334 struct ipv4_hdr {
335         uint8_t  version_ihl;           /**< version and header length */
336         uint8_t  type_of_service;       /**< type of service */
337         uint16_t total_length;          /**< length of packet */
338         uint16_t packet_id;             /**< packet ID */
339         uint16_t fragment_offset;       /**< fragmentation offset */
340         uint8_t  time_to_live;          /**< time to live */
341         uint8_t  next_proto_id;         /**< protocol ID */
342         uint16_t hdr_checksum;          /**< header checksum */
343         uint32_t src_addr;              /**< source address */
344         uint32_t dst_addr;              /**< destination address */
345 } __attribute__((__packed__));
346
347 /* Header lengths. */
348 #define VLAN_HLEN       4
349 #define VLAN_ETH_HLEN   18
350
351 /* Per-device statistics struct */
352 struct device_statistics {
353         uint64_t tx_total;
354         rte_atomic64_t rx_total_atomic;
355         uint64_t rx_total;
356         uint64_t tx;
357         rte_atomic64_t rx_atomic;
358         uint64_t rx;
359 } __rte_cache_aligned;
360 struct device_statistics dev_statistics[MAX_DEVICES];
361
362 /*
363  * Builds up the correct configuration for VMDQ VLAN pool map
364  * according to the pool & queue limits.
365  */
366 static inline int
367 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
368 {
369         struct rte_eth_vmdq_rx_conf conf;
370         unsigned i;
371
372         memset(&conf, 0, sizeof(conf));
373         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
374         conf.nb_pool_maps = num_devices;
375         conf.enable_loop_back =
376                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
377
378         for (i = 0; i < conf.nb_pool_maps; i++) {
379                 conf.pool_map[i].vlan_id = vlan_tags[i];
380                 conf.pool_map[i].pools = (1UL << i);
381         }
382
383         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
384         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
385                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
386         return 0;
387 }
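/*
 * Illustrative result (hypothetical, with num_devices = 4): the pool map
 * pairs vlan_tags[0..3] = 1000..1003 with pools 0..3, i.e. VLAN 1000 ->
 * pool mask 0x1, VLAN 1001 -> 0x2, VLAN 1002 -> 0x4, VLAN 1003 -> 0x8.
 */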
388
389 /*
390  * Validate the device number against the max pool number obtained from
391  * dev_info. If the device number is invalid, print an error message and
392  * return -1. Each device must have its own pool.
393  */
394 static inline int
395 validate_num_devices(uint32_t max_nb_devices)
396 {
397         if (num_devices > max_nb_devices) {
398                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
399                 return -1;
400         }
401         return 0;
402 }
403
404 /*
405  * Initialises a given port using global settings and with the rx buffers
406  * coming from the mbuf_pool passed as parameter
407  */
408 static inline int
409 port_init(uint8_t port)
410 {
411         struct rte_eth_dev_info dev_info;
412         struct rte_eth_conf port_conf;
413         uint16_t rx_rings, tx_rings;
414         uint16_t rx_ring_size, tx_ring_size;
415         int retval;
416         uint16_t q;
417
418         /* The max pool number from dev_info is used to validate the pool number specified on the command line */
419         rte_eth_dev_info_get(port, &dev_info);
420
421         /* Configure the number of supported virtio devices based on VMDQ limits */
422         num_devices = dev_info.max_vmdq_pools;
423         num_queues = dev_info.max_rx_queues;
424
425         if (zero_copy) {
426                 rx_ring_size = num_rx_descriptor;
427                 tx_ring_size = num_tx_descriptor;
428                 tx_rings = dev_info.max_tx_queues;
429         } else {
430                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
431                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
432                 tx_rings = (uint16_t)rte_lcore_count();
433         }
434
435         retval = validate_num_devices(MAX_DEVICES);
436         if (retval < 0)
437                 return retval;
438
439         /* Get port configuration. */
440         retval = get_eth_conf(&port_conf, num_devices);
441         if (retval < 0)
442                 return retval;
443
444         if (port >= rte_eth_dev_count()) return -1;
445
446         rx_rings = (uint16_t)num_queues;
447         /* Configure ethernet device. */
448         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
449         if (retval != 0)
450                 return retval;
451
452         /* Setup the queues. */
453         for (q = 0; q < rx_rings; q++) {
454                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
455                                                 rte_eth_dev_socket_id(port), &rx_conf_default,
456                                                 vpool_array[q].pool);
457                 if (retval < 0)
458                         return retval;
459         }
460         for (q = 0; q < tx_rings; q++) {
461                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
462                                                 rte_eth_dev_socket_id(port), &tx_conf_default);
463                 if (retval < 0)
464                         return retval;
465         }
466
467         /* Start the device. */
468         retval  = rte_eth_dev_start(port);
469         if (retval < 0) {
470                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
471                 return retval;
472         }
473
474         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
475         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
476         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
477                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
478                         (unsigned)port,
479                         vmdq_ports_eth_addr[port].addr_bytes[0],
480                         vmdq_ports_eth_addr[port].addr_bytes[1],
481                         vmdq_ports_eth_addr[port].addr_bytes[2],
482                         vmdq_ports_eth_addr[port].addr_bytes[3],
483                         vmdq_ports_eth_addr[port].addr_bytes[4],
484                         vmdq_ports_eth_addr[port].addr_bytes[5]);
485
486         return 0;
487 }
488
489 /*
490  * Set character device basename.
491  */
492 static int
493 us_vhost_parse_basename(const char *q_arg)
494 {
495         /* The basename must fit in the buffer, including the trailing NUL. */
496
497         if (strnlen(q_arg, MAX_BASENAME_SZ) == MAX_BASENAME_SZ)
498                 return -1;
499         else
500                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
501
502         return 0;
503 }
504
505 /*
506  * Parse the portmask provided at run time.
507  */
508 static int
509 parse_portmask(const char *portmask)
510 {
511         char *end = NULL;
512         unsigned long pm;
513
514         errno = 0;
515
516         /* parse hexadecimal string */
517         pm = strtoul(portmask, &end, 16);
518         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
519                 return 0; /* caller treats 0 as an invalid mask */
520
521         if (pm == 0)
522                 return 0;
523
524         return pm;
525
526 }
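/*
 * Illustrative usage (hypothetical command line): "-p 0x3" enables ports 0
 * and 1; parse_portmask("0x3") returns 3, and check_ports_num() later drops
 * any enabled port ID that is not present on the system.
 */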
527
528 /*
529  * Parse num options at run time.
530  */
531 static int
532 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
533 {
534         char *end = NULL;
535         unsigned long num;
536
537         errno = 0;
538
539         /* parse unsigned int string */
540         num = strtoul(q_arg, &end, 10);
541         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
542                 return -1;
543
544         if (num > max_valid_value)
545                 return -1;
546
547         return num;
548
549 }
550
551 /*
552  * Display usage
553  */
554 static void
555 us_vhost_usage(const char *prgname)
556 {
557         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
558         "               --vm2vm [0|1|2]\n"
559         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
560         "               --dev-basename <name>\n"
562         "               -p PORTMASK: Set mask for ports to be used by application\n"
563         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
564         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
565         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX, only effective when rx retries are enabled\n"
566         "               --rx-retry-num [0-N]: the number of retries on rx, only effective when rx retries are enabled\n"
567         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
568         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
569         "               --dev-basename: The basename to be used for the character device.\n"
570         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
571                         "zero copy\n"
572         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
573                         "used only when zero copy is enabled.\n"
574         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
575                         "used only when zero copy is enabled.\n",
576                prgname);
577 }
578
579 /*
580  * Parse the arguments given in the command line of the application.
581  */
582 static int
583 us_vhost_parse_args(int argc, char **argv)
584 {
585         int opt, ret;
586         int option_index;
587         unsigned i;
588         const char *prgname = argv[0];
589         static struct option long_option[] = {
590                 {"vm2vm", required_argument, NULL, 0},
591                 {"rx-retry", required_argument, NULL, 0},
592                 {"rx-retry-delay", required_argument, NULL, 0},
593                 {"rx-retry-num", required_argument, NULL, 0},
594                 {"mergeable", required_argument, NULL, 0},
595                 {"stats", required_argument, NULL, 0},
596                 {"dev-basename", required_argument, NULL, 0},
597                 {"zero-copy", required_argument, NULL, 0},
598                 {"rx-desc-num", required_argument, NULL, 0},
599                 {"tx-desc-num", required_argument, NULL, 0},
600                 {NULL, 0, 0, 0},
601         };
602
603         /* Parse command line */
604         while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
605                 switch (opt) {
606                 /* Portmask */
607                 case 'p':
608                         enabled_port_mask = parse_portmask(optarg);
609                         if (enabled_port_mask == 0) {
610                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
611                                 us_vhost_usage(prgname);
612                                 return -1;
613                         }
614                         break;
615
616                 case 0:
617                         /* Enable/disable vm2vm comms. */
618                         if (!strncmp(long_option[option_index].name, "vm2vm",
619                                 MAX_LONG_OPT_SZ)) {
620                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
621                                 if (ret == -1) {
622                                         RTE_LOG(INFO, VHOST_CONFIG,
623                                                 "Invalid argument for "
624                                                 "vm2vm [0|1|2]\n");
625                                         us_vhost_usage(prgname);
626                                         return -1;
627                                 } else {
628                                         vm2vm_mode = (vm2vm_type)ret;
629                                 }
630                         }
631
632                         /* Enable/disable retries on RX. */
633                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
634                                 ret = parse_num_opt(optarg, 1);
635                                 if (ret == -1) {
636                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
637                                         us_vhost_usage(prgname);
638                                         return -1;
639                                 } else {
640                                         enable_retry = ret;
641                                 }
642                         }
643
644                         /* Specify the retry delay time (in microseconds) on RX. */
645                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
646                                 ret = parse_num_opt(optarg, INT32_MAX);
647                                 if (ret == -1) {
648                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
649                                         us_vhost_usage(prgname);
650                                         return -1;
651                                 } else {
652                                         burst_rx_delay_time = ret;
653                                 }
654                         }
655
656                         /* Specify the retries number on RX. */
657                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
658                                 ret = parse_num_opt(optarg, INT32_MAX);
659                                 if (ret == -1) {
660                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
661                                         us_vhost_usage(prgname);
662                                         return -1;
663                                 } else {
664                                         burst_rx_retry_num = ret;
665                                 }
666                         }
667
668                         /* Enable/disable RX mergeable buffers. */
669                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
670                                 ret = parse_num_opt(optarg, 1);
671                                 if (ret == -1) {
672                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
673                                         us_vhost_usage(prgname);
674                                         return -1;
675                                 } else {
676                                         if (ret) {
677                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
678                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
679                                                         = JUMBO_FRAME_MAX_SIZE;
680                                                 VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
681                                         }
682                                 }
683                         }
684
685                         /* Enable/disable stats. */
686                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
687                                 ret = parse_num_opt(optarg, INT32_MAX);
688                                 if (ret == -1) {
689                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
690                                         us_vhost_usage(prgname);
691                                         return -1;
692                                 } else {
693                                         enable_stats = ret;
694                                 }
695                         }
696
697                         /* Set character device basename. */
698                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
699                                 if (us_vhost_parse_basename(optarg) == -1) {
700                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
701                                         us_vhost_usage(prgname);
702                                         return -1;
703                                 }
704                         }
705
706                         /* Enable/disable rx/tx zero copy. */
707                         if (!strncmp(long_option[option_index].name,
708                                 "zero-copy", MAX_LONG_OPT_SZ)) {
709                                 ret = parse_num_opt(optarg, 1);
710                                 if (ret == -1) {
711                                         RTE_LOG(INFO, VHOST_CONFIG,
712                                                 "Invalid argument"
713                                                 " for zero-copy [0|1]\n");
714                                         us_vhost_usage(prgname);
715                                         return -1;
716                                 } else
717                                         zero_copy = ret;
718
719                                 if (zero_copy) {
720 #ifdef RTE_MBUF_REFCNT
721                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
722                                         "zero copy vhost APP, please "
723                                         "disable RTE_MBUF_REFCNT\n"
724                                         "in config file and then rebuild DPDK "
725                                         "core lib!\n"
726                                         "Otherwise please disable zero copy "
727                                         "flag in command line!\n");
728                                         return -1;
729 #endif
730                                 }
731                         }
732
733                         /* Specify the descriptor number on RX. */
734                         if (!strncmp(long_option[option_index].name,
735                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
736                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
737                                 if ((ret == -1) || (!POWEROF2(ret))) {
738                                         RTE_LOG(INFO, VHOST_CONFIG,
739                                         "Invalid argument for rx-desc-num [0-N], "
740                                         "power of 2 required.\n");
741                                         us_vhost_usage(prgname);
742                                         return -1;
743                                 } else {
744                                         num_rx_descriptor = ret;
745                                 }
746                         }
747
748                         /* Specify the descriptor number on TX. */
749                         if (!strncmp(long_option[option_index].name,
750                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
751                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
752                                 if ((ret == -1) || (!POWEROF2(ret))) {
753                                         RTE_LOG(INFO, VHOST_CONFIG,
754                                         "Invalid argument for tx-desc-num [0-N], "
755                                         "power of 2 required.\n");
756                                         us_vhost_usage(prgname);
757                                         return -1;
758                                 } else {
759                                         num_tx_descriptor = ret;
760                                 }
761                         }
762
763                         break;
764
765                         /* Invalid option - print options. */
766                 default:
767                         us_vhost_usage(prgname);
768                         return -1;
769                 }
770         }
771
772         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
773                 if (enabled_port_mask & (1 << i))
774                         ports[num_ports++] = (uint8_t)i;
775         }
776
777         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
778                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
779                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
780                 return -1;
781         }
782
783         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
784                 RTE_LOG(INFO, VHOST_PORT,
785                         "Vhost zero copy doesn't support software vm2vm, "
786                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
787                 return -1;
788         }
789
790         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
791                 RTE_LOG(INFO, VHOST_PORT,
792                         "Vhost zero copy doesn't support jumbo frames, "
793                         "please specify '--mergeable 0' to disable the "
794                         "mergeable feature.\n");
795                 return -1;
796         }
797
798         return 0;
799 }
800
801 /*
802  * Update the global variable num_ports and the array ports according to the
803  * number of system ports, and return the number of valid ports.
804  */
805 static unsigned check_ports_num(unsigned nb_ports)
806 {
807         unsigned valid_num_ports = num_ports;
808         unsigned portid;
809
810         if (num_ports > nb_ports) {
811                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
812                         num_ports, nb_ports);
813                 num_ports = nb_ports;
814         }
815
816         for (portid = 0; portid < num_ports; portid ++) {
817                 if (ports[portid] >= nb_ports) {
818                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
819                                 ports[portid], (nb_ports - 1));
820                         ports[portid] = INVALID_PORT_ID;
821                         valid_num_ports--;
822                 }
823         }
824         return valid_num_ports;
825 }
826
827 /*
828  * Macro to print out packet contents. Wrapped in debug define so that the
829  * data path is not affected when debug is disabled.
830  */
831 #ifdef DEBUG
832 #define PRINT_PACKET(device, addr, size, header) do {                  \
833         char *pkt_addr = (char *)(addr);                                \
834         unsigned int index;                                             \
835         char packet[MAX_PRINT_BUFF];                                    \
836                                                                         \
837         if ((header))                                                   \
838                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
839         else                                                            \
840                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
841         for (index = 0; index < (size); index++) {                      \
842                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
843                         "%02hhx ", pkt_addr[index]);                    \
844         }                                                               \
845         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
846                                                                         \
847         LOG_DEBUG(VHOST_DATA, "%s", packet);                            \
848 } while (0)
849 #else
850 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
851 #endif
852
853 /*
854  * Function to convert guest physical addresses to vhost physical addresses.
855  * This is used to convert virtio buffer addresses.
856  */
857 static inline uint64_t __attribute__((always_inline))
858 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
859         uint32_t buf_len, hpa_type *addr_type)
860 {
861         struct virtio_memory_regions_hpa *region;
862         uint32_t regionidx;
863         uint64_t vhost_pa = 0;
864
865         *addr_type = PHYS_ADDR_INVALID;
866
867         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
868                 region = &vdev->regions_hpa[regionidx];
869                 if ((guest_pa >= region->guest_phys_address) &&
870                         (guest_pa <= region->guest_phys_address_end)) {
871                         vhost_pa = region->host_phys_addr_offset + guest_pa;
872                         if (likely((guest_pa + buf_len - 1)
873                                 <= region->guest_phys_address_end))
874                                 *addr_type = PHYS_ADDR_CONTINUOUS;
875                         else
876                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
877                         break;
878                 }
879         }
880
881         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
882                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
883                 (void *)(uintptr_t)vhost_pa);
884
885         return vhost_pa;
886 }
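/*
 * Illustrative use (hypothetical names): translate a guest buffer before
 * copying, falling back when the buffer crosses host-physical sub-regions:
 *   hpa_type t;
 *   uint64_t hpa = gpa_to_hpa(vdev, gpa, len, &t);
 *   if (t != PHYS_ADDR_CONTINUOUS)
 *           (split the copy or fall back to the non-zero-copy path)
 */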
887
888 /*
889  * Compares a packet destination MAC address to a device MAC address.
890  */
891 static inline int __attribute__((always_inline))
892 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
893 {
894         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
895 }
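/*
 * Illustrative note: both 6-byte MAC addresses are read as 8-byte words and
 * XOR'd; MAC_ADDR_CMP (0xFFFFFFFFFFFF) masks the result down to the low 48
 * bits, which on a little-endian host are exactly the 6 address bytes,
 * ignoring the 2 bytes that follow each address in memory.
 */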
896
897 /*
898  * This function learns the MAC address of the device and registers this along with a
899  * vlan tag to a VMDQ.
900  */
901 static int
902 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
903 {
904         struct ether_hdr *pkt_hdr;
905         struct virtio_net_data_ll *dev_ll;
906         struct virtio_net *dev = vdev->dev;
907         int i, ret;
908
909         /* Learn MAC address of guest device from packet */
910         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
911
912         dev_ll = ll_root_used;
913
914         while (dev_ll != NULL) {
915                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
916                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
917                         return -1;
918                 }
919                 dev_ll = dev_ll->next;
920         }
921
922         for (i = 0; i < ETHER_ADDR_LEN; i++)
923                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
924
925         /* vlan_tag currently uses the device_id. */
926         vdev->vlan_tag = vlan_tags[dev->device_fh];
927
928         /* Print out VMDQ registration info. */
929         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
930                 dev->device_fh,
931                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
932                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
933                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
934                 vdev->vlan_tag);
935
936         /* Register the MAC address. */
937         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
938         if (ret)
939                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
940                                         dev->device_fh);
941
942         /* Enable stripping of the vlan tag as we handle routing. */
943         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
944
945         /* Set device as ready for RX. */
946         vdev->ready = DEVICE_RX;
947
948         return 0;
949 }
950
951 /*
952  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
953  * queue before disabling RX on the device.
954  */
955 static inline void
956 unlink_vmdq(struct vhost_dev *vdev)
957 {
958         unsigned i = 0;
959         unsigned rx_count;
960         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
961
962         if (vdev->ready == DEVICE_RX) {
963                 /* Clear MAC and VLAN settings. */
964                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
965                 for (i = 0; i < 6; i++)
966                         vdev->mac_address.addr_bytes[i] = 0;
967
968                 vdev->vlan_tag = 0;
969
970                 /* Clear out the receive buffers. */
971                 rx_count = rte_eth_rx_burst(ports[0],
972                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
973
974                 while (rx_count) {
975                         for (i = 0; i < rx_count; i++)
976                                 rte_pktmbuf_free(pkts_burst[i]);
977
978                         rx_count = rte_eth_rx_burst(ports[0],
979                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
980                 }
981
982                 vdev->ready = DEVICE_MAC_LEARNING;
983         }
984 }
985
986 /*
987  * Check if the packet destination MAC address is for a local device. If so then put
988  * the packet on that device's RX queue. If not then return.
989  */
990 static inline unsigned __attribute__((always_inline))
991 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
992 {
993         struct virtio_net_data_ll *dev_ll;
994         struct ether_hdr *pkt_hdr;
995         uint64_t ret = 0;
996         struct virtio_net *dev = vdev->dev;
997         struct virtio_net *tdev; /* destination virtio device */
998
999         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1000
1001         /* Get the used devices list. */
1002         dev_ll = ll_root_used;
1003
1004         while (dev_ll != NULL) {
1005                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1006                                           &dev_ll->vdev->mac_address)) {
1007
1008                         /* Drop the packet if the TX packet is destined for the TX device. */
1009                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1010                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1011                                                         dev->device_fh);
1012                                 return 0;
1013                         }
1014                         tdev = dev_ll->vdev->dev;
1015
1016
1017                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1018
1019                         if (dev_ll->vdev->remove) {
1020                                 /*drop the packet if the device is marked for removal*/
1021                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1022                         } else {
1023                                 /*send the packet to the local virtio device*/
1024                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1025                                 if (enable_stats) {
1026                                         rte_atomic64_add(
1027                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1028                                         1);
1029                                         rte_atomic64_add(
1030                                         &dev_statistics[tdev->device_fh].rx_atomic,
1031                                         ret);
1032                                         dev_statistics[tdev->device_fh].tx_total++;
1033                                         dev_statistics[tdev->device_fh].tx += ret;
1034                                 }
1035                         }
1036
1037                         return 0;
1038                 }
1039                 dev_ll = dev_ll->next;
1040         }
1041
1042         return -1;
1043 }
1044
1045 /*
1046  * This function routes the TX packet to the correct interface. This may be a local device
1047  * or the physical port.
1048  */
1049 static inline void __attribute__((always_inline))
1050 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1051 {
1052         struct mbuf_table *tx_q;
1053         struct vlan_ethhdr *vlan_hdr;
1054         struct rte_mbuf **m_table;
1055         struct rte_mbuf *mbuf, *prev;
1056         unsigned len, ret, offset = 0;
1057         const uint16_t lcore_id = rte_lcore_id();
1058         struct virtio_net_data_ll *dev_ll = ll_root_used;
1059         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1060         struct virtio_net *dev = vdev->dev;
1061
1062         /* Check if the destination is a local VM. */
1063         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1064                 return;
1065
1066         if (vm2vm_mode == VM2VM_HARDWARE) {
1067                 while (dev_ll != NULL) {
1068                         if ((dev_ll->vdev->ready == DEVICE_RX)
1069                                 && ether_addr_cmp(&(pkt_hdr->d_addr),
1070                                 &dev_ll->vdev->mac_address)) {
1071                                 /*
1072                                  * Drop the packet if the TX packet is
1073                                  * destined for the TX device.
1074                                  */
1075                                 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1076                                         LOG_DEBUG(VHOST_DATA,
1077                                         "(%"PRIu64") TX: Source and destination"
1078                                         " MAC addresses are the same. Dropping "
1079                                         "packet.\n",
1080                                         dev_ll->vdev->device_fh);
1081                                         return;
1082                                 }
1083                                 offset = 4;
1084                                 vlan_tag =
1085                                 (uint16_t)
1086                                 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1087
1088                                 LOG_DEBUG(VHOST_DATA,
1089                                 "(%"PRIu64") TX: pkt to local VM device id:"
1090                                 "(%"PRIu64") vlan tag: %d.\n",
1091                                 dev->device_fh, dev_ll->vdev->dev->device_fh,
1092                                 vlan_tag);
1093
1094                                 break;
1095                         }
1096                         dev_ll = dev_ll->next;
1097                 }
1098         }
1099
1100         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1101
1102         /* Add packet to the port TX queue. */
1103         tx_q = &lcore_tx_queue[lcore_id];
1104         len = tx_q->len;
1105
1106         /* Allocate an mbuf and populate the structure. */
1107         mbuf = rte_pktmbuf_alloc(mbuf_pool);
1108         if (unlikely(mbuf == NULL)) {
1109                 RTE_LOG(ERR, VHOST_DATA,
1110                         "Failed to allocate memory for mbuf.\n");
1111                 return;
1112         }
1113
1114         mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1115         mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1116         mbuf->nb_segs = m->nb_segs;
1117
1118         /* Copy ethernet header to mbuf. */
1119         rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1120                 rte_pktmbuf_mtod(m, const void *),
1121                 ETH_HLEN);
1122
1123
1124         /* Set up the VLAN header. Bytes are reordered to network order with htons(). */
1125         vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1126         vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1127         vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1128         vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1129
1130         /* Copy the remaining packet contents to the mbuf. */
1131         rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1132                 (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1133                 (m->data_len - ETH_HLEN));
1134
1135         /* Copy the remaining segments for the whole packet. */
1136         prev = mbuf;
1137         while (m->next) {
1138                 /* Allocate an mbuf and populate the structure. */
1139                 struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1140                 if (unlikely(next_mbuf == NULL)) {
1141                         rte_pktmbuf_free(mbuf);
1142                         RTE_LOG(ERR, VHOST_DATA,
1143                                 "Failed to allocate memory for mbuf.\n");
1144                         return;
1145                 }
1146
1147                 m = m->next;
1148                 prev->next = next_mbuf;
1149                 prev = next_mbuf;
1150                 next_mbuf->data_len = m->data_len;
1151
1152                 /* Copy data to next mbuf. */
1153                 rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1154                         rte_pktmbuf_mtod(m, const void *), m->data_len);
1155         }
1156
1157         tx_q->m_table[len] = mbuf;
1158         len++;
1159         if (enable_stats) {
1160                 dev_statistics[dev->device_fh].tx_total++;
1161                 dev_statistics[dev->device_fh].tx++;
1162         }
1163
1164         if (unlikely(len == MAX_PKT_BURST)) {
1165                 m_table = (struct rte_mbuf **)tx_q->m_table;
1166                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1167                 /* Free any buffers not handled by TX and update the port stats. */
1168                 if (unlikely(ret < len)) {
1169                         do {
1170                                 rte_pktmbuf_free(m_table[ret]);
1171                         } while (++ret < len);
1172                 }
1173
1174                 len = 0;
1175         }
1176
1177         tx_q->len = len;
1178         return;
1179 }
1180 /*
1181  * This function is called by each data core. It handles all RX/TX registered with the
1182  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1183  * with all devices in the main linked list.
1184  */
1185 static int
1186 switch_worker(void *arg)
1187 {
1188         struct rte_mempool *mbuf_pool = arg;
1189         struct virtio_net *dev = NULL;
1190         struct vhost_dev *vdev = NULL;
1191         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1192         struct virtio_net_data_ll *dev_ll;
1193         struct mbuf_table *tx_q;
1194         volatile struct lcore_ll_info *lcore_ll;
1195         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1196         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1197         unsigned ret, i;
1198         const uint16_t lcore_id = rte_lcore_id();
1199         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1200         uint16_t rx_count = 0;
1201         uint16_t tx_count;
1202         uint32_t retry = 0;
1203
1204         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1205         lcore_ll = lcore_info[lcore_id].lcore_ll;
1206         prev_tsc = 0;
1207
1208         tx_q = &lcore_tx_queue[lcore_id];
1209         for (i = 0; i < num_cores; i++) {
1210                 if (lcore_ids[i] == lcore_id) {
1211                         tx_q->txq_id = i;
1212                         break;
1213                 }
1214         }
1215
1216         while (1) {
1217                 cur_tsc = rte_rdtsc();
1218                 /*
1219                  * TX burst queue drain
1220                  */
1221                 diff_tsc = cur_tsc - prev_tsc;
1222                 if (unlikely(diff_tsc > drain_tsc)) {
1223
1224                         if (tx_q->len) {
1225                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);
1226
1227                                 /*Tx any packets in the queue*/
1228                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1229                                                                            (struct rte_mbuf **)tx_q->m_table,
1230                                                                            (uint16_t)tx_q->len);
1231                                 if (unlikely(ret < tx_q->len)) {
1232                                         do {
1233                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1234                                         } while (++ret < tx_q->len);
1235                                 }
1236
1237                                 tx_q->len = 0;
1238                         }
1239
1240                         prev_tsc = cur_tsc;
1241
1242                 }
1243
1244                 rte_prefetch0(lcore_ll->ll_root_used);
1245                 /*
1246                  * Inform the configuration core that we have exited the linked list and that no devices are
1247                  * in use if requested.
1248                  */
1249                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1250                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1251
1252                 /*
1253                  * Process devices
1254                  */
1255                 dev_ll = lcore_ll->ll_root_used;
1256
1257                 while (dev_ll != NULL) {
1258                         /*get virtio device ID*/
1259                         vdev = dev_ll->vdev;
1260                         dev = vdev->dev;
1261
1262                         if (vdev->remove) {
1263                                 dev_ll = dev_ll->next;
1264                                 unlink_vmdq(vdev);
1265                                 vdev->ready = DEVICE_SAFE_REMOVE;
1266                                 continue;
1267                         }
1268                         if (likely(vdev->ready == DEVICE_RX)) {
1269                                 /*Handle guest RX*/
1270                                 rx_count = rte_eth_rx_burst(ports[0],
1271                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1272
1273                                 if (rx_count) {
1274                                         /*
1275                                         * Retry is enabled and the queue is full then we wait and retry to avoid packet loss
1276                                         * Here MAX_PKT_BURST must be less than virtio queue size
1277                                         */
1278                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1279                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1280                                                         rte_delay_us(burst_rx_delay_time);
1281                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1282                                                                 break;
1283                                                 }
1284                                         }
1285                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1286                                         if (enable_stats) {
1287                                                 rte_atomic64_add(
1288                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1289                                                 rx_count);
1290                                                 rte_atomic64_add(
1291                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1292                                         }
1293                                         while (likely(rx_count)) {
1294                                                 rx_count--;
1295                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1296                                         }
1297
1298                                 }
1299                         }
1300
1301                         if (!vdev->remove) {
1302                                 /* Handle guest TX*/
1303                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1304                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1305                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1306                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1307                                                 while (tx_count--)
1308                                                         rte_pktmbuf_free(pkts_burst[tx_count]);
1309                                         }
1310                                 }
1311                                 while (tx_count)
1312                                         virtio_tx_route(vdev, pkts_burst[--tx_count], mbuf_pool, (uint16_t)dev->device_fh);
1313                         }
1314
1315                         /*move to the next device in the list*/
1316                         dev_ll = dev_ll->next;
1317                 }
1318         }
1319
1320         return 0;
1321 }
1322
1323 /*
1324  * This function gets available ring number for zero copy rx.
1325  * Only one thread will call this funciton for a paticular virtio device,
1326  * so, it is designed as non-thread-safe function.
1327  */
1328 static inline uint32_t __attribute__((always_inline))
1329 get_available_ring_num_zcp(struct virtio_net *dev)
1330 {
1331         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1332         uint16_t avail_idx;
1333
1334         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1335         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1336 }
1337
1338 /*
1339  * This function gets available ring index for zero copy rx,
1340  * it will retry 'burst_rx_retry_num' times till it get enough ring index.
1341  * Only one thread will call this funciton for a paticular virtio device,
1342  * so, it is designed as non-thread-safe function.
1343  */
1344 static inline uint32_t __attribute__((always_inline))
1345 get_available_ring_index_zcp(struct virtio_net *dev,
1346         uint16_t *res_base_idx, uint32_t count)
1347 {
1348         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1349         uint16_t avail_idx;
1350         uint32_t retry = 0;
1351         uint16_t free_entries;
1352
1353         *res_base_idx = vq->last_used_idx_res;
1354         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1355         free_entries = (avail_idx - *res_base_idx);
1356
1357         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1358                         "avail idx: %d, "
1359                         "res base idx:%d, free entries:%d\n",
1360                         dev->device_fh, avail_idx, *res_base_idx,
1361                         free_entries);
1362
1363         /*
1364          * If retry is enabled and the queue is full then we wait
1365          * and retry to avoid packet loss.
1366          */
1367         if (enable_retry && unlikely(count > free_entries)) {
1368                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1369                         rte_delay_us(burst_rx_delay_time);
1370                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1371                         free_entries = (avail_idx - *res_base_idx);
1372                         if (count <= free_entries)
1373                                 break;
1374                 }
1375         }
1376
1377         /*check that we have enough buffers*/
1378         if (unlikely(count > free_entries))
1379                 count = free_entries;
1380
1381         if (unlikely(count == 0)) {
1382                 LOG_DEBUG(VHOST_DATA,
1383                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1384                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1385                         dev->device_fh, avail_idx,
1386                         *res_base_idx, free_entries);
1387                 return 0;
1388         }
1389
1390         vq->last_used_idx_res = *res_base_idx + count;
1391
1392         return count;
1393 }
1394
1395 /*
1396  * This function put descriptor back to used list.
1397  */
1398 static inline void __attribute__((always_inline))
1399 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1400 {
1401         uint16_t res_cur_idx = vq->last_used_idx;
1402         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1403         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1404         rte_compiler_barrier();
1405         *(volatile uint16_t *)&vq->used->idx += 1;
1406         vq->last_used_idx += 1;
1407
1408         /* Kick the guest if necessary. */
1409         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1410                 eventfd_write((int)vq->kickfd, 1);
1411 }
1412
1413 /*
1414  * This function get available descriptor from vitio vring and un-attached mbuf
1415  * from vpool->ring, and then attach them together. It needs adjust the offset
1416  * for buff_addr and phys_addr accroding to PMD implementation, otherwise the
1417  * frame data may be put to wrong location in mbuf.
1418  */
1419 static inline void __attribute__((always_inline))
1420 attach_rxmbuf_zcp(struct virtio_net *dev)
1421 {
1422         uint16_t res_base_idx, desc_idx;
1423         uint64_t buff_addr, phys_addr;
1424         struct vhost_virtqueue *vq;
1425         struct vring_desc *desc;
1426         struct rte_mbuf *mbuf = NULL;
1427         struct vpool *vpool;
1428         hpa_type addr_type;
1429         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1430
1431         vpool = &vpool_array[vdev->vmdq_rx_q];
1432         vq = dev->virtqueue[VIRTIO_RXQ];
1433
1434         do {
1435                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1436                                 1) != 1))
1437                         return;
1438                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1439
1440                 desc = &vq->desc[desc_idx];
1441                 if (desc->flags & VRING_DESC_F_NEXT) {
1442                         desc = &vq->desc[desc->next];
1443                         buff_addr = gpa_to_vva(dev, desc->addr);
1444                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1445                                         &addr_type);
1446                 } else {
1447                         buff_addr = gpa_to_vva(dev,
1448                                         desc->addr + vq->vhost_hlen);
1449                         phys_addr = gpa_to_hpa(vdev,
1450                                         desc->addr + vq->vhost_hlen,
1451                                         desc->len, &addr_type);
1452                 }
1453
1454                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1455                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1456                                 " address found when attaching RX frame buffer"
1457                                 " address!\n", dev->device_fh);
1458                         put_desc_to_used_list_zcp(vq, desc_idx);
1459                         continue;
1460                 }
1461
1462                 /*
1463                  * Check if the frame buffer address from guest crosses
1464                  * sub-region or not.
1465                  */
1466                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1467                         RTE_LOG(ERR, VHOST_DATA,
1468                                 "(%"PRIu64") Frame buffer address cross "
1469                                 "sub-regioin found when attaching RX frame "
1470                                 "buffer address!\n",
1471                                 dev->device_fh);
1472                         put_desc_to_used_list_zcp(vq, desc_idx);
1473                         continue;
1474                 }
1475         } while (unlikely(phys_addr == 0));
1476
1477         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1478         if (unlikely(mbuf == NULL)) {
1479                 LOG_DEBUG(VHOST_DATA,
1480                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1481                         "ring_sc_dequeue fail.\n",
1482                         dev->device_fh);
1483                 put_desc_to_used_list_zcp(vq, desc_idx);
1484                 return;
1485         }
1486
1487         if (unlikely(vpool->buf_size > desc->len)) {
1488                 LOG_DEBUG(VHOST_DATA,
1489                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1490                         "length(%d) of descriptor idx: %d less than room "
1491                         "size required: %d\n",
1492                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1493                 put_desc_to_used_list_zcp(vq, desc_idx);
1494                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1495                 return;
1496         }
1497
1498         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1499         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1500         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1501         mbuf->data_len = desc->len;
1502         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1503
1504         LOG_DEBUG(VHOST_DATA,
1505                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1506                 "descriptor idx:%d\n",
1507                 dev->device_fh, res_base_idx, desc_idx);
1508
1509         __rte_mbuf_raw_free(mbuf);
1510
1511         return;
1512 }
1513
1514 /*
1515  * Detach an attched packet mbuf -
1516  *  - restore original mbuf address and length values.
1517  *  - reset pktmbuf data and data_len to their default values.
1518  *  All other fields of the given packet mbuf will be left intact.
1519  *
1520  * @param m
1521  *   The attached packet mbuf.
1522  */
1523 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1524 {
1525         const struct rte_mempool *mp = m->pool;
1526         void *buf = RTE_MBUF_TO_BADDR(m);
1527         uint32_t buf_ofs;
1528         uint32_t buf_len = mp->elt_size - sizeof(*m);
1529         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1530
1531         m->buf_addr = buf;
1532         m->buf_len = (uint16_t)buf_len;
1533
1534         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1535                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1536         m->data_off = buf_ofs;
1537
1538         m->data_len = 0;
1539 }
1540
1541 /*
1542  * This function is called after packets have been transimited. It fetchs mbuf
1543  * from vpool->pool, detached it and put into vpool->ring. It also update the
1544  * used index and kick the guest if necessary.
1545  */
1546 static inline uint32_t __attribute__((always_inline))
1547 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1548 {
1549         struct rte_mbuf *mbuf;
1550         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1551         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1552         uint32_t index = 0;
1553         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1554
1555         LOG_DEBUG(VHOST_DATA,
1556                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1557                 "clean is: %d\n",
1558                 dev->device_fh, mbuf_count);
1559         LOG_DEBUG(VHOST_DATA,
1560                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1561                 "clean  is : %d\n",
1562                 dev->device_fh, rte_ring_count(vpool->ring));
1563
1564         for (index = 0; index < mbuf_count; index++) {
1565                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1566                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1567                         pktmbuf_detach_zcp(mbuf);
1568                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1569
1570                 /* Update used index buffer information. */
1571                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1572                 vq->used->ring[used_idx].len = 0;
1573
1574                 used_idx = (used_idx + 1) & (vq->size - 1);
1575         }
1576
1577         LOG_DEBUG(VHOST_DATA,
1578                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1579                 "clean is: %d\n",
1580                 dev->device_fh, rte_mempool_count(vpool->pool));
1581         LOG_DEBUG(VHOST_DATA,
1582                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1583                 "clean  is : %d\n",
1584                 dev->device_fh, rte_ring_count(vpool->ring));
1585         LOG_DEBUG(VHOST_DATA,
1586                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1587                 "vq->last_used_idx:%d\n",
1588                 dev->device_fh, vq->last_used_idx);
1589
1590         vq->last_used_idx += mbuf_count;
1591
1592         LOG_DEBUG(VHOST_DATA,
1593                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1594                 "vq->last_used_idx:%d\n",
1595                 dev->device_fh, vq->last_used_idx);
1596
1597         rte_compiler_barrier();
1598
1599         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1600
1601         /* Kick guest if required. */
1602         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1603                 eventfd_write((int)vq->kickfd, 1);
1604
1605         return 0;
1606 }
1607
1608 /*
1609  * This function is called when a virtio device is destroy.
1610  * It fetchs mbuf from vpool->pool, and detached it, and put into vpool->ring.
1611  */
1612 static void mbuf_destroy_zcp(struct vpool *vpool)
1613 {
1614         struct rte_mbuf *mbuf = NULL;
1615         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1616
1617         LOG_DEBUG(VHOST_CONFIG,
1618                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1619                 "mbuf_destroy_zcp is: %d\n",
1620                 mbuf_count);
1621         LOG_DEBUG(VHOST_CONFIG,
1622                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1623                 "mbuf_destroy_zcp  is : %d\n",
1624                 rte_ring_count(vpool->ring));
1625
1626         for (index = 0; index < mbuf_count; index++) {
1627                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1628                 if (likely(mbuf != NULL)) {
1629                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1630                                 pktmbuf_detach_zcp(mbuf);
1631                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1632                 }
1633         }
1634
1635         LOG_DEBUG(VHOST_CONFIG,
1636                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1637                 "mbuf_destroy_zcp is: %d\n",
1638                 rte_mempool_count(vpool->pool));
1639         LOG_DEBUG(VHOST_CONFIG,
1640                 "in mbuf_destroy_zcp: mbuf count in ring after "
1641                 "mbuf_destroy_zcp is : %d\n",
1642                 rte_ring_count(vpool->ring));
1643 }
1644
1645 /*
1646  * This function update the use flag and counter.
1647  */
1648 static inline uint32_t __attribute__((always_inline))
1649 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1650         uint32_t count)
1651 {
1652         struct vhost_virtqueue *vq;
1653         struct vring_desc *desc;
1654         struct rte_mbuf *buff;
1655         /* The virtio_hdr is initialised to 0. */
1656         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1657                 = {{0, 0, 0, 0, 0, 0}, 0};
1658         uint64_t buff_hdr_addr = 0;
1659         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1660         uint32_t head_idx, packet_success = 0;
1661         uint16_t res_cur_idx;
1662
1663         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1664
1665         if (count == 0)
1666                 return 0;
1667
1668         vq = dev->virtqueue[VIRTIO_RXQ];
1669         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1670
1671         res_cur_idx = vq->last_used_idx;
1672         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1673                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1674
1675         /* Retrieve all of the head indexes first to avoid caching issues. */
1676         for (head_idx = 0; head_idx < count; head_idx++)
1677                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1678
1679         /*Prefetch descriptor index. */
1680         rte_prefetch0(&vq->desc[head[packet_success]]);
1681
1682         while (packet_success != count) {
1683                 /* Get descriptor from available ring */
1684                 desc = &vq->desc[head[packet_success]];
1685
1686                 buff = pkts[packet_success];
1687                 LOG_DEBUG(VHOST_DATA,
1688                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1689                         "pkt[%d] descriptor idx: %d\n",
1690                         dev->device_fh, packet_success,
1691                         MBUF_HEADROOM_UINT32(buff));
1692
1693                 PRINT_PACKET(dev,
1694                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1695                         + RTE_PKTMBUF_HEADROOM),
1696                         rte_pktmbuf_data_len(buff), 0);
1697
1698                 /* Buffer address translation for virtio header. */
1699                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1700                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1701
1702                 /*
1703                  * If the descriptors are chained the header and data are
1704                  * placed in separate buffers.
1705                  */
1706                 if (desc->flags & VRING_DESC_F_NEXT) {
1707                         desc->len = vq->vhost_hlen;
1708                         desc = &vq->desc[desc->next];
1709                         desc->len = rte_pktmbuf_data_len(buff);
1710                 } else {
1711                         desc->len = packet_len;
1712                 }
1713
1714                 /* Update used ring with desc information */
1715                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1716                         = head[packet_success];
1717                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1718                         = packet_len;
1719                 res_cur_idx++;
1720                 packet_success++;
1721
1722                 /* A header is required per buffer. */
1723                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1724                         (const void *)&virtio_hdr, vq->vhost_hlen);
1725
1726                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1727
1728                 if (likely(packet_success < count)) {
1729                         /* Prefetch descriptor index. */
1730                         rte_prefetch0(&vq->desc[head[packet_success]]);
1731                 }
1732         }
1733
1734         rte_compiler_barrier();
1735
1736         LOG_DEBUG(VHOST_DATA,
1737                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1738                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1739                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1740
1741         *(volatile uint16_t *)&vq->used->idx += count;
1742         vq->last_used_idx += count;
1743
1744         LOG_DEBUG(VHOST_DATA,
1745                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1746                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1747                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1748
1749         /* Kick the guest if necessary. */
1750         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1751                 eventfd_write((int)vq->kickfd, 1);
1752
1753         return count;
1754 }
1755
1756 /*
1757  * This function routes the TX packet to the correct interface.
1758  * This may be a local device or the physical port.
1759  */
1760 static inline void __attribute__((always_inline))
1761 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1762         uint32_t desc_idx, uint8_t need_copy)
1763 {
1764         struct mbuf_table *tx_q;
1765         struct rte_mbuf **m_table;
1766         struct rte_mbuf *mbuf = NULL;
1767         unsigned len, ret, offset = 0;
1768         struct vpool *vpool;
1769         struct virtio_net_data_ll *dev_ll = ll_root_used;
1770         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1771         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1772         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1773
1774         /*Add packet to the port tx queue*/
1775         tx_q = &tx_queue_zcp[vmdq_rx_q];
1776         len = tx_q->len;
1777
1778         /* Allocate an mbuf and populate the structure. */
1779         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1780         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1781         if (unlikely(mbuf == NULL)) {
1782                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1783                 RTE_LOG(ERR, VHOST_DATA,
1784                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1785                         dev->device_fh);
1786                 put_desc_to_used_list_zcp(vq, desc_idx);
1787                 return;
1788         }
1789
1790         if (vm2vm_mode == VM2VM_HARDWARE) {
1791                 /* Avoid using a vlan tag from any vm for external pkt, such as
1792                  * vlan_tags[dev->device_fh], oterwise, it conflicts when pool
1793                  * selection, MAC address determines it as an external pkt
1794                  * which should go to network, while vlan tag determine it as
1795                  * a vm2vm pkt should forward to another vm. Hardware confuse
1796                  * such a ambiguous situation, so pkt will lost.
1797                  */
1798                 vlan_tag = external_pkt_default_vlan_tag;
1799                 while (dev_ll != NULL) {
1800                         if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1801                                 ether_addr_cmp(&(pkt_hdr->d_addr),
1802                                 &dev_ll->vdev->mac_address)) {
1803
1804                                 /*
1805                                  * Drop the packet if the TX packet is destined
1806                                  * for the TX device.
1807                                  */
1808                                 if (unlikely(dev_ll->vdev->dev->device_fh
1809                                         == dev->device_fh)) {
1810                                         LOG_DEBUG(VHOST_DATA,
1811                                         "(%"PRIu64") TX: Source and destination"
1812                                         "MAC addresses are the same. Dropping "
1813                                         "packet.\n",
1814                                         dev_ll->vdev->dev->device_fh);
1815                                         MBUF_HEADROOM_UINT32(mbuf)
1816                                                 = (uint32_t)desc_idx;
1817                                         __rte_mbuf_raw_free(mbuf);
1818                                         return;
1819                                 }
1820
1821                                 /*
1822                                  * Packet length offset 4 bytes for HW vlan
1823                                  * strip when L2 switch back.
1824                                  */
1825                                 offset = 4;
1826                                 vlan_tag =
1827                                 (uint16_t)
1828                                 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1829
1830                                 LOG_DEBUG(VHOST_DATA,
1831                                 "(%"PRIu64") TX: pkt to local VM device id:"
1832                                 "(%"PRIu64") vlan tag: %d.\n",
1833                                 dev->device_fh, dev_ll->vdev->dev->device_fh,
1834                                 vlan_tag);
1835
1836                                 break;
1837                         }
1838                         dev_ll = dev_ll->next;
1839                 }
1840         }
1841
1842         mbuf->nb_segs = m->nb_segs;
1843         mbuf->next = m->next;
1844         mbuf->data_len = m->data_len + offset;
1845         mbuf->pkt_len = mbuf->data_len;
1846         if (unlikely(need_copy)) {
1847                 /* Copy the packet contents to the mbuf. */
1848                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1849                         rte_pktmbuf_mtod(m, void *),
1850                         m->data_len);
1851         } else {
1852                 mbuf->data_off = m->data_off;
1853                 mbuf->buf_physaddr = m->buf_physaddr;
1854                 mbuf->buf_addr = m->buf_addr;
1855         }
1856         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1857         mbuf->vlan_tci = vlan_tag;
1858         mbuf->l2_len = sizeof(struct ether_hdr);
1859         mbuf->l3_len = sizeof(struct ipv4_hdr);
1860         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1861
1862         tx_q->m_table[len] = mbuf;
1863         len++;
1864
1865         LOG_DEBUG(VHOST_DATA,
1866                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1867                 dev->device_fh,
1868                 mbuf->nb_segs,
1869                 (mbuf->next == NULL) ? "null" : "non-null");
1870
1871         if (enable_stats) {
1872                 dev_statistics[dev->device_fh].tx_total++;
1873                 dev_statistics[dev->device_fh].tx++;
1874         }
1875
1876         if (unlikely(len == MAX_PKT_BURST)) {
1877                 m_table = (struct rte_mbuf **)tx_q->m_table;
1878                 ret = rte_eth_tx_burst(ports[0],
1879                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1880
1881                 /*
1882                  * Free any buffers not handled by TX and update
1883                  * the port stats.
1884                  */
1885                 if (unlikely(ret < len)) {
1886                         do {
1887                                 rte_pktmbuf_free(m_table[ret]);
1888                         } while (++ret < len);
1889                 }
1890
1891                 len = 0;
1892                 txmbuf_clean_zcp(dev, vpool);
1893         }
1894
1895         tx_q->len = len;
1896
1897         return;
1898 }
1899
1900 /*
1901  * This function TX all available packets in virtio TX queue for one
1902  * virtio-net device. If it is first packet, it learns MAC address and
1903  * setup VMDQ.
1904  */
1905 static inline void __attribute__((always_inline))
1906 virtio_dev_tx_zcp(struct virtio_net *dev)
1907 {
1908         struct rte_mbuf m;
1909         struct vhost_virtqueue *vq;
1910         struct vring_desc *desc;
1911         uint64_t buff_addr = 0, phys_addr;
1912         uint32_t head[MAX_PKT_BURST];
1913         uint32_t i;
1914         uint16_t free_entries, packet_success = 0;
1915         uint16_t avail_idx;
1916         uint8_t need_copy = 0;
1917         hpa_type addr_type;
1918         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1919
1920         vq = dev->virtqueue[VIRTIO_TXQ];
1921         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1922
1923         /* If there are no available buffers then return. */
1924         if (vq->last_used_idx_res == avail_idx)
1925                 return;
1926
1927         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1928
1929         /* Prefetch available ring to retrieve head indexes. */
1930         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1931
1932         /* Get the number of free entries in the ring */
1933         free_entries = (avail_idx - vq->last_used_idx_res);
1934
1935         /* Limit to MAX_PKT_BURST. */
1936         free_entries
1937                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1938
1939         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1940                 dev->device_fh, free_entries);
1941
1942         /* Retrieve all of the head indexes first to avoid caching issues. */
1943         for (i = 0; i < free_entries; i++)
1944                 head[i]
1945                         = vq->avail->ring[(vq->last_used_idx_res + i)
1946                         & (vq->size - 1)];
1947
1948         vq->last_used_idx_res += free_entries;
1949
1950         /* Prefetch descriptor index. */
1951         rte_prefetch0(&vq->desc[head[packet_success]]);
1952         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1953
1954         while (packet_success < free_entries) {
1955                 desc = &vq->desc[head[packet_success]];
1956
1957                 /* Discard first buffer as it is the virtio header */
1958                 desc = &vq->desc[desc->next];
1959
1960                 /* Buffer address translation. */
1961                 buff_addr = gpa_to_vva(dev, desc->addr);
1962                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1963
1964                 if (likely(packet_success < (free_entries - 1)))
1965                         /* Prefetch descriptor index. */
1966                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1967
1968                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1969                         RTE_LOG(ERR, VHOST_DATA,
1970                                 "(%"PRIu64") Invalid frame buffer address found"
1971                                 "when TX packets!\n",
1972                                 dev->device_fh);
1973                         packet_success++;
1974                         continue;
1975                 }
1976
1977                 /* Prefetch buffer address. */
1978                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1979
1980                 /*
1981                  * Setup dummy mbuf. This is copied to a real mbuf if
1982                  * transmitted out the physical port.
1983                  */
1984                 m.data_len = desc->len;
1985                 m.nb_segs = 1;
1986                 m.next = NULL;
1987                 m.data_off = 0;
1988                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1989                 m.buf_physaddr = phys_addr;
1990
1991                 /*
1992                  * Check if the frame buffer address from guest crosses
1993                  * sub-region or not.
1994                  */
1995                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1996                         RTE_LOG(ERR, VHOST_DATA,
1997                                 "(%"PRIu64") Frame buffer address cross "
1998                                 "sub-regioin found when attaching TX frame "
1999                                 "buffer address!\n",
2000                                 dev->device_fh);
2001                         need_copy = 1;
2002                 } else
2003                         need_copy = 0;
2004
2005                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2006
2007                 /*
2008                  * If this is the first received packet we need to learn
2009                  * the MAC and setup VMDQ
2010                  */
2011                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2012                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2013                                 /*
2014                                  * Discard frame if device is scheduled for
2015                                  * removal or a duplicate MAC address is found.
2016                                  */
2017                                 packet_success += free_entries;
2018                                 vq->last_used_idx += packet_success;
2019                                 break;
2020                         }
2021                 }
2022
2023                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2024                 packet_success++;
2025         }
2026 }
2027
2028 /*
2029  * This function is called by each data core. It handles all RX/TX registered
2030  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2031  * addresses are compared with all devices in the main linked list.
2032  */
2033 static int
2034 switch_worker_zcp(__attribute__((unused)) void *arg)
2035 {
2036         struct virtio_net *dev = NULL;
2037         struct vhost_dev  *vdev = NULL;
2038         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2039         struct virtio_net_data_ll *dev_ll;
2040         struct mbuf_table *tx_q;
2041         volatile struct lcore_ll_info *lcore_ll;
2042         const uint64_t drain_tsc
2043                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2044                 * BURST_TX_DRAIN_US;
2045         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2046         unsigned ret;
2047         const uint16_t lcore_id = rte_lcore_id();
2048         uint16_t count_in_ring, rx_count = 0;
2049
2050         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2051
2052         lcore_ll = lcore_info[lcore_id].lcore_ll;
2053         prev_tsc = 0;
2054
2055         while (1) {
2056                 cur_tsc = rte_rdtsc();
2057
2058                 /* TX burst queue drain */
2059                 diff_tsc = cur_tsc - prev_tsc;
2060                 if (unlikely(diff_tsc > drain_tsc)) {
2061                         /*
2062                          * Get mbuf from vpool.pool and detach mbuf and
2063                          * put back into vpool.ring.
2064                          */
2065                         dev_ll = lcore_ll->ll_root_used;
2066                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2067                                 /* Get virtio device ID */
2068                                 vdev = dev_ll->vdev;
2069                                 dev = vdev->dev;
2070
2071                                 if (likely(!vdev->remove)) {
2072                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2073                                         if (tx_q->len) {
2074                                                 LOG_DEBUG(VHOST_DATA,
2075                                                 "TX queue drained after timeout"
2076                                                 " with burst size %u\n",
2077                                                 tx_q->len);
2078
2079                                                 /*
2080                                                  * Tx any packets in the queue
2081                                                  */
2082                                                 ret = rte_eth_tx_burst(
2083                                                         ports[0],
2084                                                         (uint16_t)tx_q->txq_id,
2085                                                         (struct rte_mbuf **)
2086                                                         tx_q->m_table,
2087                                                         (uint16_t)tx_q->len);
2088                                                 if (unlikely(ret < tx_q->len)) {
2089                                                         do {
2090                                                                 rte_pktmbuf_free(
2091                                                                         tx_q->m_table[ret]);
2092                                                         } while (++ret < tx_q->len);
2093                                                 }
2094                                                 tx_q->len = 0;
2095
2096                                                 txmbuf_clean_zcp(dev,
2097                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2098                                         }
2099                                 }
2100                                 dev_ll = dev_ll->next;
2101                         }
2102                         prev_tsc = cur_tsc;
2103                 }
2104
2105                 rte_prefetch0(lcore_ll->ll_root_used);
2106
2107                 /*
2108                  * Inform the configuration core that we have exited the linked
2109                  * list and that no devices are in use if requested.
2110                  */
2111                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2112                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2113
2114                 /* Process devices */
2115                 dev_ll = lcore_ll->ll_root_used;
2116
2117                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2118                         vdev = dev_ll->vdev;
2119                         dev  = vdev->dev;
2120                         if (unlikely(vdev->remove)) {
2121                                 dev_ll = dev_ll->next;
2122                                 unlink_vmdq(vdev);
2123                                 vdev->ready = DEVICE_SAFE_REMOVE;
2124                                 continue;
2125                         }
2126
2127                         if (likely(vdev->ready == DEVICE_RX)) {
2128                                 uint32_t index = vdev->vmdq_rx_q;
2129                                 uint16_t i;
2130                                 count_in_ring
2131                                 = rte_ring_count(vpool_array[index].ring);
2132                                 uint16_t free_entries
2133                                 = (uint16_t)get_available_ring_num_zcp(dev);
2134
2135                                 /*
2136                                  * Attach all mbufs in vpool.ring and put back
2137                                  * into vpool.pool.
2138                                  */
2139                                 for (i = 0;
2140                                 i < RTE_MIN(free_entries,
2141                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2142                                 i++)
2143                                         attach_rxmbuf_zcp(dev);
2144
2145                                 /* Handle guest RX */
2146                                 rx_count = rte_eth_rx_burst(ports[0],
2147                                         vdev->vmdq_rx_q, pkts_burst,
2148                                         MAX_PKT_BURST);
2149
2150                                 if (rx_count) {
2151                                         ret_count = virtio_dev_rx_zcp(dev,
2152                                                         pkts_burst, rx_count);
2153                                         if (enable_stats) {
2154                                                 dev_statistics[dev->device_fh].rx_total
2155                                                         += rx_count;
2156                                                 dev_statistics[dev->device_fh].rx
2157                                                         += ret_count;
2158                                         }
2159                                         while (likely(rx_count)) {
2160                                                 rx_count--;
2161                                                 pktmbuf_detach_zcp(
2162                                                         pkts_burst[rx_count]);
2163                                                 rte_ring_sp_enqueue(
2164                                                         vpool_array[index].ring,
2165                                                         (void *)pkts_burst[rx_count]);
2166                                         }
2167                                 }
2168                         }
2169
2170                         if (likely(!vdev->remove))
2171                                 /* Handle guest TX */
2172                                 virtio_dev_tx_zcp(dev);
2173
2174                         /* Move to the next device in the list */
2175                         dev_ll = dev_ll->next;
2176                 }
2177         }
2178
2179         return 0;
2180 }
2181
2182
2183 /*
2184  * Add an entry to a used linked list. A free entry must first be found
2185  * in the free linked list using get_data_ll_free_entry();
2186  */
2187 static void
2188 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2189         struct virtio_net_data_ll *ll_dev)
2190 {
2191         struct virtio_net_data_ll *ll = *ll_root_addr;
2192
2193         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2194         ll_dev->next = NULL;
2195         rte_compiler_barrier();
2196
2197         /* If ll == NULL then this is the first device. */
2198         if (ll) {
2199                 /* Increment to the tail of the linked list. */
2200                 while ((ll->next != NULL) )
2201                         ll = ll->next;
2202
2203                 ll->next = ll_dev;
2204         } else {
2205                 *ll_root_addr = ll_dev;
2206         }
2207 }
2208
2209 /*
2210  * Remove an entry from a used linked list. The entry must then be added to
2211  * the free linked list using put_data_ll_free_entry().
2212  */
2213 static void
2214 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2215         struct virtio_net_data_ll *ll_dev,
2216         struct virtio_net_data_ll *ll_dev_last)
2217 {
2218         struct virtio_net_data_ll *ll = *ll_root_addr;
2219
2220         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2221                 return;
2222
2223         if (ll_dev == ll)
2224                 *ll_root_addr = ll_dev->next;
2225         else
2226                 if (likely(ll_dev_last != NULL))
2227                         ll_dev_last->next = ll_dev->next;
2228                 else
2229                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2230 }
2231
2232 /*
2233  * Find and return an entry from the free linked list.
2234  */
2235 static struct virtio_net_data_ll *
2236 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2237 {
2238         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2239         struct virtio_net_data_ll *ll_dev;
2240
2241         if (ll_free == NULL)
2242                 return NULL;
2243
2244         ll_dev = ll_free;
2245         *ll_root_addr = ll_free->next;
2246
2247         return ll_dev;
2248 }
2249
2250 /*
2251  * Place an entry back on to the free linked list.
2252  */
2253 static void
2254 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2255         struct virtio_net_data_ll *ll_dev)
2256 {
2257         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2258
2259         if (ll_dev == NULL)
2260                 return;
2261
2262         ll_dev->next = ll_free;
2263         *ll_root_addr = ll_dev;
2264 }
2265
2266 /*
2267  * Creates a linked list of a given size.
2268  */
2269 static struct virtio_net_data_ll *
2270 alloc_data_ll(uint32_t size)
2271 {
2272         struct virtio_net_data_ll *ll_new;
2273         uint32_t i;
2274
2275         /* Malloc and then chain the linked list. */
2276         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2277         if (ll_new == NULL) {
2278                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2279                 return NULL;
2280         }
2281
2282         for (i = 0; i < size - 1; i++) {
2283                 ll_new[i].vdev = NULL;
2284                 ll_new[i].next = &ll_new[i+1];
2285         }
2286         ll_new[i].next = NULL;
2287
2288         return (ll_new);
2289 }
2290
2291 /*
2292  * Create the main linked list along with each individual cores linked list. A used and a free list
2293  * are created to manage entries.
2294  */
2295 static int
2296 init_data_ll (void)
2297 {
2298         int lcore;
2299
2300         RTE_LCORE_FOREACH_SLAVE(lcore) {
2301                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2302                 if (lcore_info[lcore].lcore_ll == NULL) {
2303                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2304                         return -1;
2305                 }
2306
2307                 lcore_info[lcore].lcore_ll->device_num = 0;
2308                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2309                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2310                 if (num_devices % num_switching_cores)
2311                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2312                 else
2313                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2314         }
2315
2316         /* Allocate devices up to a maximum of MAX_DEVICES. */
2317         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2318
2319         return 0;
2320 }
2321
2322 /*
2323  * Set virtqueue flags so that we do not receive interrupts.
2324  */
2325 static void
2326 set_irq_status (struct virtio_net *dev)
2327 {
2328         dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2329         dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2330 }
2331
2332 /*
2333  * Remove a device from the specific data core linked list and from the main linked list. Synchonization
2334  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2335  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2336  */
2337 static void
2338 destroy_device (volatile struct virtio_net *dev)
2339 {
2340         struct virtio_net_data_ll *ll_lcore_dev_cur;
2341         struct virtio_net_data_ll *ll_main_dev_cur;
2342         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2343         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2344         struct vhost_dev *vdev;
2345         int lcore;
2346
2347         dev->flags &= ~VIRTIO_DEV_RUNNING;
2348
2349         vdev = (struct vhost_dev *)dev->priv;
2350         /*set the remove flag. */
2351         vdev->remove = 1;
2352         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2353                 rte_pause();
2354         }
2355
2356         /* Search for entry to be removed from lcore ll */
2357         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2358         while (ll_lcore_dev_cur != NULL) {
2359                 if (ll_lcore_dev_cur->vdev == vdev) {
2360                         break;
2361                 } else {
2362                         ll_lcore_dev_last = ll_lcore_dev_cur;
2363                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2364                 }
2365         }
2366
2367         if (ll_lcore_dev_cur == NULL) {
2368                 RTE_LOG(ERR, VHOST_CONFIG,
2369                         "(%"PRIu64") Failed to find the dev to be destroy.\n",
2370                         dev->device_fh);
2371                 return;
2372         }
2373
2374         /* Search for entry to be removed from main ll */
2375         ll_main_dev_cur = ll_root_used;
2376         ll_main_dev_last = NULL;
2377         while (ll_main_dev_cur != NULL) {
2378                 if (ll_main_dev_cur->vdev == vdev) {
2379                         break;
2380                 } else {
2381                         ll_main_dev_last = ll_main_dev_cur;
2382                         ll_main_dev_cur = ll_main_dev_cur->next;
2383                 }
2384         }
2385
2386         /* Remove entries from the lcore and main ll. */
2387         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2388         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2389
2390         /* Set the dev_removal_flag on each lcore. */
2391         RTE_LCORE_FOREACH_SLAVE(lcore) {
2392                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2393         }
2394
2395         /*
2396          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2397          * they can no longer access the device removed from the linked lists and that the devices
2398          * are no longer in use.
2399          */
2400         RTE_LCORE_FOREACH_SLAVE(lcore) {
2401                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2402                         rte_pause();
2403                 }
2404         }
2405
2406         /* Add the entries back to the lcore and main free ll.*/
2407         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2408         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2409
2410         /* Decrement number of device on the lcore. */
2411         lcore_info[vdev->coreid].lcore_ll->device_num--;
2412
2413         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2414
2415         if (zero_copy) {
2416                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2417
2418                 /* Stop the RX queue. */
2419                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2420                         LOG_DEBUG(VHOST_CONFIG,
2421                                 "(%"PRIu64") In destroy_device: Failed to stop "
2422                                 "rx queue:%d\n",
2423                                 dev->device_fh,
2424                                 vdev->vmdq_rx_q);
2425                 }
2426
2427                 LOG_DEBUG(VHOST_CONFIG,
2428                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2429                         "mempool back to ring for RX queue: %d\n",
2430                         dev->device_fh, vdev->vmdq_rx_q);
2431
2432                 mbuf_destroy_zcp(vpool);
2433
2434                 /* Stop the TX queue. */
2435                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2436                         LOG_DEBUG(VHOST_CONFIG,
2437                                 "(%"PRIu64") In destroy_device: Failed to "
2438                                 "stop tx queue:%d\n",
2439                                 dev->device_fh, vdev->vmdq_rx_q);
2440                 }
2441
2442                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2443
2444                 LOG_DEBUG(VHOST_CONFIG,
2445                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2446                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2447                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2448                         dev->device_fh);
2449
2450                 mbuf_destroy_zcp(vpool);
2451                 rte_free(vdev->regions_hpa);
2452         }
2453         rte_free(vdev);
2454
2455 }
2456
2457 /*
2458  * Calculate the region count of physical continous regions for one particular
2459  * region of whose vhost virtual address is continous. The particular region
2460  * start from vva_start, with size of 'size' in argument.
2461  */
2462 static uint32_t
2463 check_hpa_regions(uint64_t vva_start, uint64_t size)
2464 {
2465         uint32_t i, nregions = 0, page_size = getpagesize();
2466         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2467         if (vva_start % page_size) {
2468                 LOG_DEBUG(VHOST_CONFIG,
2469                         "in check_countinous: vva start(%p) mod page_size(%d) "
2470                         "has remainder\n",
2471                         (void *)(uintptr_t)vva_start, page_size);
2472                 return 0;
2473         }
2474         if (size % page_size) {
2475                 LOG_DEBUG(VHOST_CONFIG,
2476                         "in check_countinous: "
2477                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2478                         size, page_size);
2479                 return 0;
2480         }
2481         for (i = 0; i < size - page_size; i = i + page_size) {
2482                 cur_phys_addr
2483                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2484                 next_phys_addr = rte_mem_virt2phy(
2485                         (void *)(uintptr_t)(vva_start + i + page_size));
2486                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2487                         ++nregions;
2488                         LOG_DEBUG(VHOST_CONFIG,
2489                                 "in check_continuous: hva addr:(%p) is not "
2490                                 "continuous with hva addr:(%p), diff:%d\n",
2491                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2492                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2493                                 + page_size), page_size);
2494                         LOG_DEBUG(VHOST_CONFIG,
2495                                 "in check_continuous: hpa addr:(%p) is not "
2496                                 "continuous with hpa addr:(%p), "
2497                                 "diff:(%"PRIu64")\n",
2498                                 (void *)(uintptr_t)cur_phys_addr,
2499                                 (void *)(uintptr_t)next_phys_addr,
2500                                 (next_phys_addr-cur_phys_addr));
2501                 }
2502         }
2503         return nregions;
2504 }
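/*
 * A worked example with illustrative numbers, assuming a 4 KiB page
 * size: for a 16 KiB contiguous VA region whose four pages map to HPAs
 * 0x1000, 0x2000, 0x9000 and 0xa000, the only break is between the
 * second and third page, so check_hpa_regions() returns 1. new_device()
 * adds that to the base count of one region, giving two sub-regions.
 */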
2505
2506 /*
2507  * Divide each region whose vhost virtual address range is contiguous into
2508  * sub-regions so that the host physical addresses within each sub-region
2509  * are contiguous, and fill the offset (relative to the GPA), size and
2510  * other information of each sub-region into regions_hpa.
2511  */
2512 static uint32_t
2513 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2514 {
2515         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2516         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2517
2518         if (mem_region_hpa == NULL)
2519                 return 0;
2520
2521         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2522                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2523                         virtio_memory->regions[regionidx].address_offset;
2524                 mem_region_hpa[regionidx_hpa].guest_phys_address
2525                         = virtio_memory->regions[regionidx].guest_phys_address;
2526                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2527                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2528                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2529                 LOG_DEBUG(VHOST_CONFIG,
2530                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2531                         regionidx_hpa,
2532                         (void *)(uintptr_t)
2533                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2534                 LOG_DEBUG(VHOST_CONFIG,
2535                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2536                         regionidx_hpa,
2537                         (void *)(uintptr_t)
2538                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2539                 for (i = 0, k = 0;
2540                         i < virtio_memory->regions[regionidx].memory_size -
2541                                 page_size;
2542                         i += page_size) {
2543                         cur_phys_addr = rte_mem_virt2phy(
2544                                         (void *)(uintptr_t)(vva_start + i));
2545                         next_phys_addr = rte_mem_virt2phy(
2546                                         (void *)(uintptr_t)(vva_start +
2547                                         i + page_size));
2548                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2549                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2550                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2551                                         k + page_size;
2552                                 mem_region_hpa[regionidx_hpa].memory_size
2553                                         = k + page_size;
2554                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2555                                         "phys addr end  [%d]:(%p)\n",
2556                                         regionidx_hpa,
2557                                         (void *)(uintptr_t)
2558                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2559                                 LOG_DEBUG(VHOST_CONFIG,
2560                                         "in fill_hpa_regions: guest phys addr "
2561                                         "size [%d]:(%p)\n",
2562                                         regionidx_hpa,
2563                                         (void *)(uintptr_t)
2564                                         (mem_region_hpa[regionidx_hpa].memory_size));
2565                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2566                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2567                                 ++regionidx_hpa;
2568                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2569                                         next_phys_addr -
2570                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2571                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2572                                         " phys addr start[%d]:(%p)\n",
2573                                         regionidx_hpa,
2574                                         (void *)(uintptr_t)
2575                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2576                                 LOG_DEBUG(VHOST_CONFIG,
2577                                         "in fill_hpa_regions: host  phys addr "
2578                                         "start[%d]:(%p)\n",
2579                                         regionidx_hpa,
2580                                         (void *)(uintptr_t)
2581                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2582                                 k = 0;
2583                         } else {
2584                                 k += page_size;
2585                         }
2586                 }
2587                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2588                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2589                         + k + page_size;
2590                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2591                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2592                         "[%d]:(%p)\n", regionidx_hpa,
2593                         (void *)(uintptr_t)
2594                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2595                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2596                         "[%d]:(%p)\n", regionidx_hpa,
2597                         (void *)(uintptr_t)
2598                         (mem_region_hpa[regionidx_hpa].memory_size));
2599                 ++regionidx_hpa;
2600         }
2601         return regionidx_hpa;
2602 }
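/*
 * Once the table is filled, translating a guest physical address gpa
 * that falls inside sub-region r is a single addition. A sketch of the
 * lookup, using only the fields populated above:
 *
 *      if (gpa >= mem_region_hpa[r].guest_phys_address &&
 *          gpa < mem_region_hpa[r].guest_phys_address_end)
 *              hpa = gpa + mem_region_hpa[r].host_phys_addr_offset;
 */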
2603
2604 /*
2605  * A new device is added to a data core. First the device is added to the main linked list
2606  * and then allocated to a specific data core.
2607  */
2608 static int
2609 new_device(struct virtio_net *dev)
2610 {
2611         struct virtio_net_data_ll *ll_dev;
2612         int lcore, core_add = 0;
2613         uint32_t device_num_min = num_devices;
2614         struct vhost_dev *vdev;
2615         uint32_t regionidx;
2616
2617         vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2618         if (vdev == NULL) {
2619                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2620                         dev->device_fh);
2621                 return -1;
2622         }
2623         vdev->dev = dev;
2624         dev->priv = vdev;
2625
2626         if (zero_copy) {
2627                 vdev->nregions_hpa = dev->mem->nregions;
2628                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2629                         vdev->nregions_hpa
2630                                 += check_hpa_regions(
2631                                         dev->mem->regions[regionidx].guest_phys_address
2632                                         + dev->mem->regions[regionidx].address_offset,
2633                                         dev->mem->regions[regionidx].memory_size);
2634
2635                 }
2636
2637                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2638                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2639                         CACHE_LINE_SIZE);
2640                 if (vdev->regions_hpa == NULL) {
2641                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2642                         rte_free(vdev);
2643                         return -1;
2644                 }
2645
2646
2647                 if (fill_hpa_memory_regions(
2648                         vdev->regions_hpa, dev->mem
2649                         ) != vdev->nregions_hpa) {
2650
2651                         RTE_LOG(ERR, VHOST_CONFIG,
2652                                 "hpa memory regions number mismatch: "
2653                                 "[%d]\n", vdev->nregions_hpa);
2654                         rte_free(vdev->regions_hpa);
2655                         rte_free(vdev);
2656                         return -1;
2657                 }
2658         }
2659
2660
2661         /* Add device to main ll */
2662         ll_dev = get_data_ll_free_entry(&ll_root_free);
2663         if (ll_dev == NULL) {
2664                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2665                         "of %d devices per core has been reached\n",
2666                         dev->device_fh, num_devices);
2667                 if (vdev->regions_hpa)
2668                         rte_free(vdev->regions_hpa);
2669                 rte_free(vdev);
2670                 return -1;
2671         }
2672         ll_dev->vdev = vdev;
2673         add_data_ll_entry(&ll_root_used, ll_dev);
2674         vdev->vmdq_rx_q
2675                 = dev->device_fh * (num_queues / num_devices);
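        /*
         * Each device owns one VMDq RX queue, spaced num_queues /
         * num_devices apart; with illustrative values num_queues = 128
         * and num_devices = 32, device_fh 0, 1, 2, ... land on queues
         * 0, 4, 8, ...
         */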
2676
2677         if (zero_copy) {
2678                 uint32_t index = vdev->vmdq_rx_q;
2679                 uint32_t count_in_ring, i;
2680                 struct mbuf_table *tx_q;
2681
2682                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2683
2684                 LOG_DEBUG(VHOST_CONFIG,
2685                         "(%"PRIu64") in new_device: mbuf count in mempool "
2686                         "before attach is: %d\n",
2687                         dev->device_fh,
2688                         rte_mempool_count(vpool_array[index].pool));
2689                 LOG_DEBUG(VHOST_CONFIG,
2690                         "(%"PRIu64") in new_device: mbuf count in ring "
2691                         "before attach is: %d\n",
2692                         dev->device_fh, count_in_ring);
2693
2694                 /*
2695                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2696                  */
2697                 for (i = 0; i < count_in_ring; i++)
2698                         attach_rxmbuf_zcp(dev);
2699
2700                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2701                         "mempool after attach is: %d\n",
2702                         dev->device_fh,
2703                         rte_mempool_count(vpool_array[index].pool));
2704                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2705                         "ring after attach is: %d\n",
2706                         dev->device_fh,
2707                         rte_ring_count(vpool_array[index].ring));
2708
2709                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2710                 tx_q->txq_id = vdev->vmdq_rx_q;
2711
2712                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2713                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2714
2715                         LOG_DEBUG(VHOST_CONFIG,
2716                                 "(%"PRIu64") In new_device: Failed to start "
2717                                 "tx queue:%d\n",
2718                                 dev->device_fh, vdev->vmdq_rx_q);
2719
2720                         mbuf_destroy_zcp(vpool);
2721                         rte_free(vdev->regions_hpa);
2722                         rte_free(vdev);
2723                         return -1;
2724                 }
2725
2726                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2727                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2728
2729                         LOG_DEBUG(VHOST_CONFIG,
2730                                 "(%"PRIu64") In new_device: Failed to start "
2731                                 "rx queue:%d\n",
2732                                 dev->device_fh, vdev->vmdq_rx_q);
2733
2734                         /* Stop the TX queue. */
2735                         if (rte_eth_dev_tx_queue_stop(ports[0],
2736                                 vdev->vmdq_rx_q) != 0) {
2737                                 LOG_DEBUG(VHOST_CONFIG,
2738                                         "(%"PRIu64") In new_device: Failed to "
2739                                         "stop tx queue:%d\n",
2740                                         dev->device_fh, vdev->vmdq_rx_q);
2741                         }
2742
2743                         mbuf_destroy_zcp(vpool);
2744                         rte_free(vdev->regions_hpa);
2745                         rte_free(vdev);
2746                         return -1;
2747                 }
2748
2749         }
2750
2751         /*reset ready flag*/
2752         vdev->ready = DEVICE_MAC_LEARNING;
2753         vdev->remove = 0;
2754
2755         /* Find a suitable lcore to add the device. */
2756         RTE_LCORE_FOREACH_SLAVE(lcore) {
2757                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2758                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2759                         core_add = lcore;
2760                 }
2761         }
2762         /* Add device to lcore ll */
2763         ll_dev->dev->coreid = core_add;
2764         ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2765         if (ll_dev == NULL) {
2766                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2767                 vdev->ready = DEVICE_SAFE_REMOVE;
2768                 destroy_device(dev);
2769                 if (vdev->regions_hpa)
2770                         rte_free(vdev->regions_hpa);
2771                 rte_free(vdev);
2772                 return -1;
2773         }
2774         ll_dev->vdev = vdev;
2775         vdev->coreid = core_add;
2776
2777         add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2778
2779         /* Initialize device stats */
2780         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2781
2782         /* Disable notifications. */
2783         set_irq_status(dev);
2784         lcore_info[vdev->coreid].lcore_ll->device_num++;
2785         dev->flags |= VIRTIO_DEV_RUNNING;
2786
2787         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2788
2789         return 0;
2790 }
2791
2792 /*
2793  * These callbacks allow devices to be added to the data core when configuration
2794  * has been fully completed.
2795  */
2796 static const struct virtio_net_device_ops virtio_net_device_ops =
2797 {
2798         .new_device =  new_device,
2799         .destroy_device = destroy_device,
2800 };
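/*
 * The ops table is handed to the vhost library through
 * rte_vhost_driver_callback_register() in MAIN below, which makes the
 * library call new_device()/destroy_device() as guests come and go.
 */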
2801
2802 /*
2803  * This thread wakes up periodically to print statistics if the user has
2804  * enabled them.
2805  */
2806 static void
2807 print_stats(void)
2808 {
2809         struct virtio_net_data_ll *dev_ll;
2810         uint64_t tx_dropped, rx_dropped;
2811         uint64_t tx, tx_total, rx, rx_total;
2812         uint32_t device_fh;
2813         const char clr[] = { 27, '[', '2', 'J', '\0' };
2814         const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };
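        /*
         * clr and top_left are ANSI escape sequences: ESC[2J clears the
         * terminal and ESC[1;1H homes the cursor, so each pass repaints
         * the statistics in place.
         */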
2815
2816         while (1) {
2817                 sleep(enable_stats);
2818
2819                 /* Clear screen and move to top left */
2820                 printf("%s%s", clr, top_left);
2821
2822                 printf("\nDevice statistics ====================================");
2823
2824                 dev_ll = ll_root_used;
2825                 while (dev_ll != NULL) {
2826                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2827                         tx_total = dev_statistics[device_fh].tx_total;
2828                         tx = dev_statistics[device_fh].tx;
2829                         tx_dropped = tx_total - tx;
2830                         if (zero_copy == 0) {
2831                                 rx_total = rte_atomic64_read(
2832                                         &dev_statistics[device_fh].rx_total_atomic);
2833                                 rx = rte_atomic64_read(
2834                                         &dev_statistics[device_fh].rx_atomic);
2835                         } else {
2836                                 rx_total = dev_statistics[device_fh].rx_total;
2837                                 rx = dev_statistics[device_fh].rx;
2838                         }
2839                         rx_dropped = rx_total - rx;
2840
2841                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2842                                         "\nTX total:            %"PRIu64""
2843                                         "\nTX dropped:          %"PRIu64""
2844                                         "\nTX successful:               %"PRIu64""
2845                                         "\nRX total:            %"PRIu64""
2846                                         "\nRX dropped:          %"PRIu64""
2847                                         "\nRX successful:               %"PRIu64"",
2848                                         device_fh,
2849                                         tx_total,
2850                                         tx_dropped,
2851                                         tx,
2852                                         rx_total,
2853                                         rx_dropped,
2854                                         rx);
2855
2856                         dev_ll = dev_ll->next;
2857                 }
2858                 printf("\n======================================================\n");
2859         }
2860 }
2861
2862 static void
2863 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2864         char *ring_name, uint32_t nb_mbuf)
2865 {
2866         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2867         vpool_array[index].pool
2868                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2869                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2870                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2871                 rte_pktmbuf_init, NULL, socket, 0);
2872         if (vpool_array[index].pool != NULL) {
2873                 vpool_array[index].ring
2874                         = rte_ring_create(ring_name,
2875                                 rte_align32pow2(nb_mbuf + 1),
2876                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2877                 if (likely(vpool_array[index].ring != NULL)) {
2878                         LOG_DEBUG(VHOST_CONFIG,
2879                                 "in setup_mempool_tbl: mbuf count in "
2880                                 "mempool is: %d\n",
2881                                 rte_mempool_count(vpool_array[index].pool));
2882                         LOG_DEBUG(VHOST_CONFIG,
2883                                 "in setup_mempool_tbl: mbuf count in "
2884                                 "ring   is: %d\n",
2885                                 rte_ring_count(vpool_array[index].ring));
2886                 } else {
2887                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2888                                 ring_name);
2889                 }
2890
2891                 /* Need to account for the mbuf headroom. */
2892                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2893         } else {
2894                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2895         }
2896 }
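/*
 * The ring is sized to rte_align32pow2(nb_mbuf + 1) because an rte_ring
 * must have a power-of-two size and stores at most size - 1 entries;
 * with an illustrative nb_mbuf of 1024, 1025 slots are needed and a
 * 2048-entry ring is created.
 */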
2897
2898
2899 /*
2900  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2901  * device is also registered here to handle the IOCTLs.
2902  */
2903 int
2904 MAIN(int argc, char *argv[])
2905 {
2906         struct rte_mempool *mbuf_pool = NULL;
2907         unsigned lcore_id, core_id = 0;
2908         unsigned nb_ports, valid_num_ports;
2909         int ret;
2910         uint8_t portid, queue_id = 0;
2911         static pthread_t tid;
2912
2913         /* init EAL */
2914         ret = rte_eal_init(argc, argv);
2915         if (ret < 0)
2916                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2917         argc -= ret;
2918         argv += ret;
2919
2920         /* parse app arguments */
2921         ret = us_vhost_parse_args(argc, argv);
2922         if (ret < 0)
2923                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2924
2925         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2926                 if (rte_lcore_is_enabled(lcore_id))
2927                         lcore_ids[core_id++] = lcore_id;
2928
2929         if (rte_lcore_count() > RTE_MAX_LCORE)
2930                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2931
2932         /* Set the number of switching cores available. */
2933         num_switching_cores = rte_lcore_count()-1;
2934
2935         /* Get the number of physical ports. */
2936         nb_ports = rte_eth_dev_count();
2937         if (nb_ports > RTE_MAX_ETHPORTS)
2938                 nb_ports = RTE_MAX_ETHPORTS;
2939
2940         /*
2941          * Update the global variable num_ports and the global array ports,
2942          * and get the number of valid ports from the system port count.
2943          */
2944         valid_num_ports = check_ports_num(nb_ports);
2945
2946         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2947                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2948                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2949                 return -1;
2950         }
2951
2952         if (zero_copy == 0) {
2953                 /* Create the mbuf pool. */
2954                 mbuf_pool = rte_mempool_create(
2955                                 "MBUF_POOL",
2956                                 NUM_MBUFS_PER_PORT
2957                                 * valid_num_ports,
2958                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2959                                 sizeof(struct rte_pktmbuf_pool_private),
2960                                 rte_pktmbuf_pool_init, NULL,
2961                                 rte_pktmbuf_init, NULL,
2962                                 rte_socket_id(), 0);
2963                 if (mbuf_pool == NULL)
2964                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2965
2966                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2967                         vpool_array[queue_id].pool = mbuf_pool;
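                /*
                 * Without zero copy, every RX/TX queue shares the single
                 * MBUF_POOL; only the pool pointer is set here and no
                 * per-queue ring is needed.
                 */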
2968
2969                 if (vm2vm_mode == VM2VM_HARDWARE) {
2970                         /* Enable VT loop back to let L2 switch to do it. */
2971                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2972                         LOG_DEBUG(VHOST_CONFIG,
2973                                 "Enable loop back for L2 switch in vmdq.\n");
2974                 }
2975         } else {
2976                 uint32_t nb_mbuf;
2977                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2978                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2979
2980                 /*
2981                  * Zero copy defers queue RX/TX start to the time when guest
2982                  * finishes its startup and packet buffers from that guest are
2983                  * available.
2984                  */
2985                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2986                 rx_conf_default.rx_drop_en = 0;
2987                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2988                 nb_mbuf = num_rx_descriptor
2989                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2990                         + num_switching_cores * MAX_PKT_BURST;
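                /*
                 * Per-queue RX pool sizing: enough mbufs to fill the RX
                 * descriptor ring plus, for each switching core, one
                 * mempool cache (MBUF_CACHE_SIZE_ZCP) and one in-flight
                 * burst (MAX_PKT_BURST).
                 */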
2991
2992                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2993                         snprintf(pool_name, sizeof(pool_name),
2994                                 "rxmbuf_pool_%u", queue_id);
2995                         snprintf(ring_name, sizeof(ring_name),
2996                                 "rxmbuf_ring_%u", queue_id);
2997                         setup_mempool_tbl(rte_socket_id(), queue_id,
2998                                 pool_name, ring_name, nb_mbuf);
2999                 }
3000
3001                 nb_mbuf = num_tx_descriptor
3002                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3003                                 + num_switching_cores * MAX_PKT_BURST;
3004
3005                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3006                         snprintf(pool_name, sizeof(pool_name),
3007                                 "txmbuf_pool_%u", queue_id);
3008                         snprintf(ring_name, sizeof(ring_name),
3009                                 "txmbuf_ring_%u", queue_id);
3010                         setup_mempool_tbl(rte_socket_id(),
3011                                 (queue_id + MAX_QUEUES),
3012                                 pool_name, ring_name, nb_mbuf);
3013                 }
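                /*
                 * TX pools and rings occupy the upper half of vpool_array,
                 * at index queue_id + MAX_QUEUES, matching the lookup used
                 * by destroy_device() above.
                 */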
3014
3015                 if (vm2vm_mode == VM2VM_HARDWARE) {
3016                         /* Enable VT loop back to let L2 switch to do it. */
3017                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3018                         LOG_DEBUG(VHOST_CONFIG,
3019                                 "Enable loop back for L2 switch in vmdq.\n");
3020                 }
3021         }
3022         /* Set log level. */
3023         rte_set_log_level(LOG_LEVEL);
3024
3025         /* initialize all ports */
3026         for (portid = 0; portid < nb_ports; portid++) {
3027                 /* skip ports that are not enabled */
3028                 if ((enabled_port_mask & (1 << portid)) == 0) {
3029                         RTE_LOG(INFO, VHOST_PORT,
3030                                 "Skipping disabled port %d\n", portid);
3031                         continue;
3032                 }
3033                 if (port_init(portid) != 0)
3034                         rte_exit(EXIT_FAILURE,
3035                                 "Cannot initialize network ports\n");
3036         }
3037
3038         /* Initialise all linked lists. */
3039         if (init_data_ll() == -1)
3040                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3041
3042         /* Initialize device stats */
3043         memset(&dev_statistics, 0, sizeof(dev_statistics));
3044
3045         /* Enable stats if the user option is set. */
3046         if (enable_stats)
3047                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3048
3049         /* Launch all data cores. */
3050         if (zero_copy == 0) {
3051                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3052                         rte_eal_remote_launch(switch_worker,
3053                                 mbuf_pool, lcore_id);
3054                 }
3055         } else {
3056                 uint32_t count_in_mempool, index, i;
3057                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3058                         /* For all RX and TX queues. */
3059                         count_in_mempool
3060                                 = rte_mempool_count(vpool_array[index].pool);
3061
3062                         /*
3063                          * Transfer all un-attached mbufs from vpool.pool
3064                          * to vpool.ring.
3065                          */
3066                         for (i = 0; i < count_in_mempool; i++) {
3067                                 struct rte_mbuf *mbuf
3068                                         = __rte_mbuf_raw_alloc(
3069                                                 vpool_array[index].pool);
3070                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3071                                                 (void *)mbuf);
3072                         }
3073
3074                         LOG_DEBUG(VHOST_CONFIG,
3075                                 "in MAIN: initial mbuf count in mempool "
3076                                 "is: %d\n", count_in_mempool);
3077                         LOG_DEBUG(VHOST_CONFIG,
3078                                 "in MAIN: initial mbuf count in ring is:"
3079                                 " %d\n",
3080                                 rte_ring_count(vpool_array[index].ring));
3081                 }
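                /*
                 * Staging every free mbuf in the ring lets new_device()
                 * later pull them back out through attach_rxmbuf_zcp() and
                 * return them to the pool once a guest attaches.
                 */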
3082
3083                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3084                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3085                                 lcore_id);
3086         }
3087
3088         /* Register CUSE device to handle IOCTLs. */
3089         ret = rte_vhost_driver_register((char *)&dev_basename);
3090         if (ret != 0)
3091                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3092
3093         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3094
3095         /* Start CUSE session. */
3096         rte_vhost_driver_session_start();
3097         return 0;
3098
3099 }
3100