examples/vhost: register with lib
examples/vhost/main.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55

#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +      \
                            (num_switching_cores * MAX_PKT_BURST) +        \
                            (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
                            (num_switching_cores * MBUF_CACHE_SIZE))
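/*
 * Rough sizing example (illustrative, assuming 8 switching cores): with
 * MAX_QUEUES = 128 RX queues of RTE_TEST_RX_DESC_DEFAULT = 1024 descriptors,
 * this reserves 128*1024 + 8*(32 + 512 + 128) = 136448 mbufs for the port.
 */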

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation: the guest allocates the frame data buffers, and
 * vhost uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
        + RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default value of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default value of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default value of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default value of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default value of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default value of TX write-back threshold reg. */

#define MAX_PKT_BURST 32        /* Max burst size for RX/TX */
#define MAX_MRG_PKT_BURST 16    /* Max burst for merge buffers. Set to 1 due to performance issue. */
#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK-based front ends:
 * take the max vring avail descriptors/entries from the guest, subtract
 * MAX_PKT_BURST, and then round to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
                + sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)
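/* Note: POWEROF2(0) also evaluates true, since ((0-1) & 0) == 0. */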

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
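/* With a 64-byte cache line and a 16-byte struct vring_desc this is 4. */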

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
uint32_t num_devices = 0;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
        struct rte_mempool *pool;
        struct rte_ring *ring;
        uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
        PHYS_ADDR_CONTINUOUS = 0,
        PHYS_ADDR_CROSS_SUBREG = 1,
        PHYS_ADDR_INVALID = 2,
        PHYS_ADDR_LAST
} hpa_type;
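/*
 * PHYS_ADDR_CROSS_SUBREG means the guest buffer spans two host-physical
 * sub-regions, so it cannot be used as a single contiguous DMA target
 * (see gpa_to_hpa() below).
 */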

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* This can be set by the user so it is made available here. */
extern uint64_t VHOST_FEATURES;

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
        .rx_thresh = {
                .pthresh = RX_PTHRESH,
                .hthresh = RX_HTHRESH,
                .wthresh = RX_WTHRESH,
        },
        .rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
        .tx_thresh = {
                .pthresh = TX_PTHRESH,
                .hthresh = TX_HTHRESH,
                .wthresh = TX_WTHRESH,
        },
        .tx_free_thresh = 0, /* Use PMD default values */
        .tx_rs_thresh = 0, /* Use PMD default values */
};

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * VLAN strip is necessary for 1G NICs such as the I350;
                 * it fixes a bug where IPv4 forwarding in the guest could
                 * not forward packets from one virtio device to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * should be overridden separately in code with
                 * appropriate values
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
        unsigned char   h_dest[ETH_ALEN];
        unsigned char   h_source[ETH_ALEN];
        __be16          h_vlan_proto;
        __be16          h_vlan_TCI;
        __be16          h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
        uint8_t  version_ihl;           /**< version and header length */
        uint8_t  type_of_service;       /**< type of service */
        uint16_t total_length;          /**< length of packet */
        uint16_t packet_id;             /**< packet ID */
        uint16_t fragment_offset;       /**< fragmentation offset */
        uint8_t  time_to_live;          /**< time to live */
        uint8_t  next_proto_id;         /**< protocol ID */
        uint16_t hdr_checksum;          /**< header checksum */
        uint32_t src_addr;              /**< source address */
        uint32_t dst_addr;              /**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
        uint64_t tx_total;
        rte_atomic64_t rx_total_atomic;
        uint64_t rx_total;
        uint64_t tx;
        rte_atomic64_t rx_atomic;
        uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];
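/*
 * The RX counters have atomic variants because a device's RX stats can be
 * bumped by other cores doing VM2VM delivery (see virtio_tx_local()); the
 * TX counters are only ever updated by the core owning the device.
 */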

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back =
                vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}
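/*
 * Example: with num_devices == 8, pools 0..7 are mapped one-to-one to VLAN
 * tags 1000..1007 from vlan_tags[], i.e. one VMDQ pool per virtio device.
 */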

/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, log an error and return -1.
 * Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        uint16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
        rte_eth_dev_info_get(port, &dev_info);

        /* Configure the number of supported virtio devices based on VMDQ limits */
        num_devices = dev_info.max_vmdq_pools;
        num_queues = dev_info.max_rx_queues;

        if (zero_copy) {
                rx_ring_size = num_rx_descriptor;
                tx_ring_size = num_tx_descriptor;
                tx_rings = dev_info.max_tx_queues;
        } else {
                rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
                tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
                tx_rings = (uint16_t)rte_lcore_count();
        }

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;

        if (port >= rte_eth_dev_count())
                return -1;

        rx_rings = (uint16_t)num_queues;
        /* Configure ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0)
                return retval;

        /* Setup the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                                rte_eth_dev_socket_id(port), &rx_conf_default,
                                                vpool_array[q].pool);
                if (retval < 0)
                        return retval;
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                                rte_eth_dev_socket_id(port), &tx_conf_default);
                if (retval < 0)
                        return retval;
        }

        /* Start the device. */
        retval = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
                return retval;
        }

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        (unsigned)port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
        /* parse basename string; reject names too long for the buffer */
        if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
                return -1;
        else
                snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

        return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (pm == 0)
                return -1;

        return pm;
}
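/*
 * Note: the mask travels back through an int return value, so masks above
 * 0x7FFFFFFF cannot be represented.
 */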

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --dev-basename <name>\n"
        "               --nb-devices ND\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if the destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --dev-basename: The basename to be used for the character device.\n"
        "               --zero-copy [0|1]: disable(default)/enable rx/tx "
                        "zero copy\n"
        "               --rx-desc-num [0-N]: the number of descriptors on rx, "
                        "used only when zero copy is enabled.\n"
        "               --tx-desc-num [0-N]: the number of descriptors on tx, "
                        "used only when zero copy is enabled.\n",
               prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"dev-basename", required_argument, NULL, 0},
                {"zero-copy", required_argument, NULL, 0},
                {"rx-desc-num", required_argument, NULL, 0},
                {"tx-desc-num", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        ret = parse_portmask(optarg);
                        if (ret == -1) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        enabled_port_mask = ret;
                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Specify the retry delay time (in microseconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                                VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
                                        }
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set character device basename. */
                        if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_basename(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        /* Enable/disable rx/tx zero copy. */
                        if (!strncmp(long_option[option_index].name,
                                "zero-copy", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument"
                                                " for zero-copy [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        zero_copy = ret;

                                if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
                                        RTE_LOG(ERR, VHOST_CONFIG, "Before running "
                                        "the zero copy vhost app, please "
                                        "disable RTE_MBUF_REFCNT\n"
                                        "in the config file and then rebuild the "
                                        "DPDK core lib!\n"
                                        "Otherwise please disable the zero copy "
                                        "flag in the command line!\n");
                                        return -1;
#endif
                                }
                        }

                        /* Specify the descriptor number on RX. */
                        if (!strncmp(long_option[option_index].name,
                                "rx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for rx-desc-num [0-N], "
                                        "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_rx_descriptor = ret;
                                }
                        }

                        /* Specify the descriptor number on TX. */
                        if (!strncmp(long_option[option_index].name,
                                "tx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for tx-desc-num [0-N], "
                                        "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_tx_descriptor = ret;
                                }
                        }

                        break;

                        /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = (uint8_t)i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support software vm2vm, "
                        "please specify 'vm2vm 2' to use hardware vm2vm.\n");
                return -1;
        }

        if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support jumbo frame, "
                        "please specify '--mergeable 0' to disable the "
                        "mergeable feature.\n");
                return -1;
        }

        return 0;
}
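/*
 * Illustrative invocation (the binary name and EAL arguments here are
 * hypothetical and depend on the build/target):
 *   vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --stats 2 --dev-basename vhost-net
 */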

/*
 * Update the global vars num_ports and ports according to the system port
 * count, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do {                       \
        char *pkt_addr = (char *)(addr);                                    \
        unsigned int index;                                                 \
        char packet[MAX_PRINT_BUFF];                                        \
                                                                            \
        if ((header))                                                       \
                snprintf(packet, MAX_PRINT_BUFF,                            \
                        "(%"PRIu64") Header size %d: ",                     \
                        (device->device_fh), (size));                       \
        else                                                                \
                snprintf(packet, MAX_PRINT_BUFF,                            \
                        "(%"PRIu64") Packet size %d: ",                     \
                        (device->device_fh), (size));                       \
        for (index = 0; index < (size); index++) {                          \
                snprintf(packet + strnlen(packet, MAX_PRINT_BUFF),          \
                        MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),   \
                        "%02hhx ", pkt_addr[index]);                        \
        }                                                                   \
        snprintf(packet + strnlen(packet, MAX_PRINT_BUFF),                  \
                MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");    \
                                                                            \
        LOG_DEBUG(VHOST_DATA, "%s", packet);                                \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
        uint32_t buf_len, hpa_type *addr_type)
{
        struct virtio_memory_regions_hpa *region;
        uint32_t regionidx;
        uint64_t vhost_pa = 0;

        *addr_type = PHYS_ADDR_INVALID;

        for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
                region = &vdev->regions_hpa[regionidx];
                if ((guest_pa >= region->guest_phys_address) &&
                        (guest_pa <= region->guest_phys_address_end)) {
                        vhost_pa = region->host_phys_addr_offset + guest_pa;
                        if (likely((guest_pa + buf_len - 1)
                                <= region->guest_phys_address_end))
                                *addr_type = PHYS_ADDR_CONTINUOUS;
                        else
                                *addr_type = PHYS_ADDR_CROSS_SUBREG;
                        break;
                }
        }

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
                vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
                (void *)(uintptr_t)vhost_pa);

        return vhost_pa;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
        return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}
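/*
 * This loads 8 bytes from each 6-byte address and masks with MAC_ADDR_CMP
 * (0xFFFFFFFFFFFF) so the two bytes beyond the MAC are ignored; on
 * little-endian x86 those are the high-order bytes of the load. It relies
 * on the two bytes following each ether_addr being safely readable.
 */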

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct virtio_net_data_ll *dev_ll;
        struct virtio_net *dev = vdev->dev;
        int i, ret;

        /* Learn MAC address of guest device from packet */
        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
                        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
                        return -1;
                }
                dev_ll = dev_ll->next;
        }

        for (i = 0; i < ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* vlan_tag currently uses the device_id. */
        vdev->vlan_tag = vlan_tags[dev->device_fh];

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
                dev->device_fh,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
                                        dev->device_fh);

        /* Enable stripping of the vlan tag as we handle routing. */
        rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* Clear MAC and VLAN settings */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < 6; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers */
                rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline unsigned __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct virtio_net_data_ll *dev_ll;
        struct ether_hdr *pkt_hdr;
        uint64_t ret = 0;
        struct virtio_net *dev = vdev->dev;
        struct virtio_net *tdev; /* destination virtio device */

        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        /* Get the used devices list */
        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
                                &dev_ll->vdev->mac_address)) {

                        /* Drop the packet if the TX packet is destined for the TX device. */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
                                                        dev->device_fh);
                                return 0;
                        }
                        tdev = dev_ll->vdev->dev;

                        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

                        if (dev_ll->vdev->remove) {
                                /* Drop the packet if the device is marked for removal */
                                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
                        } else {
                                uint32_t mergeable =
                                        tdev->features &
                                        (1 << VIRTIO_NET_F_MRG_RXBUF);

                                /* Send the packet to the local virtio device */
                                if (likely(mergeable == 0))
                                        ret = virtio_dev_rx(tdev, &m, 1);
                                else
                                        ret = virtio_dev_merge_rx(tdev,
                                                &m, 1);

                                if (enable_stats) {
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_total_atomic,
                                        1);
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_atomic,
                                        ret);
                                        dev_statistics[tdev->device_fh].tx_total++;
                                        dev_statistics[tdev->device_fh].tx += ret;
                                }
                        }

                        return 0;
                }
                dev_ll = dev_ll->next;
        }

        return -1;
}
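/*
 * Return convention: 0 means the packet was handled locally (delivered or
 * dropped), -1 means the destination is not local and the caller should
 * route the packet out of the physical port (see virtio_tx_route()).
 */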

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
{
        struct mbuf_table *tx_q;
        struct vlan_ethhdr *vlan_hdr;
        struct rte_mbuf **m_table;
        struct rte_mbuf *mbuf, *prev;
        unsigned len, ret, offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct virtio_net_data_ll *dev_ll = ll_root_used;
        struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
        struct virtio_net *dev = vdev->dev;

        /* Check if destination is a local VM */
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
                return;

        if (vm2vm_mode == VM2VM_HARDWARE) {
                while (dev_ll != NULL) {
                        if ((dev_ll->vdev->ready == DEVICE_RX)
                                && ether_addr_cmp(&(pkt_hdr->d_addr),
                                &dev_ll->vdev->mac_address)) {
                                /*
                                 * Drop the packet if the TX packet is
                                 * destined for the TX device.
                                 */
                                if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                        LOG_DEBUG(VHOST_DATA,
                                        "(%"PRIu64") TX: Source and destination"
                                        " MAC addresses are the same. Dropping "
                                        "packet.\n",
                                        dev_ll->vdev->dev->device_fh);
                                        return;
                                }
                                offset = 4;
                                vlan_tag =
                                (uint16_t)
                                vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

                                LOG_DEBUG(VHOST_DATA,
                                "(%"PRIu64") TX: pkt to local VM device id:"
                                "(%"PRIu64") vlan tag: %d.\n",
                                dev->device_fh, dev_ll->vdev->dev->device_fh,
                                vlan_tag);

                                break;
                        }
                        dev_ll = dev_ll->next;
                }
        }

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

        /* Add packet to the port tx queue */
        tx_q = &lcore_tx_queue[lcore_id];
        len = tx_q->len;

        /* Allocate an mbuf and populate the structure. */
        mbuf = rte_pktmbuf_alloc(mbuf_pool);
        if (unlikely(mbuf == NULL)) {
                RTE_LOG(ERR, VHOST_DATA,
                        "Failed to allocate memory for mbuf.\n");
                return;
        }

        mbuf->data_len = m->data_len + VLAN_HLEN + offset;
        mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
        mbuf->nb_segs = m->nb_segs;

        /* Copy ethernet header to mbuf. */
        rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
                rte_pktmbuf_mtod(m, const void *),
                ETH_HLEN);

        /* Set up the vlan header. Bytes are reordered for the network with htons(). */
        vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
        vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
        vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
        vlan_hdr->h_vlan_TCI = htons(vlan_tag);

        /* Copy the remaining packet contents to the mbuf. */
        rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
                (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
                (m->data_len - ETH_HLEN));

        /* Copy the remaining segments for the whole packet. */
        prev = mbuf;
        while (m->next) {
                /* Allocate an mbuf and populate the structure. */
                struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
                if (unlikely(next_mbuf == NULL)) {
                        rte_pktmbuf_free(mbuf);
                        RTE_LOG(ERR, VHOST_DATA,
                                "Failed to allocate memory for mbuf.\n");
                        return;
                }

                m = m->next;
                prev->next = next_mbuf;
                prev = next_mbuf;
                next_mbuf->data_len = m->data_len;

                /* Copy data to next mbuf. */
                rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
                        rte_pktmbuf_mtod(m, const void *), m->data_len);
        }

        tx_q->m_table[len] = mbuf;
        len++;
        if (enable_stats) {
                dev_statistics[dev->device_fh].tx_total++;
                dev_statistics[dev->device_fh].tx++;
        }

        if (unlikely(len == MAX_PKT_BURST)) {
                m_table = (struct rte_mbuf **)tx_q->m_table;
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
                /* Free any buffers not handled by TX and update the port stats. */
                if (unlikely(ret < len)) {
                        do {
                                rte_pktmbuf_free(m_table[ret]);
                        } while (++ret < len);
                }

                len = 0;
        }

        tx_q->len = len;
        return;
}
/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(void *arg)
{
        struct rte_mempool *mbuf_pool = arg;
        struct virtio_net *dev = NULL;
        struct vhost_dev *vdev = NULL;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
        struct virtio_net_data_ll *dev_ll;
        struct mbuf_table *tx_q;
        volatile struct lcore_ll_info *lcore_ll;
        const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
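        /* e.g. with a 2 GHz TSC, drain_tsc is ~200,000 cycles, i.e. ~100 us. */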
        uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
        unsigned ret, i;
        const uint16_t lcore_id = rte_lcore_id();
        const uint16_t num_cores = (uint16_t)rte_lcore_count();
        uint16_t rx_count = 0;
        uint32_t mergeable = 0;

        RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
        lcore_ll = lcore_info[lcore_id].lcore_ll;
        prev_tsc = 0;

        tx_q = &lcore_tx_queue[lcore_id];
        for (i = 0; i < num_cores; i++) {
                if (lcore_ids[i] == lcore_id) {
                        tx_q->txq_id = i;
                        break;
                }
        }
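        /*
         * Each core claims the hardware TX queue whose index matches its
         * slot in lcore_ids[]; since port_init() sets tx_rings to
         * rte_lcore_count() in the non-zero-copy case, every core gets a
         * private TX queue and no locking is needed on transmit.
         */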

        while (1) {
                cur_tsc = rte_rdtsc();
                /*
                 * TX burst queue drain
                 */
                diff_tsc = cur_tsc - prev_tsc;
                if (unlikely(diff_tsc > drain_tsc)) {

                        if (tx_q->len) {
                                LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

                                /* Tx any packets in the queue */
                                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                                                       (struct rte_mbuf **)tx_q->m_table,
                                                       (uint16_t)tx_q->len);
                                if (unlikely(ret < tx_q->len)) {
                                        do {
                                                rte_pktmbuf_free(tx_q->m_table[ret]);
                                        } while (++ret < tx_q->len);
                                }

                                tx_q->len = 0;
                        }

                        prev_tsc = cur_tsc;

                }

1252                 rte_prefetch0(lcore_ll->ll_root_used);
1253                 /*
1254                  * If requested, inform the configuration core that we have exited the
1255                  * linked list and that no devices are in use.
1256                  */
1257                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1258                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1259
1260                 /*
1261                  * Process devices
1262                  */
1263                 dev_ll = lcore_ll->ll_root_used;
1264
1265                 while (dev_ll != NULL) {
1266                         /*get virtio device ID*/
1267                         vdev = dev_ll->vdev;
1268                         dev = vdev->dev;
1269                         mergeable =
1270                                 dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
1271
1272                         if (vdev->remove) {
1273                                 dev_ll = dev_ll->next;
1274                                 unlink_vmdq(vdev);
1275                                 vdev->ready = DEVICE_SAFE_REMOVE;
1276                                 continue;
1277                         }
1278                         if (likely(vdev->ready == DEVICE_RX)) {
1279                                 /*Handle guest RX*/
1280                                 rx_count = rte_eth_rx_burst(ports[0],
1281                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1282
1283                                 if (rx_count) {
1284                                         if (likely(mergeable == 0))
1285                                                 ret_count =
1286                                                         virtio_dev_rx(dev,
1287                                                         pkts_burst, rx_count);
1288                                         else
1289                                                 ret_count =
1290                                                         virtio_dev_merge_rx(dev,
1291                                                         pkts_burst, rx_count);
1292
1293                                         if (enable_stats) {
1294                                                 rte_atomic64_add(
1295                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1296                                                 rx_count);
1297                                                 rte_atomic64_add(
1298                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1299                                         }
1300                                         while (likely(rx_count)) {
1301                                                 rx_count--;
1302                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1303                                         }
1304
1305                                 }
1306                         }
1307
1308                         if (!vdev->remove) {
1309                                 /*Handle guest TX*/
1310                                 if (likely(mergeable == 0))
1311                                         virtio_dev_tx(dev, mbuf_pool);
1312                                 else
1313                                         virtio_dev_merge_tx(dev, mbuf_pool);
1314                         }
1315
1316                         /*move to the next device in the list*/
1317                         dev_ll = dev_ll->next;
1318                 }
1319         }
1320
1321         return 0;
1322 }
1323
1324 /*
1325  * This function gets the number of available ring entries for zero copy RX.
1326  * Only one thread will call this function for a particular virtio device,
1327  * so it is designed as a non-thread-safe function.
1328  */
1329 static inline uint32_t __attribute__((always_inline))
1330 get_available_ring_num_zcp(struct virtio_net *dev)
1331 {
1332         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1333         uint16_t avail_idx;
1334
1335         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1336         return (uint32_t)(uint16_t)(avail_idx - vq->last_used_idx_res);
1337 }
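/*
 * Note: the return value above is reduced to uint16_t before widening so the
 * count stays correct across avail->idx wrap-around. For example (indexes
 * assumed for illustration), with avail_idx == 2 and last_used_idx_res ==
 * 65533 there are (uint16_t)(2 - 65533) == 5 available entries.
 */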
1338
1339 /*
1340  * This function gets the available ring index for zero copy RX; it will
1341  * retry up to 'burst_rx_retry_num' times until it gets enough ring entries.
1342  * Only one thread will call this function for a particular virtio device,
1343  * so it is designed as a non-thread-safe function.
1344  */
1345 static inline uint32_t __attribute__((always_inline))
1346 get_available_ring_index_zcp(struct virtio_net *dev,
1347         uint16_t *res_base_idx, uint32_t count)
1348 {
1349         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1350         uint16_t avail_idx;
1351         uint32_t retry = 0;
1352         uint16_t free_entries;
1353
1354         *res_base_idx = vq->last_used_idx_res;
1355         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1356         free_entries = (avail_idx - *res_base_idx);
1357
1358         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1359                         "avail idx: %d, "
1360                         "res base idx:%d, free entries:%d\n",
1361                         dev->device_fh, avail_idx, *res_base_idx,
1362                         free_entries);
1363
1364         /*
1365          * If retry is enabled and the queue is full then we wait
1366          * and retry to avoid packet loss.
1367          */
1368         if (enable_retry && unlikely(count > free_entries)) {
1369                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1370                         rte_delay_us(burst_rx_delay_time);
1371                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1372                         free_entries = (avail_idx - *res_base_idx);
1373                         if (count <= free_entries)
1374                                 break;
1375                 }
1376         }
1377
1378         /*check that we have enough buffers*/
1379         if (unlikely(count > free_entries))
1380                 count = free_entries;
1381
1382         if (unlikely(count == 0)) {
1383                 LOG_DEBUG(VHOST_DATA,
1384                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1385                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1386                         dev->device_fh, avail_idx,
1387                         *res_base_idx, free_entries);
1388                 return 0;
1389         }
1390
1391         vq->last_used_idx_res = *res_base_idx + count;
1392
1393         return count;
1394 }
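/*
 * Usage sketch (hypothetical caller, mirroring attach_rxmbuf_zcp() below):
 *
 *	uint16_t base;
 *	if (get_available_ring_index_zcp(dev, &base, 1) == 1) {
 *		uint16_t idx = vq->avail->ring[base & (vq->size - 1)];
 *		... use vq->desc[idx]; the entry is reserved for this thread ...
 *	}
 */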
1395
1396 /*
1397  * This function puts a descriptor back on the used list.
1398  */
1399 static inline void __attribute__((always_inline))
1400 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1401 {
1402         uint16_t res_cur_idx = vq->last_used_idx;
1403         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1404         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1405         rte_compiler_barrier();
1406         *(volatile uint16_t *)&vq->used->idx += 1;
1407         vq->last_used_idx += 1;
1408
1409         /* Kick the guest if necessary. */
1410         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1411                 eventfd_write((int)vq->kickfd, 1);
1412 }
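/*
 * Note on ordering: the compiler barrier above keeps the used-ring id/len
 * stores ahead of the used->idx increment, per the virtio ring protocol; the
 * guest must never observe the new index before the entry contents. A plain
 * compiler barrier suffices under the strongly ordered x86 memory model this
 * example assumes.
 */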
1413
1414 /*
1415  * This function gets an available descriptor from the virtio vring and an
1416  * unattached mbuf from vpool->ring, then attaches them together. The offsets
1417  * for buff_addr and phys_addr must be adjusted according to the PMD
1418  * implementation, otherwise the frame data may land at the wrong place.
1419  */
1420 static inline void __attribute__((always_inline))
1421 attach_rxmbuf_zcp(struct virtio_net *dev)
1422 {
1423         uint16_t res_base_idx, desc_idx;
1424         uint64_t buff_addr, phys_addr;
1425         struct vhost_virtqueue *vq;
1426         struct vring_desc *desc;
1427         struct rte_mbuf *mbuf = NULL;
1428         struct vpool *vpool;
1429         hpa_type addr_type;
1430         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1431
1432         vpool = &vpool_array[vdev->vmdq_rx_q];
1433         vq = dev->virtqueue[VIRTIO_RXQ];
1434
1435         do {
1436                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1437                                 1) != 1))
1438                         return;
1439                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1440
1441                 desc = &vq->desc[desc_idx];
1442                 if (desc->flags & VRING_DESC_F_NEXT) {
1443                         desc = &vq->desc[desc->next];
1444                         buff_addr = gpa_to_vva(dev, desc->addr);
1445                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1446                                         &addr_type);
1447                 } else {
1448                         buff_addr = gpa_to_vva(dev,
1449                                         desc->addr + vq->vhost_hlen);
1450                         phys_addr = gpa_to_hpa(vdev,
1451                                         desc->addr + vq->vhost_hlen,
1452                                         desc->len, &addr_type);
1453                 }
1454
1455                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1456                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1457                                 " address found when attaching RX frame buffer"
1458                                 " address!\n", dev->device_fh);
1459                         put_desc_to_used_list_zcp(vq, desc_idx);
1460                         continue;
1461                 }
1462
1463                 /*
1464                  * Check if the frame buffer address from guest crosses
1465                  * sub-region or not.
1466                  */
1467                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1468                         RTE_LOG(ERR, VHOST_DATA,
1469                                 "(%"PRIu64") Frame buffer address cross "
1470                                 "sub-region found when attaching RX frame "
1471                                 "buffer address!\n",
1472                                 dev->device_fh);
1473                         put_desc_to_used_list_zcp(vq, desc_idx);
1474                         continue;
1475                 }
1476         } while (unlikely(phys_addr == 0));
1477
1478         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1479         if (unlikely(mbuf == NULL)) {
1480                 LOG_DEBUG(VHOST_DATA,
1481                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1482                         "ring_sc_dequeue fail.\n",
1483                         dev->device_fh);
1484                 put_desc_to_used_list_zcp(vq, desc_idx);
1485                 return;
1486         }
1487
1488         if (unlikely(vpool->buf_size > desc->len)) {
1489                 LOG_DEBUG(VHOST_DATA,
1490                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1491                         "length(%d) of descriptor idx: %d less than room "
1492                         "size required: %d\n",
1493                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1494                 put_desc_to_used_list_zcp(vq, desc_idx);
1495                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1496                 return;
1497         }
1498
1499         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1500         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1501         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1502         mbuf->data_len = desc->len;
1503         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1504
1505         LOG_DEBUG(VHOST_DATA,
1506                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1507                 "descriptor idx:%d\n",
1508                 dev->device_fh, res_base_idx, desc_idx);
1509
1510         __rte_mbuf_raw_free(mbuf);
1511
1512         return;
1513 }
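/*
 * Address arithmetic above, with figures for illustration only: if the guest
 * frame data lives at buff_addr == 0x101000 and RTE_PKTMBUF_HEADROOM == 128,
 * then buf_addr == 0x100F80 and data_off == 128, so the mbuf data pointer
 * (buf_addr + data_off) lands exactly on the guest-provided buffer at
 * 0x101000 and no frame bytes are copied.
 */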
1514
1515 /*
1516  * Detach an attached packet mbuf -
1517  *  - restore original mbuf address and length values.
1518  *  - reset pktmbuf data and data_len to their default values.
1519  *  All other fields of the given packet mbuf will be left intact.
1520  *
1521  * @param m
1522  *   The attached packet mbuf.
1523  */
1524 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1525 {
1526         const struct rte_mempool *mp = m->pool;
1527         void *buf = RTE_MBUF_TO_BADDR(m);
1528         uint32_t buf_ofs;
1529         uint32_t buf_len = mp->elt_size - sizeof(*m);
1530         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1531
1532         m->buf_addr = buf;
1533         m->buf_len = (uint16_t)buf_len;
1534
1535         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1536                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1537         m->data_off = buf_ofs;
1538
1539         m->data_len = 0;
1540 }
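/*
 * Only the buffer geometry is restored above. Assuming the ZCP pools are
 * created with MBUF_SIZE_ZCP elements, buf_len becomes
 * VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM, i.e. the mbuf points
 * back at its own pool-owned buffer rather than the guest memory it had been
 * attached to.
 */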
1541
1542 /*
1543  * This function is called after packets have been transmitted. It fetches
1544  * mbufs from vpool->pool, detaches them and puts them back into vpool->ring.
1545  * It also updates the used index and kicks the guest if necessary.
1546  */
1547 static inline uint32_t __attribute__((always_inline))
1548 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1549 {
1550         struct rte_mbuf *mbuf;
1551         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1552         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1553         uint32_t index = 0;
1554         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1555
1556         LOG_DEBUG(VHOST_DATA,
1557                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1558                 "clean is: %d\n",
1559                 dev->device_fh, mbuf_count);
1560         LOG_DEBUG(VHOST_DATA,
1561                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1562                 "clean is: %d\n",
1563                 dev->device_fh, rte_ring_count(vpool->ring));
1564
1565         for (index = 0; index < mbuf_count; index++) {
1566                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1567                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1568                         pktmbuf_detach_zcp(mbuf);
1569                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1570
1571                 /* Update used index buffer information. */
1572                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1573                 vq->used->ring[used_idx].len = 0;
1574
1575                 used_idx = (used_idx + 1) & (vq->size - 1);
1576         }
1577
1578         LOG_DEBUG(VHOST_DATA,
1579                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1580                 "clean is: %d\n",
1581                 dev->device_fh, rte_mempool_count(vpool->pool));
1582         LOG_DEBUG(VHOST_DATA,
1583                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1584                 "clean is: %d\n",
1585                 dev->device_fh, rte_ring_count(vpool->ring));
1586         LOG_DEBUG(VHOST_DATA,
1587                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1588                 "vq->last_used_idx:%d\n",
1589                 dev->device_fh, vq->last_used_idx);
1590
1591         vq->last_used_idx += mbuf_count;
1592
1593         LOG_DEBUG(VHOST_DATA,
1594                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1595                 "vq->last_used_idx:%d\n",
1596                 dev->device_fh, vq->last_used_idx);
1597
1598         rte_compiler_barrier();
1599
1600         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1601
1602         /* Kick guest if required. */
1603         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1604                 eventfd_write((int)vq->kickfd, 1);
1605
1606         return 0;
1607 }
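/*
 * The used-ring id written above comes from MBUF_HEADROOM_UINT32(mbuf),
 * where the descriptor index was stashed when the mbuf was attached or
 * routed, so each cleaned mbuf returns exactly the guest descriptor it was
 * carrying.
 */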
1608
1609 /*
1610  * This function is called when a virtio device is destroyed. It fetches
1611  * mbufs from vpool->pool, detaches them and puts them back into vpool->ring.
1612  */
1613 static void mbuf_destroy_zcp(struct vpool *vpool)
1614 {
1615         struct rte_mbuf *mbuf = NULL;
1616         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1617
1618         LOG_DEBUG(VHOST_CONFIG,
1619                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1620                 "mbuf_destroy_zcp is: %d\n",
1621                 mbuf_count);
1622         LOG_DEBUG(VHOST_CONFIG,
1623                 "in mbuf_destroy_zcp: mbuf count in ring before "
1624                 "mbuf_destroy_zcp is: %d\n",
1625                 rte_ring_count(vpool->ring));
1626
1627         for (index = 0; index < mbuf_count; index++) {
1628                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1629                 if (likely(mbuf != NULL)) {
1630                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1631                                 pktmbuf_detach_zcp(mbuf);
1632                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1633                 }
1634         }
1635
1636         LOG_DEBUG(VHOST_CONFIG,
1637                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1638                 "mbuf_destroy_zcp is: %d\n",
1639                 rte_mempool_count(vpool->pool));
1640         LOG_DEBUG(VHOST_CONFIG,
1641                 "in mbuf_destroy_zcp: mbuf count in ring after "
1642                 "mbuf_destroy_zcp is: %d\n",
1643                 rte_ring_count(vpool->ring));
1644 }
1645
1646 /*
1647  * This function delivers received packets to the guest (zero copy RX): it updates the used ring and kicks the guest if necessary.
1648  */
1649 static inline uint32_t __attribute__((always_inline))
1650 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1651         uint32_t count)
1652 {
1653         struct vhost_virtqueue *vq;
1654         struct vring_desc *desc;
1655         struct rte_mbuf *buff;
1656         /* The virtio_hdr is initialised to 0. */
1657         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1658                 = {{0, 0, 0, 0, 0, 0}, 0};
1659         uint64_t buff_hdr_addr = 0;
1660         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1661         uint32_t head_idx, packet_success = 0;
1662         uint16_t res_cur_idx;
1663
1664         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1665
1666         if (count == 0)
1667                 return 0;
1668
1669         vq = dev->virtqueue[VIRTIO_RXQ];
1670         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1671
1672         res_cur_idx = vq->last_used_idx;
1673         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1674                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1675
1676         /* Retrieve all of the head indexes first to avoid caching issues. */
1677         for (head_idx = 0; head_idx < count; head_idx++)
1678                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1679
1680         /*Prefetch descriptor index. */
1681         rte_prefetch0(&vq->desc[head[packet_success]]);
1682
1683         while (packet_success != count) {
1684                 /* Get descriptor from available ring */
1685                 desc = &vq->desc[head[packet_success]];
1686
1687                 buff = pkts[packet_success];
1688                 LOG_DEBUG(VHOST_DATA,
1689                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1690                         "pkt[%d] descriptor idx: %d\n",
1691                         dev->device_fh, packet_success,
1692                         MBUF_HEADROOM_UINT32(buff));
1693
1694                 PRINT_PACKET(dev,
1695                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1696                         + RTE_PKTMBUF_HEADROOM),
1697                         rte_pktmbuf_data_len(buff), 0);
1698
1699                 /* Buffer address translation for virtio header. */
1700                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1701                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1702
1703                 /*
1704                  * If the descriptors are chained the header and data are
1705                  * placed in separate buffers.
1706                  */
1707                 if (desc->flags & VRING_DESC_F_NEXT) {
1708                         desc->len = vq->vhost_hlen;
1709                         desc = &vq->desc[desc->next];
1710                         desc->len = rte_pktmbuf_data_len(buff);
1711                 } else {
1712                         desc->len = packet_len;
1713                 }
1714
1715                 /* Update used ring with desc information */
1716                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1717                         = head[packet_success];
1718                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1719                         = packet_len;
1720                 res_cur_idx++;
1721                 packet_success++;
1722
1723                 /* A header is required per buffer. */
1724                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1725                         (const void *)&virtio_hdr, vq->vhost_hlen);
1726
1727                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1728
1729                 if (likely(packet_success < count)) {
1730                         /* Prefetch descriptor index. */
1731                         rte_prefetch0(&vq->desc[head[packet_success]]);
1732                 }
1733         }
1734
1735         rte_compiler_barrier();
1736
1737         LOG_DEBUG(VHOST_DATA,
1738                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1739                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1740                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1741
1742         *(volatile uint16_t *)&vq->used->idx += count;
1743         vq->last_used_idx += count;
1744
1745         LOG_DEBUG(VHOST_DATA,
1746                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1747                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1748                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1749
1750         /* Kick the guest if necessary. */
1751         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1752                 eventfd_write((int)vq->kickfd, 1);
1753
1754         return count;
1755 }
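/*
 * Descriptor length accounting above, with a 64-byte frame and an assumed
 * vhost_hlen of 10 bytes for illustration: a chained descriptor pair is
 * split into 10 bytes (header) + 64 bytes (data), a single descriptor gets
 * the combined 74 bytes, and in both cases the used-ring entry reports
 * packet_len == 74.
 */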
1756
1757 /*
1758  * This function routes the TX packet to the correct interface.
1759  * This may be a local device or the physical port.
1760  */
1761 static inline void __attribute__((always_inline))
1762 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1763         uint32_t desc_idx, uint8_t need_copy)
1764 {
1765         struct mbuf_table *tx_q;
1766         struct rte_mbuf **m_table;
1767         struct rte_mbuf *mbuf = NULL;
1768         unsigned len, ret, offset = 0;
1769         struct vpool *vpool;
1770         struct virtio_net_data_ll *dev_ll = ll_root_used;
1771         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1772         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1773         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1774
1775         /*Add packet to the port tx queue*/
1776         tx_q = &tx_queue_zcp[vmdq_rx_q];
1777         len = tx_q->len;
1778
1779         /* Allocate an mbuf and populate the structure. */
1780         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1781         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1782         if (unlikely(mbuf == NULL)) {
1783                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1784                 RTE_LOG(ERR, VHOST_DATA,
1785                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1786                         dev->device_fh);
1787                 put_desc_to_used_list_zcp(vq, desc_idx);
1788                 return;
1789         }
1790
1791         if (vm2vm_mode == VM2VM_HARDWARE) {
1792                 /* Avoid using a VLAN tag from any VM for an external packet,
1793                  * e.g. vlan_tags[dev->device_fh]; otherwise it conflicts during
1794                  * pool selection: the MAC address marks it as an external packet
1795                  * that should go to the network, while the VLAN tag marks it as
1796                  * a VM2VM packet to be forwarded to another VM. The hardware
1797                  * cannot resolve such an ambiguity, so the packet would be lost.
1798                  */
1799                 vlan_tag = external_pkt_default_vlan_tag;
1800                 while (dev_ll != NULL) {
1801                         if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1802                                 ether_addr_cmp(&(pkt_hdr->d_addr),
1803                                 &dev_ll->vdev->mac_address)) {
1804
1805                                 /*
1806                                  * Drop the packet if the TX packet is destined
1807                                  * for the TX device.
1808                                  */
1809                                 if (unlikely(dev_ll->vdev->dev->device_fh
1810                                         == dev->device_fh)) {
1811                                         LOG_DEBUG(VHOST_DATA,
1812                                         "(%"PRIu64") TX: Source and destination "
1813                                         "MAC addresses are the same. Dropping "
1814                                         "packet.\n",
1815                                         dev_ll->vdev->dev->device_fh);
1816                                         MBUF_HEADROOM_UINT32(mbuf)
1817                                                 = (uint32_t)desc_idx;
1818                                         __rte_mbuf_raw_free(mbuf);
1819                                         return;
1820                                 }
1821
1822                                 /*
1823                                  * Add 4 bytes to the packet length to make
1824                                  * up for HW VLAN stripping when L2-switched back.
1825                                  */
1826                                 offset = 4;
1827                                 vlan_tag =
1828                                 (uint16_t)
1829                                 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1830
1831                                 LOG_DEBUG(VHOST_DATA,
1832                                 "(%"PRIu64") TX: pkt to local VM device id:"
1833                                 "(%"PRIu64") vlan tag: %d.\n",
1834                                 dev->device_fh, dev_ll->vdev->dev->device_fh,
1835                                 vlan_tag);
1836
1837                                 break;
1838                         }
1839                         dev_ll = dev_ll->next;
1840                 }
1841         }
1842
1843         mbuf->nb_segs = m->nb_segs;
1844         mbuf->next = m->next;
1845         mbuf->data_len = m->data_len + offset;
1846         mbuf->pkt_len = mbuf->data_len;
1847         if (unlikely(need_copy)) {
1848                 /* Copy the packet contents to the mbuf. */
1849                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1850                         rte_pktmbuf_mtod(m, void *),
1851                         m->data_len);
1852         } else {
1853                 mbuf->data_off = m->data_off;
1854                 mbuf->buf_physaddr = m->buf_physaddr;
1855                 mbuf->buf_addr = m->buf_addr;
1856         }
1857         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1858         mbuf->vlan_tci = vlan_tag;
1859         mbuf->l2_len = sizeof(struct ether_hdr);
1860         mbuf->l3_len = sizeof(struct ipv4_hdr);
1861         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1862
1863         tx_q->m_table[len] = mbuf;
1864         len++;
1865
1866         LOG_DEBUG(VHOST_DATA,
1867                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1868                 dev->device_fh,
1869                 mbuf->nb_segs,
1870                 (mbuf->next == NULL) ? "null" : "non-null");
1871
1872         if (enable_stats) {
1873                 dev_statistics[dev->device_fh].tx_total++;
1874                 dev_statistics[dev->device_fh].tx++;
1875         }
1876
1877         if (unlikely(len == MAX_PKT_BURST)) {
1878                 m_table = (struct rte_mbuf **)tx_q->m_table;
1879                 ret = rte_eth_tx_burst(ports[0],
1880                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1881
1882                 /*
1883                  * Free any buffers not handled by TX and update
1884                  * the port stats.
1885                  */
1886                 if (unlikely(ret < len)) {
1887                         do {
1888                                 rte_pktmbuf_free(m_table[ret]);
1889                         } while (++ret < len);
1890                 }
1891
1892                 len = 0;
1893                 txmbuf_clean_zcp(dev, vpool);
1894         }
1895
1896         tx_q->len = len;
1897
1898         return;
1899 }
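/*
 * VLAN tag selection sketch (tag values assumed, not taken from this file):
 * if vlan_tags[] were {1000, 1001, ...}, a packet from device 0 destined for
 * local device 2 would be tagged 1002 and L2-switched back by the NIC, while
 * an external packet keeps external_pkt_default_vlan_tag so the hardware
 * never mistakes it for VM2VM traffic.
 */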
1900
1901 /*
1902  * This function transmits all available packets in the virtio TX queue for
1903  * one virtio-net device. On the first packet it learns the MAC address and
1904  * sets up VMDQ.
1905  */
1906 static inline void __attribute__((always_inline))
1907 virtio_dev_tx_zcp(struct virtio_net *dev)
1908 {
1909         struct rte_mbuf m;
1910         struct vhost_virtqueue *vq;
1911         struct vring_desc *desc;
1912         uint64_t buff_addr = 0, phys_addr;
1913         uint32_t head[MAX_PKT_BURST];
1914         uint32_t i;
1915         uint16_t free_entries, packet_success = 0;
1916         uint16_t avail_idx;
1917         uint8_t need_copy = 0;
1918         hpa_type addr_type;
1919         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1920
1921         vq = dev->virtqueue[VIRTIO_TXQ];
1922         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1923
1924         /* If there are no available buffers then return. */
1925         if (vq->last_used_idx_res == avail_idx)
1926                 return;
1927
1928         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1929
1930         /* Prefetch available ring to retrieve head indexes. */
1931         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1932
1933         /* Get the number of free entries in the ring */
1934         free_entries = (avail_idx - vq->last_used_idx_res);
1935
1936         /* Limit to MAX_PKT_BURST. */
1937         free_entries
1938                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1939
1940         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1941                 dev->device_fh, free_entries);
1942
1943         /* Retrieve all of the head indexes first to avoid caching issues. */
1944         for (i = 0; i < free_entries; i++)
1945                 head[i]
1946                         = vq->avail->ring[(vq->last_used_idx_res + i)
1947                         & (vq->size - 1)];
1948
1949         vq->last_used_idx_res += free_entries;
1950
1951         /* Prefetch descriptor index. */
1952         rte_prefetch0(&vq->desc[head[packet_success]]);
1953         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1954
1955         while (packet_success < free_entries) {
1956                 desc = &vq->desc[head[packet_success]];
1957
1958                 /* Discard first buffer as it is the virtio header */
1959                 desc = &vq->desc[desc->next];
1960
1961                 /* Buffer address translation. */
1962                 buff_addr = gpa_to_vva(dev, desc->addr);
1963                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1964
1965                 if (likely(packet_success < (free_entries - 1)))
1966                         /* Prefetch descriptor index. */
1967                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1968
1969                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1970                         RTE_LOG(ERR, VHOST_DATA,
1971                                 "(%"PRIu64") Invalid frame buffer address found "
1972                                 "when transmitting packets!\n",
1973                                 dev->device_fh);
1974                         packet_success++;
1975                         continue;
1976                 }
1977
1978                 /* Prefetch buffer address. */
1979                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1980
1981                 /*
1982                  * Setup dummy mbuf. This is copied to a real mbuf if
1983                  * transmitted out the physical port.
1984                  */
1985                 m.data_len = desc->len;
1986                 m.nb_segs = 1;
1987                 m.next = NULL;
1988                 m.data_off = 0;
1989                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1990                 m.buf_physaddr = phys_addr;
1991
1992                 /*
1993                  * Check if the frame buffer address from guest crosses
1994                  * sub-region or not.
1995                  */
1996                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1997                         RTE_LOG(ERR, VHOST_DATA,
1998                                 "(%"PRIu64") Frame buffer address cross "
1999                                 "sub-region found when attaching TX frame "
2000                                 "buffer address!\n",
2001                                 dev->device_fh);
2002                         need_copy = 1;
2003                 } else
2004                         need_copy = 0;
2005
2006                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2007
2008                 /*
2009                  * If this is the first received packet we need to learn
2010                  * the MAC and setup VMDQ
2011                  */
2012                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2013                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2014                                 /*
2015                                  * Discard frame if device is scheduled for
2016                                  * removal or a duplicate MAC address is found.
2017                                  */
2018                                 packet_success += free_entries;
2019                                 vq->last_used_idx += packet_success;
2020                                 break;
2021                         }
2022                 }
2023
2024                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2025                 packet_success++;
2026         }
2027 }
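/*
 * The stack-allocated 'm' above is deliberately a throwaway view of the
 * guest descriptor: it is never freed or enqueued as a real mbuf. Its
 * contents are copied into a pool mbuf by virtio_tx_route_zcp() only when
 * need_copy is set (buffer crossing a sub-region); otherwise just its
 * address fields are reused and the payload stays zero copy.
 */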
2028
2029 /*
2030  * This function is called by each data core. It handles all RX/TX registered
2031  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2032  * addresses are compared with all devices in the main linked list.
2033  */
2034 static int
2035 switch_worker_zcp(__attribute__((unused)) void *arg)
2036 {
2037         struct virtio_net *dev = NULL;
2038         struct vhost_dev  *vdev = NULL;
2039         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2040         struct virtio_net_data_ll *dev_ll;
2041         struct mbuf_table *tx_q;
2042         volatile struct lcore_ll_info *lcore_ll;
2043         const uint64_t drain_tsc
2044                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2045                 * BURST_TX_DRAIN_US;
2046         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2047         unsigned ret;
2048         const uint16_t lcore_id = rte_lcore_id();
2049         uint16_t count_in_ring, rx_count = 0;
2050
2051         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2052
2053         lcore_ll = lcore_info[lcore_id].lcore_ll;
2054         prev_tsc = 0;
2055
2056         while (1) {
2057                 cur_tsc = rte_rdtsc();
2058
2059                 /* TX burst queue drain */
2060                 diff_tsc = cur_tsc - prev_tsc;
2061                 if (unlikely(diff_tsc > drain_tsc)) {
2062                         /*
2063                          * Get mbuf from vpool.pool and detach mbuf and
2064                          * put back into vpool.ring.
2065                          */
2066                         dev_ll = lcore_ll->ll_root_used;
2067                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2068                                 /* Get virtio device ID */
2069                                 vdev = dev_ll->vdev;
2070                                 dev = vdev->dev;
2071
2072                                 if (likely(!vdev->remove)) {
2073                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2074                                         if (tx_q->len) {
2075                                                 LOG_DEBUG(VHOST_DATA,
2076                                                 "TX queue drained after timeout"
2077                                                 " with burst size %u\n",
2078                                                 tx_q->len);
2079
2080                                                 /*
2081                                                  * Tx any packets in the queue
2082                                                  */
2083                                                 ret = rte_eth_tx_burst(
2084                                                         ports[0],
2085                                                         (uint16_t)tx_q->txq_id,
2086                                                         (struct rte_mbuf **)
2087                                                         tx_q->m_table,
2088                                                         (uint16_t)tx_q->len);
2089                                                 if (unlikely(ret < tx_q->len)) {
2090                                                         do {
2091                                                                 rte_pktmbuf_free(
2092                                                                         tx_q->m_table[ret]);
2093                                                         } while (++ret < tx_q->len);
2094                                                 }
2095                                                 tx_q->len = 0;
2096
2097                                                 txmbuf_clean_zcp(dev,
2098                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2099                                         }
2100                                 }
2101                                 dev_ll = dev_ll->next;
2102                         }
2103                         prev_tsc = cur_tsc;
2104                 }
2105
2106                 rte_prefetch0(lcore_ll->ll_root_used);
2107
2108                 /*
2109                  * If requested, inform the configuration core that we have exited
2110                  * the linked list and that no devices are in use.
2111                  */
2112                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2113                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2114
2115                 /* Process devices */
2116                 dev_ll = lcore_ll->ll_root_used;
2117
2118                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2119                         vdev = dev_ll->vdev;
2120                         dev  = vdev->dev;
2121                         if (unlikely(vdev->remove)) {
2122                                 dev_ll = dev_ll->next;
2123                                 unlink_vmdq(vdev);
2124                                 vdev->ready = DEVICE_SAFE_REMOVE;
2125                                 continue;
2126                         }
2127
2128                         if (likely(vdev->ready == DEVICE_RX)) {
2129                                 uint32_t index = vdev->vmdq_rx_q;
2130                                 uint16_t i;
2131                                 count_in_ring
2132                                 = rte_ring_count(vpool_array[index].ring);
2133                                 uint16_t free_entries
2134                                 = (uint16_t)get_available_ring_num_zcp(dev);
2135
2136                                 /*
2137                                  * Attach all mbufs in vpool.ring and put back
2138                                  * into vpool.pool.
2139                                  */
2140                                 for (i = 0;
2141                                 i < RTE_MIN(free_entries,
2142                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2143                                 i++)
2144                                         attach_rxmbuf_zcp(dev);
2145
2146                                 /* Handle guest RX */
2147                                 rx_count = rte_eth_rx_burst(ports[0],
2148                                         vdev->vmdq_rx_q, pkts_burst,
2149                                         MAX_PKT_BURST);
2150
2151                                 if (rx_count) {
2152                                         ret_count = virtio_dev_rx_zcp(dev,
2153                                                         pkts_burst, rx_count);
2154                                         if (enable_stats) {
2155                                                 dev_statistics[dev->device_fh].rx_total
2156                                                         += rx_count;
2157                                                 dev_statistics[dev->device_fh].rx
2158                                                         += ret_count;
2159                                         }
2160                                         while (likely(rx_count)) {
2161                                                 rx_count--;
2162                                                 pktmbuf_detach_zcp(
2163                                                         pkts_burst[rx_count]);
2164                                                 rte_ring_sp_enqueue(
2165                                                         vpool_array[index].ring,
2166                                                         (void *)pkts_burst[rx_count]);
2167                                         }
2168                                 }
2169                         }
2170
2171                         if (likely(!vdev->remove))
2172                                 /* Handle guest TX */
2173                                 virtio_dev_tx_zcp(dev);
2174
2175                         /* Move to the next device in the list */
2176                         dev_ll = dev_ll->next;
2177                 }
2178         }
2179
2180         return 0;
2181 }
2182
2183
2184 /*
2185  * Add an entry to a used linked list. A free entry must first be found
2186  * in the free linked list using get_data_ll_free_entry();
2187  */
2188 static void
2189 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2190         struct virtio_net_data_ll *ll_dev)
2191 {
2192         struct virtio_net_data_ll *ll = *ll_root_addr;
2193
2194         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2195         ll_dev->next = NULL;
2196         rte_compiler_barrier();
2197
2198         /* If ll == NULL then this is the first device. */
2199         if (ll) {
2200                 /* Increment to the tail of the linked list. */
2201                 while (ll->next != NULL)
2202                         ll = ll->next;
2203
2204                 ll->next = ll_dev;
2205         } else {
2206                 *ll_root_addr = ll_dev;
2207         }
2208 }
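/*
 * The barrier above guarantees the terminating next == NULL is visible
 * before the entry is linked in, so a polling data core can never walk off
 * the tail into a stale pointer. Typical usage (hypothetical caller, in the
 * style of the rest of this file):
 *
 *	struct virtio_net_data_ll *ll_dev
 *		= get_data_ll_free_entry(&lcore_ll->ll_root_free);
 *	if (ll_dev != NULL) {
 *		ll_dev->vdev = vdev;
 *		add_data_ll_entry(&lcore_ll->ll_root_used, ll_dev);
 *	}
 */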
2209
2210 /*
2211  * Remove an entry from a used linked list. The entry must then be added to
2212  * the free linked list using put_data_ll_free_entry().
2213  */
2214 static void
2215 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2216         struct virtio_net_data_ll *ll_dev,
2217         struct virtio_net_data_ll *ll_dev_last)
2218 {
2219         struct virtio_net_data_ll *ll = *ll_root_addr;
2220
2221         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2222                 return;
2223
2224         if (ll_dev == ll)
2225                 *ll_root_addr = ll_dev->next;
2226         else
2227                 if (likely(ll_dev_last != NULL))
2228                         ll_dev_last->next = ll_dev->next;
2229                 else
2230                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2231 }
2232
2233 /*
2234  * Find and return an entry from the free linked list.
2235  */
2236 static struct virtio_net_data_ll *
2237 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2238 {
2239         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2240         struct virtio_net_data_ll *ll_dev;
2241
2242         if (ll_free == NULL)
2243                 return NULL;
2244
2245         ll_dev = ll_free;
2246         *ll_root_addr = ll_free->next;
2247
2248         return ll_dev;
2249 }
2250
2251 /*
2252  * Place an entry back on to the free linked list.
2253  */
2254 static void
2255 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2256         struct virtio_net_data_ll *ll_dev)
2257 {
2258         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2259
2260         if (ll_dev == NULL)
2261                 return;
2262
2263         ll_dev->next = ll_free;
2264         *ll_root_addr = ll_dev;
2265 }
2266
2267 /*
2268  * Creates a linked list of a given size.
2269  */
2270 static struct virtio_net_data_ll *
2271 alloc_data_ll(uint32_t size)
2272 {
2273         struct virtio_net_data_ll *ll_new;
2274         uint32_t i;
2275
2276         /* Malloc and then chain the linked list. */
2277         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2278         if (ll_new == NULL) {
2279                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2280                 return NULL;
2281         }
2282
2283         for (i = 0; i < size - 1; i++) {
2284                 ll_new[i].vdev = NULL;
2285                 ll_new[i].next = &ll_new[i+1];
2286         }
2287         ll_new[i].next = NULL;
2288
2289         return (ll_new);
2290 }
2291
2292 /*
2293  * Create the main linked list along with each individual core's linked list. A used and a free list
2294  * are created to manage entries.
2295  */
2296 static int
2297 init_data_ll (void)
2298 {
2299         int lcore;
2300
2301         RTE_LCORE_FOREACH_SLAVE(lcore) {
2302                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2303                 if (lcore_info[lcore].lcore_ll == NULL) {
2304                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2305                         return -1;
2306                 }
2307
2308                 lcore_info[lcore].lcore_ll->device_num = 0;
2309                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2310                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2311                 if (num_devices % num_switching_cores)
2312                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2313                 else
2314                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2315         }
2316
2317         /* Allocate devices up to a maximum of MAX_DEVICES. */
2318         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2319
2320         return 0;
2321 }
2322
2323 /*
2324  * Set virtqueue flags so that we do not receive interrupts.
2325  */
2326 static void
2327 set_irq_status (struct virtio_net *dev)
2328 {
2329         dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2330         dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2331 }
2332
2333 /*
2334  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2335  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to prevent re-ordering
2336  * of dev->remove=1, which could cause an infinite loop in the rte_pause loop.
2337  */
2338 static void
2339 destroy_device (volatile struct virtio_net *dev)
2340 {
2341         struct virtio_net_data_ll *ll_lcore_dev_cur;
2342         struct virtio_net_data_ll *ll_main_dev_cur;
2343         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2344         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2345         struct vhost_dev *vdev;
2346         int lcore;
2347
2348         dev->flags &= ~VIRTIO_DEV_RUNNING;
2349
2350         vdev = (struct vhost_dev *)dev->priv;
2351         /*set the remove flag. */
2352         vdev->remove = 1;
2353         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2354                 rte_pause();
2355         }
2356
2357         /* Search for entry to be removed from lcore ll */
2358         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2359         while (ll_lcore_dev_cur != NULL) {
2360                 if (ll_lcore_dev_cur->vdev == vdev) {
2361                         break;
2362                 } else {
2363                         ll_lcore_dev_last = ll_lcore_dev_cur;
2364                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2365                 }
2366         }
2367
2368         if (ll_lcore_dev_cur == NULL) {
2369                 RTE_LOG(ERR, VHOST_CONFIG,
2370                         "(%"PRIu64") Failed to find the device to be destroyed.\n",
2371                         dev->device_fh);
2372                 return;
2373         }
2374
2375         /* Search for entry to be removed from main ll */
2376         ll_main_dev_cur = ll_root_used;
2377         ll_main_dev_last = NULL;
2378         while (ll_main_dev_cur != NULL) {
2379                 if (ll_main_dev_cur->vdev == vdev) {
2380                         break;
2381                 } else {
2382                         ll_main_dev_last = ll_main_dev_cur;
2383                         ll_main_dev_cur = ll_main_dev_cur->next;
2384                 }
2385         }
2386
2387         /* Remove entries from the lcore and main ll. */
2388         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2389         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2390
2391         /* Set the dev_removal_flag on each lcore. */
2392         RTE_LCORE_FOREACH_SLAVE(lcore) {
2393                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2394         }
2395
2396         /*
2397          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2398          * they can no longer access the device removed from the linked lists and that the devices
2399          * are no longer in use.
2400          */
2401         RTE_LCORE_FOREACH_SLAVE(lcore) {
2402                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2403                         rte_pause();
2404                 }
2405         }
2406
2407         /* Add the entries back to the lcore and main free ll.*/
2408         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2409         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2410
2411         /* Decrement number of device on the lcore. */
2412         lcore_info[vdev->coreid].lcore_ll->device_num--;
2413
2414         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2415
2416         if (zero_copy) {
2417                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2418
2419                 /* Stop the RX queue. */
2420                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2421                         LOG_DEBUG(VHOST_CONFIG,
2422                                 "(%"PRIu64") In destroy_device: Failed to stop "
2423                                 "rx queue:%d\n",
2424                                 dev->device_fh,
2425                                 vdev->vmdq_rx_q);
2426                 }
2427
2428                 LOG_DEBUG(VHOST_CONFIG,
2429                         "(%"PRIu64") in destroy_device: start returning mbufs "
2430                         "from the mempool back to the ring for RX queue: %d\n",
2431                         dev->device_fh, vdev->vmdq_rx_q);
2432
2433                 mbuf_destroy_zcp(vpool);
2434
2435                 /* Stop the TX queue. */
2436                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2437                         LOG_DEBUG(VHOST_CONFIG,
2438                                 "(%"PRIu64") In destroy_device: Failed to "
2439                                 "stop tx queue:%d\n",
2440                                 dev->device_fh, vdev->vmdq_rx_q);
2441                 }
2442
2443                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2444
2445                 LOG_DEBUG(VHOST_CONFIG,
2446                         "(%"PRIu64") destroy_device: start returning mbufs from "
2447                         "the mempool back to the ring for TX queue: %d, dev:(%"PRIu64")\n",
2448                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2449                         dev->device_fh);
2450
2451                 mbuf_destroy_zcp(vpool);
2452                 rte_free(vdev->regions_hpa);
2453         }
2454         rte_free(vdev);
2455
2456 }
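/*
 * Removal protocol recap: destroy_device() flags the device via vdev->remove,
 * spins until the data core acknowledges with DEVICE_SAFE_REMOVE, unlinks the
 * lcore and main list entries, then runs the REQUEST_DEV_REMOVAL /
 * ACK_DEV_REMOVAL handshake with every worker, so no core can still hold a
 * pointer into the removed device before its memory is freed.
 */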
2457
2458 /*
2459  * Calculate the number of physically contiguous sub-regions within one
2460  * particular region whose vhost virtual address range is contiguous. The
2461  * region starts at vva_start and spans 'size' bytes.
2462  */
2463 static uint32_t
2464 check_hpa_regions(uint64_t vva_start, uint64_t size)
2465 {
2466         uint32_t i, nregions = 0, page_size = getpagesize();
2467         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2468         if (vva_start % page_size) {
2469                 LOG_DEBUG(VHOST_CONFIG,
2470                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2471                         "has remainder\n",
2472                         (void *)(uintptr_t)vva_start, page_size);
2473                 return 0;
2474         }
2475         if (size % page_size) {
2476                 LOG_DEBUG(VHOST_CONFIG,
2477                         "in check_hpa_regions: "
2478                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2479                         size, page_size);
2480                 return 0;
2481         }
2482         for (i = 0; i < size - page_size; i = i + page_size) {
2483                 cur_phys_addr
2484                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2485                 next_phys_addr = rte_mem_virt2phy(
2486                         (void *)(uintptr_t)(vva_start + i + page_size));
2487                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2488                         ++nregions;
2489                         LOG_DEBUG(VHOST_CONFIG,
2490                                 "in check_hpa_regions: hva addr:(%p) is not "
2491                                 "continuous with hva addr:(%p), diff:%d\n",
2492                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2493                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2494                                 + page_size), page_size);
2495                         LOG_DEBUG(VHOST_CONFIG,
2496                                 "in check_hpa_regions: hpa addr:(%p) is not "
2497                                 "continuous with hpa addr:(%p), "
2498                                 "diff:(%"PRIu64")\n",
2499                                 (void *)(uintptr_t)cur_phys_addr,
2500                                 (void *)(uintptr_t)next_phys_addr,
2501                                 (next_phys_addr-cur_phys_addr));
2502                 }
2503         }
2504         return nregions;
2505 }
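/*
 * Worked example (addresses assumed): with page_size == 4096, a two-page
 * region where page 0 maps to HPA 0x10000 and page 1 maps to HPA 0x30000
 * fails the (cur_phys_addr + page_size) == next_phys_addr test
 * (0x11000 != 0x30000), so a sub-region boundary is counted between them.
 */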
2506
2507 /*
2508  * Divide each region whose vhost virtual address range is contiguous into
2509  * sub-regions such that the physical addresses within each sub-region are
2510  * contiguous, and fill the offset (relative to the GPA), size and other
2511  * information of each sub-region into regions_hpa.
2512  */
2513 static uint32_t
2514 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2515 {
2516         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2517         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2518
2519         if (mem_region_hpa == NULL)
2520                 return 0;
2521
2522         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2523                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2524                         virtio_memory->regions[regionidx].address_offset;
2525                 mem_region_hpa[regionidx_hpa].guest_phys_address
2526                         = virtio_memory->regions[regionidx].guest_phys_address;
2527                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2528                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2529                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2530                 LOG_DEBUG(VHOST_CONFIG,
2531                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2532                         regionidx_hpa,
2533                         (void *)(uintptr_t)
2534                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2535                 LOG_DEBUG(VHOST_CONFIG,
2536                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2537                         regionidx_hpa,
2538                         (void *)(uintptr_t)
2539                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
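                /*
                 * Scan the region page by page; 'k' accumulates the size of
                 * the physically contiguous sub-region built so far.
                 */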
2540                 for (i = 0, k = 0;
2541                         i < virtio_memory->regions[regionidx].memory_size -
2542                                 page_size;
2543                         i += page_size) {
2544                         cur_phys_addr = rte_mem_virt2phy(
2545                                         (void *)(uintptr_t)(vva_start + i));
2546                         next_phys_addr = rte_mem_virt2phy(
2547                                         (void *)(uintptr_t)(vva_start +
2548                                         i + page_size));
2549                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2550                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2551                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2552                                         k + page_size;
2553                                 mem_region_hpa[regionidx_hpa].memory_size
2554                                         = k + page_size;
2555                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2556                                         "phys addr end  [%d]:(%p)\n",
2557                                         regionidx_hpa,
2558                                         (void *)(uintptr_t)
2559                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2560                                 LOG_DEBUG(VHOST_CONFIG,
2561                                         "in fill_hpa_regions: guest phys addr "
2562                                         "size [%d]:(%p)\n",
2563                                         regionidx_hpa,
2564                                         (void *)(uintptr_t)
2565                                         (mem_region_hpa[regionidx_hpa].memory_size));
2566                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2567                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2568                                 ++regionidx_hpa;
2569                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2570                                         next_phys_addr -
2571                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2572                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2573                                         " phys addr start[%d]:(%p)\n",
2574                                         regionidx_hpa,
2575                                         (void *)(uintptr_t)
2576                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2577                                 LOG_DEBUG(VHOST_CONFIG,
2578                                         "in fill_hpa_regions: host  phys addr "
2579                                         "start[%d]:(%p)\n",
2580                                         regionidx_hpa,
2581                                         (void *)(uintptr_t)
2582                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2583                                 k = 0;
2584                         } else {
2585                                 k += page_size;
2586                         }
2587                 }
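                /* Close out the last (or only) sub-region of this region. */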
2588                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2589                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2590                         + k + page_size;
2591                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2592                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2593                         "[%d]:(%p)\n", regionidx_hpa,
2594                         (void *)(uintptr_t)
2595                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2596                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2597                         "[%d]:(%p)\n", regionidx_hpa,
2598                         (void *)(uintptr_t)
2599                         (mem_region_hpa[regionidx_hpa].memory_size));
2600                 ++regionidx_hpa;
2601         }
2602         return regionidx_hpa;
2603 }
2604
2605 /*
2606  * A new device is added to a data core. First the device is added to the
2607  * main linked list and then allocated to a specific data core.
2608  */
2609 static int
2610 new_device(struct virtio_net *dev)
2611 {
2612         struct virtio_net_data_ll *ll_dev;
2613         int lcore, core_add = 0;
2614         uint32_t device_num_min = num_devices;
2615         struct vhost_dev *vdev;
2616         uint32_t regionidx;
2617
2618         vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2619         if (vdev == NULL) {
2620                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2621                         dev->device_fh);
2622                 return -1;
2623         }
2624         vdev->dev = dev;
2625         dev->priv = vdev;
2626
2627         if (zero_copy) {
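                /*
                 * First pass: count the physically contiguous sub-regions so
                 * the HPA region table can be sized before it is filled in.
                 */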
2628                 vdev->nregions_hpa = dev->mem->nregions;
2629                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2630                         vdev->nregions_hpa
2631                                 += check_hpa_regions(
2632                                         dev->mem->regions[regionidx].guest_phys_address
2633                                         + dev->mem->regions[regionidx].address_offset,
2634                                         dev->mem->regions[regionidx].memory_size);
2635
2636                 }
2637
2638                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2639                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2640                         CACHE_LINE_SIZE);
2641                 if (vdev->regions_hpa == NULL) {
2642                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2643                         rte_free(vdev);
2644                         return -1;
2645                 }
2646
2647
2648                 if (fill_hpa_memory_regions(
2649                         vdev->regions_hpa, dev->mem
2650                         ) != vdev->nregions_hpa) {
2651
2652                         RTE_LOG(ERR, VHOST_CONFIG,
2653                                 "hpa memory regions number mismatch: "
2654                                 "[%d]\n", vdev->nregions_hpa);
2655                         rte_free(vdev->regions_hpa);
2656                         rte_free(vdev);
2657                         return -1;
2658                 }
2659         }
2660
2661
2662         /* Add device to main ll */
2663         ll_dev = get_data_ll_free_entry(&ll_root_free);
2664         if (ll_dev == NULL) {
2665                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2666                         "of %d devices per core has been reached\n",
2667                         dev->device_fh, num_devices);
2668                 if (vdev->regions_hpa)
2669                         rte_free(vdev->regions_hpa);
2670                 rte_free(vdev);
2671                 return -1;
2672         }
2673         ll_dev->vdev = vdev;
2674         add_data_ll_entry(&ll_root_used, ll_dev);
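        /* Assign this device a dedicated VMDq RX queue, derived from its fh. */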
2675         vdev->vmdq_rx_q
2676                 = dev->device_fh * (num_queues / num_devices);
2677
2678         if (zero_copy) {
2679                 uint32_t index = vdev->vmdq_rx_q;
2680                 uint32_t count_in_ring, i;
2681                 struct mbuf_table *tx_q;
2682
2683                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2684
2685                 LOG_DEBUG(VHOST_CONFIG,
2686                         "(%"PRIu64") in new_device: mbuf count in mempool "
2687                         "before attach is: %d\n",
2688                         dev->device_fh,
2689                         rte_mempool_count(vpool_array[index].pool));
2690                 LOG_DEBUG(VHOST_CONFIG,
2691                         "(%"PRIu64") in new_device: mbuf count in ring "
2692                         "before attach is: %d\n",
2693                         dev->device_fh, count_in_ring);
2694
2695                 /*
2696                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2697                  */
2698                 for (i = 0; i < count_in_ring; i++)
2699                         attach_rxmbuf_zcp(dev);
2700
2701                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2702                         "mempool after attach is: %d\n",
2703                         dev->device_fh,
2704                         rte_mempool_count(vpool_array[index].pool));
2705                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2706                         "ring after attach is: %d\n",
2707                         dev->device_fh,
2708                         rte_ring_count(vpool_array[index].ring));
2709
2710                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2711                 tx_q->txq_id = vdev->vmdq_rx_q;
2712
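                /*
                 * The queues were configured with deferred start (see MAIN);
                 * start them now that guest mbufs are available.
                 */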
2713                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2714                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2715
2716                         LOG_DEBUG(VHOST_CONFIG,
2717                                 "(%"PRIu64") In new_device: Failed to start "
2718                                 "tx queue:%d\n",
2719                                 dev->device_fh, vdev->vmdq_rx_q);
2720
2721                         mbuf_destroy_zcp(vpool);
2722                         rte_free(vdev->regions_hpa);
2723                         rte_free(vdev);
2724                         return -1;
2725                 }
2726
2727                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2728                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2729
2730                         LOG_DEBUG(VHOST_CONFIG,
2731                                 "(%"PRIu64") In new_device: Failed to start "
2732                                 "rx queue:%d\n",
2733                                 dev->device_fh, vdev->vmdq_rx_q);
2734
2735                         /* Stop the TX queue. */
2736                         if (rte_eth_dev_tx_queue_stop(ports[0],
2737                                 vdev->vmdq_rx_q) != 0) {
2738                                 LOG_DEBUG(VHOST_CONFIG,
2739                                         "(%"PRIu64") In new_device: Failed to "
2740                                         "stop tx queue:%d\n",
2741                                         dev->device_fh, vdev->vmdq_rx_q);
2742                         }
2743
2744                         mbuf_destroy_zcp(vpool);
2745                         rte_free(vdev->regions_hpa);
2746                         rte_free(vdev);
2747                         return -1;
2748                 }
2749
2750         }
2751
2752         /* Reset the ready flag. */
2753         vdev->ready = DEVICE_MAC_LEARNING;
2754         vdev->remove = 0;
2755
2756         /* Find a suitable lcore to add the device. */
2757         RTE_LCORE_FOREACH_SLAVE(lcore) {
2758                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2759                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2760                         core_add = lcore;
2761                 }
2762         }
2763         /* Add device to lcore ll */
2764         ll_dev->dev->coreid = core_add;
2765         ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2766         if (ll_dev == NULL) {
2767                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2768                 vdev->ready = DEVICE_SAFE_REMOVE;
2769                 destroy_device(dev);
2770                 if (vdev->regions_hpa)
2771                         rte_free(vdev->regions_hpa);
2772                 rte_free(vdev);
2773                 return -1;
2774         }
2775         ll_dev->vdev = vdev;
2776         vdev->coreid = core_add;
2777
2778         add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2779
2780         /* Initialize device stats */
2781         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2782
2783         /* Disable notifications. */
2784         set_irq_status(dev);
2785         lcore_info[vdev->coreid].lcore_ll->device_num++;
2786         dev->flags |= VIRTIO_DEV_RUNNING;
2787
2788         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2789
2790         return 0;
2791 }
2792
2793 /*
2794  * These callbacks allow devices to be added to the data core when their
2795  * configuration has been fully completed; they are registered in MAIN below.
2796  */
2797 static const struct virtio_net_device_ops virtio_net_device_ops =
2798 {
2799         .new_device =  new_device,
2800         .destroy_device = destroy_device,
2801 };
2802
2803 /*
2804  * This thread wakes up periodically, every 'enable_stats' seconds, and prints
2805  * per-device statistics if the user has enabled them.
2806  */
2807 static void
2808 print_stats(void)
2809 {
2810         struct virtio_net_data_ll *dev_ll;
2811         uint64_t tx_dropped, rx_dropped;
2812         uint64_t tx, tx_total, rx, rx_total;
2813         uint32_t device_fh;
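        /* ANSI escape sequences: clear the screen and home the cursor. */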
2814         const char clr[] = { 27, '[', '2', 'J', '\0' };
2815         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2816
2817         while (1) {
2818                 sleep(enable_stats);
2819
2820                 /* Clear screen and move to top left */
2821                 printf("%s%s", clr, top_left);
2822
2823                 printf("\nDevice statistics ====================================");
2824
2825                 dev_ll = ll_root_used;
2826                 while (dev_ll != NULL) {
2827                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2828                         tx_total = dev_statistics[device_fh].tx_total;
2829                         tx = dev_statistics[device_fh].tx;
2830                         tx_dropped = tx_total - tx;
2831                         if (zero_copy == 0) {
2832                                 rx_total = rte_atomic64_read(
2833                                         &dev_statistics[device_fh].rx_total_atomic);
2834                                 rx = rte_atomic64_read(
2835                                         &dev_statistics[device_fh].rx_atomic);
2836                         } else {
2837                                 rx_total = dev_statistics[device_fh].rx_total;
2838                                 rx = dev_statistics[device_fh].rx;
2839                         }
2840                         rx_dropped = rx_total - rx;
2841
2842                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2843                                         "\nTX total:            %"PRIu64""
2844                                         "\nTX dropped:          %"PRIu64""
2845                                         "\nTX successful:       %"PRIu64""
2846                                         "\nRX total:            %"PRIu64""
2847                                         "\nRX dropped:          %"PRIu64""
2848                                         "\nRX successful:       %"PRIu64"",
2849                                         device_fh,
2850                                         tx_total,
2851                                         tx_dropped,
2852                                         tx,
2853                                         rx_total,
2854                                         rx_dropped,
2855                                         rx);
2856
2857                         dev_ll = dev_ll->next;
2858                 }
2859                 printf("\n======================================================\n");
2860         }
2861 }
2862
2863 static void
2864 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2865         char *ring_name, uint32_t nb_mbuf)
2866 {
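        /*
         * Create a mempool for this queue's zero-copy mbufs, plus a single-
         * producer/single-consumer ring that holds the mbufs not currently
         * attached to guest buffers.
         */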
2867         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2868         vpool_array[index].pool
2869                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2870                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2871                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2872                 rte_pktmbuf_init, NULL, socket, 0);
2873         if (vpool_array[index].pool != NULL) {
2874                 vpool_array[index].ring
2875                         = rte_ring_create(ring_name,
2876                                 rte_align32pow2(nb_mbuf + 1),
2877                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2878                 if (likely(vpool_array[index].ring != NULL)) {
2879                         LOG_DEBUG(VHOST_CONFIG,
2880                                 "in setup_mempool_tbl: mbuf count in "
2881                                 "mempool is: %d\n",
2882                                 rte_mempool_count(vpool_array[index].pool));
2883                         LOG_DEBUG(VHOST_CONFIG,
2884                                 "in setup_mempool_tbl: mbuf count in "
2885                                 "ring   is: %d\n",
2886                                 rte_ring_count(vpool_array[index].ring));
2887                 } else {
2888                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2889                                 ring_name);
2890                 }
2891
2892                 /* Need to account for the headroom. */
2893                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2894         } else {
2895                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2896         }
2897 }
2898
2899
2900 /*
2901  * Main function: performs initialisation and launches the per-lcore
2902  * functions. The CUSE device is also registered here to handle IOCTLs.
2903  */
2904 int
2905 MAIN(int argc, char *argv[])
2906 {
2907         struct rte_mempool *mbuf_pool = NULL;
2908         unsigned lcore_id, core_id = 0;
2909         unsigned nb_ports, valid_num_ports;
2910         int ret;
2911         uint8_t portid, queue_id = 0;
2912         static pthread_t tid;
2913
2914         /* init EAL */
2915         ret = rte_eal_init(argc, argv);
2916         if (ret < 0)
2917                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2918         argc -= ret;
2919         argv += ret;
2920
2921         /* parse app arguments */
2922         ret = us_vhost_parse_args(argc, argv);
2923         if (ret < 0)
2924                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2925
2926         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2927                 if (rte_lcore_is_enabled(lcore_id))
2928                         lcore_ids[core_id++] = lcore_id;
2929
2930         if (rte_lcore_count() > RTE_MAX_LCORE)
2931                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2932
2933         /* Set the number of switching cores available. */
2934         num_switching_cores = rte_lcore_count()-1;
2935
2936         /* Get the number of physical ports. */
2937         nb_ports = rte_eth_dev_count();
2938         if (nb_ports > RTE_MAX_ETHPORTS)
2939                 nb_ports = RTE_MAX_ETHPORTS;
2940
2941         /*
2942          * Update the global variable num_ports and the global array ports[],
2943          * and derive valid_num_ports from the number of ports in the system.
2944          */
2945         valid_num_ports = check_ports_num(nb_ports);
2946
2947         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2948                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2949                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2950                 return -1;
2951         }
2952
2953         if (zero_copy == 0) {
2954                 /* Create the mbuf pool. */
2955                 mbuf_pool = rte_mempool_create(
2956                                 "MBUF_POOL",
2957                                 NUM_MBUFS_PER_PORT
2958                                 * valid_num_ports,
2959                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2960                                 sizeof(struct rte_pktmbuf_pool_private),
2961                                 rte_pktmbuf_pool_init, NULL,
2962                                 rte_pktmbuf_init, NULL,
2963                                 rte_socket_id(), 0);
2964                 if (mbuf_pool == NULL)
2965                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2966
2967                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2968                         vpool_array[queue_id].pool = mbuf_pool;
2969
2970                 if (vm2vm_mode == VM2VM_HARDWARE) {
2971                         /* Enable VT loop back to let L2 switch to do it. */
2972                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2973                         LOG_DEBUG(VHOST_CONFIG,
2974                                 "Enable loop back for L2 switch in vmdq.\n");
2975                 }
2976         } else {
2977                 uint32_t nb_mbuf;
2978                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2979                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2980
2981                 /*
2982                  * Zero copy defers queue RX/TX start to the time when guest
2983                  * finishes its startup and packet buffers from that guest are
2984                  * available.
2985                  */
2986                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2987                 rx_conf_default.rx_drop_en = 0;
2988                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
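                /*
                 * Per-queue mbuf budget: one mbuf per descriptor plus
                 * per-core cache and burst headroom.
                 */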
2989                 nb_mbuf = num_rx_descriptor
2990                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2991                         + num_switching_cores * MAX_PKT_BURST;
2992
2993                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2994                         snprintf(pool_name, sizeof(pool_name),
2995                                 "rxmbuf_pool_%u", queue_id);
2996                         snprintf(ring_name, sizeof(ring_name),
2997                                 "rxmbuf_ring_%u", queue_id);
2998                         setup_mempool_tbl(rte_socket_id(), queue_id,
2999                                 pool_name, ring_name, nb_mbuf);
3000                 }
3001
3002                 nb_mbuf = num_tx_descriptor
3003                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3004                                 + num_switching_cores * MAX_PKT_BURST;
3005
3006                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3007                         snprintf(pool_name, sizeof(pool_name),
3008                                 "txmbuf_pool_%u", queue_id);
3009                         snprintf(ring_name, sizeof(ring_name),
3010                                 "txmbuf_ring_%u", queue_id);
3011                         setup_mempool_tbl(rte_socket_id(),
3012                                 (queue_id + MAX_QUEUES),
3013                                 pool_name, ring_name, nb_mbuf);
3014                 }
3015
3016                 if (vm2vm_mode == VM2VM_HARDWARE) {
3017                         /* Enable VT loop back to let L2 switch to do it. */
3018                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3019                         LOG_DEBUG(VHOST_CONFIG,
3020                                 "Enable loop back for L2 switch in vmdq.\n");
3021                 }
3022         }
3023         /* Set log level. */
3024         rte_set_log_level(LOG_LEVEL);
3025
3026         /* initialize all ports */
3027         for (portid = 0; portid < nb_ports; portid++) {
3028                 /* skip ports that are not enabled */
3029                 if ((enabled_port_mask & (1 << portid)) == 0) {
3030                         RTE_LOG(INFO, VHOST_PORT,
3031                                 "Skipping disabled port %d\n", portid);
3032                         continue;
3033                 }
3034                 if (port_init(portid) != 0)
3035                         rte_exit(EXIT_FAILURE,
3036                                 "Cannot initialize network ports\n");
3037         }
3038
3039         /* Initialise all linked lists. */
3040         if (init_data_ll() == -1)
3041                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3042
3043         /* Initialize device stats */
3044         memset(&dev_statistics, 0, sizeof(dev_statistics));
3045
3046         /* Enable stats if the user option is set. */
3047         if (enable_stats)
3048                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3049
3050         /* Launch all data cores. */
3051         if (zero_copy == 0) {
3052                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3053                         rte_eal_remote_launch(switch_worker,
3054                                 mbuf_pool, lcore_id);
3055                 }
3056         } else {
3057                 uint32_t count_in_mempool, index, i;
3058                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3059                         /* For all RX and TX queues. */
3060                         count_in_mempool
3061                                 = rte_mempool_count(vpool_array[index].pool);
3062
3063                         /*
3064                          * Transfer all unattached mbufs from vpool.pool
3065                          * to vpool.ring.
3066                          */
3067                         for (i = 0; i < count_in_mempool; i++) {
3068                                 struct rte_mbuf *mbuf
3069                                         = __rte_mbuf_raw_alloc(
3070                                                 vpool_array[index].pool);
3071                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3072                                                 (void *)mbuf);
3073                         }
3074
3075                         LOG_DEBUG(VHOST_CONFIG,
3076                                 "in MAIN: mbuf count in mempool at initial "
3077                                 "is: %d\n", count_in_mempool);
3078                         LOG_DEBUG(VHOST_CONFIG,
3079                                 "in MAIN: mbuf count in ring at initial is: "
3080                                 "%d\n",
3081                                 rte_ring_count(vpool_array[index].ring));
3082                 }
3083
3084                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3085                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3086                                 lcore_id);
3087         }
3088
3089         /* Register CUSE device to handle IOCTLs. */
3090         ret = rte_vhost_driver_register((char *)&dev_basename);
3091         if (ret != 0)
3092                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3093
3094         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3095
3096         /* Start CUSE session. */
3097         rte_vhost_driver_session_start();
3098         return 0;
3099
3100 }
3101