examples/vhost: fix packet length
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 128
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +             \
65                                                         (num_switching_cores*MAX_PKT_BURST) +                   \
66                                                         (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67                                                         (num_switching_cores*MBUF_CACHE_SIZE))
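/*
 * Illustrative sizing only (not part of the original code): with the defaults
 * defined below (RTE_TEST_RX_DESC_DEFAULT = 1024, RTE_TEST_TX_DESC_DEFAULT = 512,
 * MAX_PKT_BURST = 32, MBUF_CACHE_SIZE = 128) and an assumed num_switching_cores
 * of 2, the macro expands to 128*1024 + 2*32 + 2*512 + 2*128 = 132416 mbufs
 * per port. The actual value depends on the run-time core count.
 */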
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79         + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100
101 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
103
104 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
106
107 #define JUMBO_FRAME_MAX_SIZE    0x2600
108
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX                       1
112 #define DEVICE_SAFE_REMOVE      2
113
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121
122 /*
123  * These two macros need refining for the legacy and DPDK based front ends:
124  * take the max vring avail descriptors/entries from the guest, subtract
125  * MAX_PKT_BURST, and then adjust to a power of 2.
126  */
127 /*
128  * For the legacy front end, 128 descriptors:
129  * half for virtio headers, the other half for mbufs.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136                 + sizeof(struct rte_mbuf)))
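/*
 * Illustrative usage sketch (an assumption, not taken from this file): the
 * macro yields an lvalue, so the zero copy path could stash a small per-mbuf
 * value such as a descriptor index in those 4 headroom bytes:
 *
 *     MBUF_HEADROOM_UINT32(mbuf) = desc_idx;      // store
 *     uint32_t idx = MBUF_HEADROOM_UINT32(mbuf);  // load
 */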
137
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
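/*
 * For example, POWEROF2(64) and POWEROF2(1) hold while POWEROF2(48) does not;
 * note that 0 also passes the check, since ((0 - 1) & 0) == 0.
 */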
140
141 #define INVALID_PORT_ID 0xFF
142
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
157
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
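/*
 * Worked example, assuming a 64-byte cache line and the standard 16-byte
 * struct vring_desc (8-byte addr, 4-byte len, 2-byte flags, 2-byte next):
 * DESC_PER_CACHELINE = 64 / 16 = 4 descriptors per cache line.
 */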
160
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163
164 /*Number of switching cores enabled*/
165 static uint32_t num_switching_cores = 0;
166
167 /* number of devices/queues to support*/
168 static uint32_t num_queues = 0;
169 static uint32_t num_devices;
170
171 /*
172  * Enable zero copy: packet buffers are DMA'd directly to the HW descriptors;
173  * disabled by default.
174  */
175 static uint32_t zero_copy;
176 static int mergeable;
177
178 /* number of descriptors to apply*/
179 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
180 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
181
182 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
183 #define MAX_RING_DESC 4096
184
185 struct vpool {
186         struct rte_mempool *pool;
187         struct rte_ring *ring;
188         uint32_t buf_size;
189 } vpool_array[MAX_QUEUES+MAX_QUEUES];
190
191 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
192 typedef enum {
193         VM2VM_DISABLED = 0,
194         VM2VM_SOFTWARE = 1,
195         VM2VM_HARDWARE = 2,
196         VM2VM_LAST
197 } vm2vm_type;
198 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
199
200 /* The type of host physical address translated from guest physical address. */
201 typedef enum {
202         PHYS_ADDR_CONTINUOUS = 0,
203         PHYS_ADDR_CROSS_SUBREG = 1,
204         PHYS_ADDR_INVALID = 2,
205         PHYS_ADDR_LAST
206 } hpa_type;
207
208 /* Enable stats. */
209 static uint32_t enable_stats = 0;
210 /* Enable retries on RX. */
211 static uint32_t enable_retry = 1;
212 /* Specify timeout (in microseconds) between retries on RX. */
213 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
214 /* Specify the number of retries on RX. */
215 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
216
217 /* Character device basename. Can be set by user. */
218 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
219
220
221 /* Default configuration for rx and tx thresholds etc. */
222 static struct rte_eth_rxconf rx_conf_default = {
223         .rx_thresh = {
224                 .pthresh = RX_PTHRESH,
225                 .hthresh = RX_HTHRESH,
226                 .wthresh = RX_WTHRESH,
227         },
228         .rx_drop_en = 1,
229 };
230
231 /*
232  * These default values are optimized for use with the Intel(R) 82599 10 GbE
233  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
234  * network controllers and/or network drivers.
235  */
236 static struct rte_eth_txconf tx_conf_default = {
237         .tx_thresh = {
238                 .pthresh = TX_PTHRESH,
239                 .hthresh = TX_HTHRESH,
240                 .wthresh = TX_WTHRESH,
241         },
242         .tx_free_thresh = 0, /* Use PMD default values */
243         .tx_rs_thresh = 0, /* Use PMD default values */
244 };
245
246 /* Empty VMDQ configuration structure. Filled in programmatically. */
247 static struct rte_eth_conf vmdq_conf_default = {
248         .rxmode = {
249                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
250                 .split_hdr_size = 0,
251                 .header_split   = 0, /**< Header Split disabled */
252                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
253                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
254                 /*
255                  * This is necessary for 1G NICs such as the I350;
256                  * it fixes a bug where IPv4 forwarding in the guest could not
257                  * forward packets from one virtio device to another.
258                  */
259                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
260                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
261                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
262         },
263
264         .txmode = {
265                 .mq_mode = ETH_MQ_TX_NONE,
266         },
267         .rx_adv_conf = {
268                 /*
269                  * should be overridden separately in code with
270                  * appropriate values
271                  */
272                 .vmdq_rx_conf = {
273                         .nb_queue_pools = ETH_8_POOLS,
274                         .enable_default_pool = 0,
275                         .default_pool = 0,
276                         .nb_pool_maps = 0,
277                         .pool_map = {{0, 0},},
278                 },
279         },
280 };
281
282 static unsigned lcore_ids[RTE_MAX_LCORE];
283 static uint8_t ports[RTE_MAX_ETHPORTS];
284 static unsigned num_ports = 0; /**< The number of ports specified in command line */
285
286 static const uint16_t external_pkt_default_vlan_tag = 2000;
287 const uint16_t vlan_tags[] = {
288         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
289         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
290         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
291         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
292         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
293         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
294         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
295         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
296 };
297
298 /* ethernet addresses of ports */
299 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
300
301 /* heads for the main used and free linked lists for the data path. */
302 static struct virtio_net_data_ll *ll_root_used = NULL;
303 static struct virtio_net_data_ll *ll_root_free = NULL;
304
305 /* Array of data core structures containing information on individual core linked lists. */
306 static struct lcore_info lcore_info[RTE_MAX_LCORE];
307
308 /* Used for queueing bursts of TX packets. */
309 struct mbuf_table {
310         unsigned len;
311         unsigned txq_id;
312         struct rte_mbuf *m_table[MAX_PKT_BURST];
313 };
314
315 /* TX queue for each data core. */
316 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
317
318 /* TX queue for each virtio device for zero copy. */
319 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
320
321 /* Vlan header struct used to insert vlan tags on TX. */
322 struct vlan_ethhdr {
323         unsigned char   h_dest[ETH_ALEN];
324         unsigned char   h_source[ETH_ALEN];
325         __be16          h_vlan_proto;
326         __be16          h_vlan_TCI;
327         __be16          h_vlan_encapsulated_proto;
328 };
329
330 /* IPv4 Header */
331 struct ipv4_hdr {
332         uint8_t  version_ihl;           /**< version and header length */
333         uint8_t  type_of_service;       /**< type of service */
334         uint16_t total_length;          /**< length of packet */
335         uint16_t packet_id;             /**< packet ID */
336         uint16_t fragment_offset;       /**< fragmentation offset */
337         uint8_t  time_to_live;          /**< time to live */
338         uint8_t  next_proto_id;         /**< protocol ID */
339         uint16_t hdr_checksum;          /**< header checksum */
340         uint32_t src_addr;              /**< source address */
341         uint32_t dst_addr;              /**< destination address */
342 } __attribute__((__packed__));
343
344 /* Header lengths. */
345 #define VLAN_HLEN       4
346 #define VLAN_ETH_HLEN   18
347
348 /* Per-device statistics struct */
349 struct device_statistics {
350         uint64_t tx_total;
351         rte_atomic64_t rx_total_atomic;
352         uint64_t rx_total;
353         uint64_t tx;
354         rte_atomic64_t rx_atomic;
355         uint64_t rx;
356 } __rte_cache_aligned;
357 struct device_statistics dev_statistics[MAX_DEVICES];
358
359 /*
360  * Builds up the correct configuration for VMDQ VLAN pool map
361  * according to the pool & queue limits.
362  */
363 static inline int
364 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
365 {
366         struct rte_eth_vmdq_rx_conf conf;
367         unsigned i;
368
369         memset(&conf, 0, sizeof(conf));
370         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
371         conf.nb_pool_maps = num_devices;
372         conf.enable_loop_back =
373                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
374
375         for (i = 0; i < conf.nb_pool_maps; i++) {
376                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
377                 conf.pool_map[i].pools = (1UL << i);
378         }
379
380         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
381         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
382                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
383         return 0;
384 }
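/*
 * Illustrative result (hypothetical values, not from the original code): if
 * the NIC reports 8 VMDQ pools, the configuration built above ends up as
 *
 *     conf.nb_queue_pools = 8;
 *     conf.pool_map[0] = { .vlan_id = 1000, .pools = 0x01 };
 *     conf.pool_map[1] = { .vlan_id = 1001, .pools = 0x02 };
 *     ...
 *     conf.pool_map[7] = { .vlan_id = 1007, .pools = 0x80 };
 *
 * i.e. each guest device gets its own pool, selected by one entry of
 * vlan_tags[].
 */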
385
386 /*
387  * Validate the device number against the max pool number obtained from
388  * dev_info. If the device number is invalid, print an error message and
389  * return -1. Each device must have its own pool.
390  */
391 static inline int
392 validate_num_devices(uint32_t max_nb_devices)
393 {
394         if (num_devices > max_nb_devices) {
395                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
396                 return -1;
397         }
398         return 0;
399 }
400
401 /*
402  * Initialises a given port using global settings and with the rx buffers
403  * coming from the mbuf_pool passed as a parameter.
404  */
405 static inline int
406 port_init(uint8_t port)
407 {
408         struct rte_eth_dev_info dev_info;
409         struct rte_eth_conf port_conf;
410         uint16_t rx_rings, tx_rings;
411         uint16_t rx_ring_size, tx_ring_size;
412         int retval;
413         uint16_t q;
414
415         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
416         rte_eth_dev_info_get (port, &dev_info);
417
418         /*configure the number of supported virtio devices based on VMDQ limits */
419         num_devices = dev_info.max_vmdq_pools;
420         num_queues = dev_info.max_rx_queues;
421
422         if (zero_copy) {
423                 rx_ring_size = num_rx_descriptor;
424                 tx_ring_size = num_tx_descriptor;
425                 tx_rings = dev_info.max_tx_queues;
426         } else {
427                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
428                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
429                 tx_rings = (uint16_t)rte_lcore_count();
430         }
431
432         retval = validate_num_devices(MAX_DEVICES);
433         if (retval < 0)
434                 return retval;
435
436         /* Get port configuration. */
437         retval = get_eth_conf(&port_conf, num_devices);
438         if (retval < 0)
439                 return retval;
440
441         if (port >= rte_eth_dev_count()) return -1;
442
443         rx_rings = (uint16_t)num_queues;
444         /* Configure ethernet device. */
445         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
446         if (retval != 0)
447                 return retval;
448
449         /* Setup the queues. */
450         for (q = 0; q < rx_rings; q ++) {
451                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
452                                                 rte_eth_dev_socket_id(port), &rx_conf_default,
453                                                 vpool_array[q].pool);
454                 if (retval < 0)
455                         return retval;
456         }
457         for (q = 0; q < tx_rings; q ++) {
458                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
459                                                 rte_eth_dev_socket_id(port), &tx_conf_default);
460                 if (retval < 0)
461                         return retval;
462         }
463
464         /* Start the device. */
465         retval  = rte_eth_dev_start(port);
466         if (retval < 0) {
467                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
468                 return retval;
469         }
470
471         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
472         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
473         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
474                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
475                         (unsigned)port,
476                         vmdq_ports_eth_addr[port].addr_bytes[0],
477                         vmdq_ports_eth_addr[port].addr_bytes[1],
478                         vmdq_ports_eth_addr[port].addr_bytes[2],
479                         vmdq_ports_eth_addr[port].addr_bytes[3],
480                         vmdq_ports_eth_addr[port].addr_bytes[4],
481                         vmdq_ports_eth_addr[port].addr_bytes[5]);
482
483         return 0;
484 }
485
486 /*
487  * Set character device basename.
488  */
489 static int
490 us_vhost_parse_basename(const char *q_arg)
491 {
492         /* validate the basename string length */
493
494         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
495                 return -1;
496         else
497                 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
498
499         return 0;
500 }
501
502 /*
503  * Parse the portmask provided at run time.
504  */
505 static int
506 parse_portmask(const char *portmask)
507 {
508         char *end = NULL;
509         unsigned long pm;
510
511         errno = 0;
512
513         /* parse hexadecimal string */
514         pm = strtoul(portmask, &end, 16);
515         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
516                 return -1;
517
518         if (pm == 0)
519                 return -1;
520
521         return pm;
522
523 }
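/*
 * Usage sketch (hypothetical command lines): "-p 0x1" enables port 0 only and
 * "-p 0x3" enables ports 0 and 1; since MAX_SUP_PORTS is 1, only a mask that
 * selects a single port is ultimately accepted by us_vhost_parse_args().
 */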
524
525 /*
526  * Parse num options at run time.
527  */
528 static int
529 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
530 {
531         char *end = NULL;
532         unsigned long num;
533
534         errno = 0;
535
536         /* parse unsigned int string */
537         num = strtoul(q_arg, &end, 10);
538         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
539                 return -1;
540
541         if (num > max_valid_value)
542                 return -1;
543
544         return num;
545
546 }
547
548 /*
549  * Display usage
550  */
551 static void
552 us_vhost_usage(const char *prgname)
553 {
554         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
555         "               --vm2vm [0|1|2]\n"
556         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
557         "               --dev-basename <name>\n"
558         "               --nb-devices ND\n"
559         "               -p PORTMASK: Set mask for ports to be used by application\n"
560         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
561         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
562         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
563         "               --rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
564         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
565         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
566         "               --dev-basename: The basename to be used for the character device.\n"
567         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
568                         "zero copy\n"
569         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
570                         "used only when zero copy is enabled.\n"
571         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
572                         "used only when zero copy is enabled.\n",
573                prgname);
574 }
575
576 /*
577  * Parse the arguments given in the command line of the application.
578  */
579 static int
580 us_vhost_parse_args(int argc, char **argv)
581 {
582         int opt, ret;
583         int option_index;
584         unsigned i;
585         const char *prgname = argv[0];
586         static struct option long_option[] = {
587                 {"vm2vm", required_argument, NULL, 0},
588                 {"rx-retry", required_argument, NULL, 0},
589                 {"rx-retry-delay", required_argument, NULL, 0},
590                 {"rx-retry-num", required_argument, NULL, 0},
591                 {"mergeable", required_argument, NULL, 0},
592                 {"stats", required_argument, NULL, 0},
593                 {"dev-basename", required_argument, NULL, 0},
594                 {"zero-copy", required_argument, NULL, 0},
595                 {"rx-desc-num", required_argument, NULL, 0},
596                 {"tx-desc-num", required_argument, NULL, 0},
597                 {NULL, 0, 0, 0},
598         };
599
600         /* Parse command line */
601         while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
602                 switch (opt) {
603                 /* Portmask */
604                 case 'p':
605                         enabled_port_mask = parse_portmask(optarg);
606                         if (enabled_port_mask == 0) {
607                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
608                                 us_vhost_usage(prgname);
609                                 return -1;
610                         }
611                         break;
612
613                 case 0:
614                         /* Enable/disable vm2vm comms. */
615                         if (!strncmp(long_option[option_index].name, "vm2vm",
616                                 MAX_LONG_OPT_SZ)) {
617                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
618                                 if (ret == -1) {
619                                         RTE_LOG(INFO, VHOST_CONFIG,
620                                                 "Invalid argument for "
621                                                 "vm2vm [0|1|2]\n");
622                                         us_vhost_usage(prgname);
623                                         return -1;
624                                 } else {
625                                         vm2vm_mode = (vm2vm_type)ret;
626                                 }
627                         }
628
629                         /* Enable/disable retries on RX. */
630                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
631                                 ret = parse_num_opt(optarg, 1);
632                                 if (ret == -1) {
633                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
634                                         us_vhost_usage(prgname);
635                                         return -1;
636                                 } else {
637                                         enable_retry = ret;
638                                 }
639                         }
640
641                         /* Specify the delay time (in microseconds) between retries on RX. */
642                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
643                                 ret = parse_num_opt(optarg, INT32_MAX);
644                                 if (ret == -1) {
645                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
646                                         us_vhost_usage(prgname);
647                                         return -1;
648                                 } else {
649                                         burst_rx_delay_time = ret;
650                                 }
651                         }
652
653                         /* Specify the retries number on RX. */
654                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
655                                 ret = parse_num_opt(optarg, INT32_MAX);
656                                 if (ret == -1) {
657                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
658                                         us_vhost_usage(prgname);
659                                         return -1;
660                                 } else {
661                                         burst_rx_retry_num = ret;
662                                 }
663                         }
664
665                         /* Enable/disable RX mergeable buffers. */
666                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
667                                 ret = parse_num_opt(optarg, 1);
668                                 if (ret == -1) {
669                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
670                                         us_vhost_usage(prgname);
671                                         return -1;
672                                 } else {
673                                         mergeable = !!ret;
674                                         if (ret) {
675                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
676                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
677                                                         = JUMBO_FRAME_MAX_SIZE;
678                                         }
679                                 }
680                         }
681
682                         /* Enable/disable stats. */
683                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
684                                 ret = parse_num_opt(optarg, INT32_MAX);
685                                 if (ret == -1) {
686                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
687                                         us_vhost_usage(prgname);
688                                         return -1;
689                                 } else {
690                                         enable_stats = ret;
691                                 }
692                         }
693
694                         /* Set character device basename. */
695                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
696                                 if (us_vhost_parse_basename(optarg) == -1) {
697                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
698                                         us_vhost_usage(prgname);
699                                         return -1;
700                                 }
701                         }
702
703                         /* Enable/disable rx/tx zero copy. */
704                         if (!strncmp(long_option[option_index].name,
705                                 "zero-copy", MAX_LONG_OPT_SZ)) {
706                                 ret = parse_num_opt(optarg, 1);
707                                 if (ret == -1) {
708                                         RTE_LOG(INFO, VHOST_CONFIG,
709                                                 "Invalid argument"
710                                                 " for zero-copy [0|1]\n");
711                                         us_vhost_usage(prgname);
712                                         return -1;
713                                 } else
714                                         zero_copy = ret;
715
716                                 if (zero_copy) {
717 #ifdef RTE_MBUF_REFCNT
718                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
719                                         "zero copy vhost APP, please "
720                                         "disable RTE_MBUF_REFCNT\n"
721                                         "in config file and then rebuild DPDK "
722                                         "core lib!\n"
723                                         "Otherwise please disable zero copy "
724                                         "flag in command line!\n");
725                                         return -1;
726 #endif
727                                 }
728                         }
729
730                         /* Specify the descriptor number on RX. */
731                         if (!strncmp(long_option[option_index].name,
732                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
733                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
734                                 if ((ret == -1) || (!POWEROF2(ret))) {
735                                         RTE_LOG(INFO, VHOST_CONFIG,
736                                         "Invalid argument for rx-desc-num [0-N], "
737                                         "power of 2 required.\n");
738                                         us_vhost_usage(prgname);
739                                         return -1;
740                                 } else {
741                                         num_rx_descriptor = ret;
742                                 }
743                         }
744
745                         /* Specify the descriptor number on TX. */
746                         if (!strncmp(long_option[option_index].name,
747                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
748                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
749                                 if ((ret == -1) || (!POWEROF2(ret))) {
750                                         RTE_LOG(INFO, VHOST_CONFIG,
751                                         "Invalid argument for tx-desc-num [0-N], "
752                                         "power of 2 required.\n");
753                                         us_vhost_usage(prgname);
754                                         return -1;
755                                 } else {
756                                         num_tx_descriptor = ret;
757                                 }
758                         }
759
760                         break;
761
762                         /* Invalid option - print options. */
763                 default:
764                         us_vhost_usage(prgname);
765                         return -1;
766                 }
767         }
768
769         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
770                 if (enabled_port_mask & (1 << i))
771                         ports[num_ports++] = (uint8_t)i;
772         }
773
774         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
775                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
776                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
777                 return -1;
778         }
779
780         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
781                 RTE_LOG(INFO, VHOST_PORT,
782                         "Vhost zero copy doesn't support software vm2vm, "
783                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
784                 return -1;
785         }
786
787         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
788                 RTE_LOG(INFO, VHOST_PORT,
789                         "Vhost zero copy doesn't support jumbo frames, "
790                         "please specify '--mergeable 0' to disable the "
791                         "mergeable feature.\n");
792                 return -1;
793         }
794
795         return 0;
796 }
797
798 /*
799  * Update the global variable num_ports and the ports array according to the
800  * number of system ports, and return the number of valid ports.
801  */
802 static unsigned check_ports_num(unsigned nb_ports)
803 {
804         unsigned valid_num_ports = num_ports;
805         unsigned portid;
806
807         if (num_ports > nb_ports) {
808                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
809                         num_ports, nb_ports);
810                 num_ports = nb_ports;
811         }
812
813         for (portid = 0; portid < num_ports; portid ++) {
814                 if (ports[portid] >= nb_ports) {
815                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
816                                 ports[portid], (nb_ports - 1));
817                         ports[portid] = INVALID_PORT_ID;
818                         valid_num_ports--;
819                 }
820         }
821         return valid_num_ports;
822 }
823
824 /*
825  * Macro to print out packet contents. Wrapped in debug define so that the
826  * data path is not affected when debug is disabled.
827  */
828 #ifdef DEBUG
829 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
830         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
831         unsigned int index;                                                                                                                                                                                             \
832         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
833                                                                                                                                                                                                                                         \
834         if ((header))                                                                                                                                                                                                   \
835                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
836         else                                                                                                                                                                                                                    \
837                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
838         for (index = 0; index < (size); index++) {                                                                                                                                              \
839                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
840                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
841         }                                                                                                                                                                                                                               \
842         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
843                                                                                                                                                                                                                                         \
844         LOG_DEBUG(VHOST_DATA, "%s", packet);                                                                                                                                                                    \
845 } while(0)
846 #else
847 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
848 #endif
849
850 /*
851  * Function to convert guest physical addresses to vhost physical addresses.
852  * This is used to convert virtio buffer addresses.
853  */
854 static inline uint64_t __attribute__((always_inline))
855 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
856         uint32_t buf_len, hpa_type *addr_type)
857 {
858         struct virtio_memory_regions_hpa *region;
859         uint32_t regionidx;
860         uint64_t vhost_pa = 0;
861
862         *addr_type = PHYS_ADDR_INVALID;
863
864         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
865                 region = &vdev->regions_hpa[regionidx];
866                 if ((guest_pa >= region->guest_phys_address) &&
867                         (guest_pa <= region->guest_phys_address_end)) {
868                         vhost_pa = region->host_phys_addr_offset + guest_pa;
869                         if (likely((guest_pa + buf_len - 1)
870                                 <= region->guest_phys_address_end))
871                                 *addr_type = PHYS_ADDR_CONTINUOUS;
872                         else
873                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
874                         break;
875                 }
876         }
877
878         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
879                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
880                 (void *)(uintptr_t)vhost_pa);
881
882         return vhost_pa;
883 }
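/*
 * Worked example with made-up region values (an illustration, not data from
 * this file): for a region with guest_phys_address = 0x100000,
 * guest_phys_address_end = 0x1fffff and host_phys_addr_offset = 0x40000000,
 * a guest_pa of 0x180000 maps to vhost_pa = 0x40180000. If guest_pa + buf_len - 1
 * stays at or below 0x1fffff the type is PHYS_ADDR_CONTINUOUS; otherwise the
 * buffer crosses the sub-region boundary and PHYS_ADDR_CROSS_SUBREG is
 * reported through *addr_type.
 */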
884
885 /*
886  * Compares a packet destination MAC address to a device MAC address.
887  */
888 static inline int __attribute__((always_inline))
889 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
890 {
891         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
892 }
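/*
 * How the comparison works: both addresses are loaded as 64-bit words, XORed,
 * and masked with MAC_ADDR_CMP (0xFFFFFFFFFFFF), so only the low 48 bits (the
 * six MAC bytes on a little-endian load) take part in the comparison and the
 * two bytes following each ether_addr are masked out. For example, comparing
 * 00:11:22:33:44:55 with itself XORs to zero and the function returns non-zero
 * (addresses equal).
 */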
893
894 /*
895  * This function learns the MAC address of the device and registers this along with a
896  * vlan tag to a VMDQ.
897  */
898 static int
899 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
900 {
901         struct ether_hdr *pkt_hdr;
902         struct virtio_net_data_ll *dev_ll;
903         struct virtio_net *dev = vdev->dev;
904         int i, ret;
905
906         /* Learn MAC address of guest device from packet */
907         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
908
909         dev_ll = ll_root_used;
910
911         while (dev_ll != NULL) {
912                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
913                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
914                         return -1;
915                 }
916                 dev_ll = dev_ll->next;
917         }
918
919         for (i = 0; i < ETHER_ADDR_LEN; i++)
920                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
921
922         /* vlan_tag currently uses the device_id. */
923         vdev->vlan_tag = vlan_tags[dev->device_fh];
924
925         /* Print out VMDQ registration info. */
926         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
927                 dev->device_fh,
928                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
929                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
930                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
931                 vdev->vlan_tag);
932
933         /* Register the MAC address. */
934         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
935         if (ret)
936                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
937                                         dev->device_fh);
938
939         /* Enable stripping of the vlan tag as we handle routing. */
940         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
941
942         /* Set device as ready for RX. */
943         vdev->ready = DEVICE_RX;
944
945         return 0;
946 }
947
948 /*
949  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
950  * queue before disabling RX on the device.
951  */
952 static inline void
953 unlink_vmdq(struct vhost_dev *vdev)
954 {
955         unsigned i = 0;
956         unsigned rx_count;
957         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
958
959         if (vdev->ready == DEVICE_RX) {
960                 /*clear MAC and VLAN settings*/
961                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
962                 for (i = 0; i < 6; i++)
963                         vdev->mac_address.addr_bytes[i] = 0;
964
965                 vdev->vlan_tag = 0;
966
967                 /*Clear out the receive buffers*/
968                 rx_count = rte_eth_rx_burst(ports[0],
969                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
970
971                 while (rx_count) {
972                         for (i = 0; i < rx_count; i++)
973                                 rte_pktmbuf_free(pkts_burst[i]);
974
975                         rx_count = rte_eth_rx_burst(ports[0],
976                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
977                 }
978
979                 vdev->ready = DEVICE_MAC_LEARNING;
980         }
981 }
982
983 /*
984  * Check if the packet destination MAC address is for a local device. If so then put
985  * the packet on that device's RX queue. If not then return.
986  */
987 static inline int __attribute__((always_inline))
988 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
989 {
990         struct virtio_net_data_ll *dev_ll;
991         struct ether_hdr *pkt_hdr;
992         uint64_t ret = 0;
993         struct virtio_net *dev = vdev->dev;
994         struct virtio_net *tdev; /* destination virtio device */
995
996         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
997
998         /*get the used devices list*/
999         dev_ll = ll_root_used;
1000
1001         while (dev_ll != NULL) {
1002                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1003                                           &dev_ll->vdev->mac_address)) {
1004
1005                         /* Drop the packet if the TX packet is destined for the TX device. */
1006                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1007                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1008                                                         dev->device_fh);
1009                                 return 0;
1010                         }
1011                         tdev = dev_ll->vdev->dev;
1012
1013
1014                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1015
1016                         if (unlikely(dev_ll->vdev->remove)) {
1017                                 /*drop the packet if the device is marked for removal*/
1018                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1019                         } else {
1020                                 /*send the packet to the local virtio device*/
1021                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1022                                 if (enable_stats) {
1023                                         rte_atomic64_add(
1024                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1025                                         1);
1026                                         rte_atomic64_add(
1027                                         &dev_statistics[tdev->device_fh].rx_atomic,
1028                                         ret);
1029                                         dev_statistics[tdev->device_fh].tx_total++;
1030                                         dev_statistics[tdev->device_fh].tx += ret;
1031                                 }
1032                         }
1033
1034                         return 0;
1035                 }
1036                 dev_ll = dev_ll->next;
1037         }
1038
1039         return -1;
1040 }
1041
1042 /*
1043  * This function routes the TX packet to the correct interface. This may be a local device
1044  * or the physical port.
1045  */
1046 static inline void __attribute__((always_inline))
1047 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1048 {
1049         struct mbuf_table *tx_q;
1050         struct rte_mbuf **m_table;
1051         unsigned len, ret, offset = 0;
1052         const uint16_t lcore_id = rte_lcore_id();
1053         struct virtio_net_data_ll *dev_ll = ll_root_used;
1054         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1055         struct virtio_net *dev = vdev->dev;
1056
1057         /*check if destination is local VM*/
1058         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1059                 rte_pktmbuf_free(m);
1060                 return;
1061         }
1062
1063         if (vm2vm_mode == VM2VM_HARDWARE) {
1064                 while (dev_ll != NULL) {
1065                         if ((dev_ll->vdev->ready == DEVICE_RX)
1066                                 && ether_addr_cmp(&(pkt_hdr->d_addr),
1067                                 &dev_ll->vdev->mac_address)) {
1068                                 /*
1069                                  * Drop the packet if the TX packet is
1070                                  * destined for the TX device.
1071                                  */
1072                                 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1073                                         LOG_DEBUG(VHOST_DATA,
1074                                         "(%"PRIu64") TX: Source and destination"
1075                                         " MAC addresses are the same. Dropping "
1076                                         "packet.\n",
1077                                         dev_ll->vdev->dev->device_fh);
1078                                         rte_pktmbuf_free(m);
1079                                         return;
1080                                 }
1081
1082                                 /*
1083                                  * HW VLAN strip will reduce the packet length
1084                                  * by the length of the VLAN tag, so the packet
1085                                  * length must be restored by adding it back.
1086                                  */
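                                /*
                                 * Worked example (illustrative numbers only):
                                 * with VLAN_HLEN = 4, a packet queued here
                                 * with data_len/pkt_len of 60 is accounted as
                                 * 64 bytes once offset is added below, i.e.
                                 * the VLAN tag length handled by the hardware
                                 * is added back into the mbuf length fields.
                                 */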
1087                                 offset = VLAN_HLEN;
1088                                 vlan_tag =
1089                                 (uint16_t)
1090                                 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1091
1092                                 LOG_DEBUG(VHOST_DATA,
1093                                 "(%"PRIu64") TX: pkt to local VM device id:"
1094                                 "(%"PRIu64") vlan tag: %d.\n",
1095                                 dev->device_fh, dev_ll->vdev->dev->device_fh,
1096                                 vlan_tag);
1097
1098                                 break;
1099                         }
1100                         dev_ll = dev_ll->next;
1101                 }
1102         }
1103
1104         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1105
1106         /*Add packet to the port tx queue*/
1107         tx_q = &lcore_tx_queue[lcore_id];
1108         len = tx_q->len;
1109
1110         m->ol_flags = PKT_TX_VLAN_PKT;
1111
1112         m->data_len += offset;
1113         m->pkt_len += offset;
1114
1115         m->vlan_tci = vlan_tag;
1116
1117         tx_q->m_table[len] = m;
1118         len++;
1119         if (enable_stats) {
1120                 dev_statistics[dev->device_fh].tx_total++;
1121                 dev_statistics[dev->device_fh].tx++;
1122         }
1123
1124         if (unlikely(len == MAX_PKT_BURST)) {
1125                 m_table = (struct rte_mbuf **)tx_q->m_table;
1126                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1127                 /* Free any buffers not handled by TX and update the port stats. */
1128                 if (unlikely(ret < len)) {
1129                         do {
1130                                 rte_pktmbuf_free(m_table[ret]);
1131                         } while (++ret < len);
1132                 }
1133
1134                 len = 0;
1135         }
1136
1137         tx_q->len = len;
1138         return;
1139 }
1140 /*
1141  * This function is called by each data core. It handles all RX/TX registered with the
1142  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1143  * with all devices in the main linked list.
1144  */
1145 static int
1146 switch_worker(void *arg)
1147 {
1148         struct rte_mempool *mbuf_pool = arg;
1149         struct virtio_net *dev = NULL;
1150         struct vhost_dev *vdev = NULL;
1151         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1152         struct virtio_net_data_ll *dev_ll;
1153         struct mbuf_table *tx_q;
1154         volatile struct lcore_ll_info *lcore_ll;
1155         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1156         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1157         unsigned ret, i;
1158         const uint16_t lcore_id = rte_lcore_id();
1159         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1160         uint16_t rx_count = 0;
1161         uint16_t tx_count;
1162         uint32_t retry = 0;
1163
1164         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1165         lcore_ll = lcore_info[lcore_id].lcore_ll;
1166         prev_tsc = 0;
1167
1168         tx_q = &lcore_tx_queue[lcore_id];
1169         for (i = 0; i < num_cores; i ++) {
1170                 if (lcore_ids[i] == lcore_id) {
1171                         tx_q->txq_id = i;
1172                         break;
1173                 }
1174         }
1175
1176         while(1) {
1177                 cur_tsc = rte_rdtsc();
1178                 /*
1179                  * TX burst queue drain
1180                  */
1181                 diff_tsc = cur_tsc - prev_tsc;
1182                 if (unlikely(diff_tsc > drain_tsc)) {
1183
1184                         if (tx_q->len) {
1185                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1186
1187                                 /*Tx any packets in the queue*/
1188                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1189                                                                            (struct rte_mbuf **)tx_q->m_table,
1190                                                                            (uint16_t)tx_q->len);
1191                                 if (unlikely(ret < tx_q->len)) {
1192                                         do {
1193                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1194                                         } while (++ret < tx_q->len);
1195                                 }
1196
1197                                 tx_q->len = 0;
1198                         }
1199
1200                         prev_tsc = cur_tsc;
1201
1202                 }
1203
1204                 rte_prefetch0(lcore_ll->ll_root_used);
1205                 /*
1206                  * Inform the configuration core that we have exited the linked list and that no devices are
1207                  * in use if requested.
1208                  */
1209                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1210                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1211
1212                 /*
1213                  * Process devices
1214                  */
1215                 dev_ll = lcore_ll->ll_root_used;
1216
1217                 while (dev_ll != NULL) {
1218                         /*get virtio device ID*/
1219                         vdev = dev_ll->vdev;
1220                         dev = vdev->dev;
1221
1222                         if (unlikely(vdev->remove)) {
1223                                 dev_ll = dev_ll->next;
1224                                 unlink_vmdq(vdev);
1225                                 vdev->ready = DEVICE_SAFE_REMOVE;
1226                                 continue;
1227                         }
1228                         if (likely(vdev->ready == DEVICE_RX)) {
1229                                 /*Handle guest RX*/
1230                                 rx_count = rte_eth_rx_burst(ports[0],
1231                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1232
1233                                 if (rx_count) {
1234                                         /*
1235                                         * If retry is enabled and the queue is full, wait and retry to avoid packet loss.
1236                                         * Here MAX_PKT_BURST must be less than the virtio queue size.
1237                                         */
1238                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1239                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1240                                                         rte_delay_us(burst_rx_delay_time);
1241                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1242                                                                 break;
1243                                                 }
1244                                         }
1245                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1246                                         if (enable_stats) {
1247                                                 rte_atomic64_add(
1248                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1249                                                 rx_count);
1250                                                 rte_atomic64_add(
1251                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1252                                         }
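                                        /*
                                         * rte_vhost_enqueue_burst() copied the packets into guest
                                         * buffers, so the host mbufs can now be freed.
                                         */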
1253                                         while (likely(rx_count)) {
1254                                                 rx_count--;
1255                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1256                                         }
1257
1258                                 }
1259                         }
1260
1261                         if (likely(!vdev->remove)) {
1262                                 /* Handle guest TX*/
1263                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1264                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1265                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1266                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1267                                                 while (tx_count--)
1268                                                         rte_pktmbuf_free(pkts_burst[tx_count]);
1269                                         }
1270                                 }
1271                                 while (tx_count)
1272                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1273                         }
1274
1275                         /*move to the next device in the list*/
1276                         dev_ll = dev_ll->next;
1277                 }
1278         }
1279
1280         return 0;
1281 }
1282
1283 /*
1284  * This function gets available ring number for zero copy rx.
1285  * Only one thread will call this function for a particular virtio device,
1286  * so it is designed as a non-thread-safe function.
1287  */
1288 static inline uint32_t __attribute__((always_inline))
1289 get_available_ring_num_zcp(struct virtio_net *dev)
1290 {
1291         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1292         uint16_t avail_idx;
1293
1294         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
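        /*
         * Unsigned 16-bit subtraction handles index wrap-around: the result is
         * the number of entries the guest has made available that this core
         * has not yet reserved.
         */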
1295         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1296 }
1297
1298 /*
1299  * This function gets available ring index for zero copy rx,
1300  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1301  * Only one thread will call this function for a particular virtio device,
1302  * so it is designed as a non-thread-safe function.
1303  */
1304 static inline uint32_t __attribute__((always_inline))
1305 get_available_ring_index_zcp(struct virtio_net *dev,
1306         uint16_t *res_base_idx, uint32_t count)
1307 {
1308         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1309         uint16_t avail_idx;
1310         uint32_t retry = 0;
1311         uint16_t free_entries;
1312
1313         *res_base_idx = vq->last_used_idx_res;
1314         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1315         free_entries = (avail_idx - *res_base_idx);
1316
1317         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1318                         "avail idx: %d, "
1319                         "res base idx:%d, free entries:%d\n",
1320                         dev->device_fh, avail_idx, *res_base_idx,
1321                         free_entries);
1322
1323         /*
1324          * If retry is enabled and the queue is full then we wait
1325          * and retry to avoid packet loss.
1326          */
1327         if (enable_retry && unlikely(count > free_entries)) {
1328                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1329                         rte_delay_us(burst_rx_delay_time);
1330                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1331                         free_entries = (avail_idx - *res_base_idx);
1332                         if (count <= free_entries)
1333                                 break;
1334                 }
1335         }
1336
1337         /*check that we have enough buffers*/
1338         if (unlikely(count > free_entries))
1339                 count = free_entries;
1340
1341         if (unlikely(count == 0)) {
1342                 LOG_DEBUG(VHOST_DATA,
1343                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1344                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1345                         dev->device_fh, avail_idx,
1346                         *res_base_idx, free_entries);
1347                 return 0;
1348         }
1349
1350         vq->last_used_idx_res = *res_base_idx + count;
1351
1352         return count;
1353 }
1354
1355 /*
1356  * This function puts a descriptor back on the used list.
1357  */
1358 static inline void __attribute__((always_inline))
1359 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1360 {
1361         uint16_t res_cur_idx = vq->last_used_idx;
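        /* The vring size is a power of two, so masking with (vq->size - 1) wraps the ring index. */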
1362         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1363         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1364         rte_compiler_barrier();
1365         *(volatile uint16_t *)&vq->used->idx += 1;
1366         vq->last_used_idx += 1;
1367
1368         /* Kick the guest if necessary. */
1369         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1370                 eventfd_write((int)vq->kickfd, 1);
1371 }
1372
1373 /*
1374  * This function gets an available descriptor from the virtio vring and an
1375  * unattached mbuf from vpool->ring, and then attaches them together. It adjusts
1376  * the offset for buff_addr and phys_addr according to the PMD implementation,
1377  * otherwise the frame data may be placed at the wrong location in the mbuf.
1378  */
1379 static inline void __attribute__((always_inline))
1380 attach_rxmbuf_zcp(struct virtio_net *dev)
1381 {
1382         uint16_t res_base_idx, desc_idx;
1383         uint64_t buff_addr, phys_addr;
1384         struct vhost_virtqueue *vq;
1385         struct vring_desc *desc;
1386         struct rte_mbuf *mbuf = NULL;
1387         struct vpool *vpool;
1388         hpa_type addr_type;
1389         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1390
1391         vpool = &vpool_array[vdev->vmdq_rx_q];
1392         vq = dev->virtqueue[VIRTIO_RXQ];
1393
1394         do {
1395                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1396                                 1) != 1))
1397                         return;
1398                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1399
1400                 desc = &vq->desc[desc_idx];
1401                 if (desc->flags & VRING_DESC_F_NEXT) {
1402                         desc = &vq->desc[desc->next];
1403                         buff_addr = gpa_to_vva(dev, desc->addr);
1404                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1405                                         &addr_type);
1406                 } else {
1407                         buff_addr = gpa_to_vva(dev,
1408                                         desc->addr + vq->vhost_hlen);
1409                         phys_addr = gpa_to_hpa(vdev,
1410                                         desc->addr + vq->vhost_hlen,
1411                                         desc->len, &addr_type);
1412                 }
1413
1414                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1415                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1416                                 " address found when attaching RX frame buffer"
1417                                 " address!\n", dev->device_fh);
1418                         put_desc_to_used_list_zcp(vq, desc_idx);
1419                         continue;
1420                 }
1421
1422                 /*
1423                  * Check if the frame buffer address from guest crosses
1424                  * sub-region or not.
1425                  */
1426                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1427                         RTE_LOG(ERR, VHOST_DATA,
1428                                 "(%"PRIu64") Frame buffer address cross "
1429                                 "sub-region found when attaching RX frame "
1430                                 "buffer address!\n",
1431                                 dev->device_fh);
1432                         put_desc_to_used_list_zcp(vq, desc_idx);
1433                         continue;
1434                 }
1435         } while (unlikely(phys_addr == 0));
1436
1437         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1438         if (unlikely(mbuf == NULL)) {
1439                 LOG_DEBUG(VHOST_DATA,
1440                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1441                         "ring_sc_dequeue fail.\n",
1442                         dev->device_fh);
1443                 put_desc_to_used_list_zcp(vq, desc_idx);
1444                 return;
1445         }
1446
1447         if (unlikely(vpool->buf_size > desc->len)) {
1448                 LOG_DEBUG(VHOST_DATA,
1449                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1450                         "length(%d) of descriptor idx: %d less than room "
1451                         "size required: %d\n",
1452                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1453                 put_desc_to_used_list_zcp(vq, desc_idx);
1454                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1455                 return;
1456         }
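        /*
         * Attach the mbuf to the guest-provided buffer: point buf_addr and
         * buf_physaddr into guest memory (adjusted for the headroom) so the
         * NIC DMAs received frames directly into the guest descriptor.
         */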
1457
1458         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1459         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1460         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1461         mbuf->data_len = desc->len;
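        /*
         * Stash the guest descriptor index in the mbuf headroom so it can be
         * recovered when the descriptor is returned via the used ring.
         */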
1462         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1463
1464         LOG_DEBUG(VHOST_DATA,
1465                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1466                 "descriptor idx:%d\n",
1467                 dev->device_fh, res_base_idx, desc_idx);
1468
1469         __rte_mbuf_raw_free(mbuf);
1470
1471         return;
1472 }
1473
1474 /*
1475  * Detach an attached packet mbuf -
1476  *  - restore original mbuf address and length values.
1477  *  - reset pktmbuf data and data_len to their default values.
1478  *  All other fields of the given packet mbuf will be left intact.
1479  *
1480  * @param m
1481  *   The attached packet mbuf.
1482  */
1483 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1484 {
1485         const struct rte_mempool *mp = m->pool;
1486         void *buf = RTE_MBUF_TO_BADDR(m);
1487         uint32_t buf_ofs;
1488         uint32_t buf_len = mp->elt_size - sizeof(*m);
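        /* Re-point the mbuf at its own data area, located just after the rte_mbuf header in the mempool element. */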
1489         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1490
1491         m->buf_addr = buf;
1492         m->buf_len = (uint16_t)buf_len;
1493
1494         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1495                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1496         m->data_off = buf_ofs;
1497
1498         m->data_len = 0;
1499 }
1500
1501 /*
1502  * This function is called after packets have been transmitted. It fetches mbufs
1503  * from vpool->pool, detaches them and puts them back into vpool->ring. It also
1504  * updates the used index and kicks the guest if necessary.
1505  */
1506 static inline uint32_t __attribute__((always_inline))
1507 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1508 {
1509         struct rte_mbuf *mbuf;
1510         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1511         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1512         uint32_t index = 0;
1513         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1514
1515         LOG_DEBUG(VHOST_DATA,
1516                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1517                 "clean is: %d\n",
1518                 dev->device_fh, mbuf_count);
1519         LOG_DEBUG(VHOST_DATA,
1520                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1521                 "clean  is : %d\n",
1522                 dev->device_fh, rte_ring_count(vpool->ring));
1523
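        /*
         * Every mbuf that the PMD has freed back to vpool->pool has completed
         * transmission: detach it, return it to vpool->ring for reuse and hand
         * its descriptor (stored in the headroom) back to the guest.
         */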
1524         for (index = 0; index < mbuf_count; index++) {
1525                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1526                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1527                         pktmbuf_detach_zcp(mbuf);
1528                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1529
1530                 /* Update used index buffer information. */
1531                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1532                 vq->used->ring[used_idx].len = 0;
1533
1534                 used_idx = (used_idx + 1) & (vq->size - 1);
1535         }
1536
1537         LOG_DEBUG(VHOST_DATA,
1538                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1539                 "clean is: %d\n",
1540                 dev->device_fh, rte_mempool_count(vpool->pool));
1541         LOG_DEBUG(VHOST_DATA,
1542                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1543                 "clean  is : %d\n",
1544                 dev->device_fh, rte_ring_count(vpool->ring));
1545         LOG_DEBUG(VHOST_DATA,
1546                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1547                 "vq->last_used_idx:%d\n",
1548                 dev->device_fh, vq->last_used_idx);
1549
1550         vq->last_used_idx += mbuf_count;
1551
1552         LOG_DEBUG(VHOST_DATA,
1553                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1554                 "vq->last_used_idx:%d\n",
1555                 dev->device_fh, vq->last_used_idx);
1556
1557         rte_compiler_barrier();
1558
1559         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1560
1561         /* Kick guest if required. */
1562         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1563                 eventfd_write((int)vq->kickfd, 1);
1564
1565         return 0;
1566 }
1567
1568 /*
1569  * This function is called when a virtio device is destroyed.
1570  * It fetches mbufs from vpool->pool, detaches them, and puts them back into vpool->ring.
1571  */
1572 static void mbuf_destroy_zcp(struct vpool *vpool)
1573 {
1574         struct rte_mbuf *mbuf = NULL;
1575         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1576
1577         LOG_DEBUG(VHOST_CONFIG,
1578                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1579                 "mbuf_destroy_zcp is: %d\n",
1580                 mbuf_count);
1581         LOG_DEBUG(VHOST_CONFIG,
1582                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1583                 "mbuf_destroy_zcp  is : %d\n",
1584                 rte_ring_count(vpool->ring));
1585
1586         for (index = 0; index < mbuf_count; index++) {
1587                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1588                 if (likely(mbuf != NULL)) {
1589                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1590                                 pktmbuf_detach_zcp(mbuf);
1591                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1592                 }
1593         }
1594
1595         LOG_DEBUG(VHOST_CONFIG,
1596                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1597                 "mbuf_destroy_zcp is: %d\n",
1598                 rte_mempool_count(vpool->pool));
1599         LOG_DEBUG(VHOST_CONFIG,
1600                 "in mbuf_destroy_zcp: mbuf count in ring after "
1601                 "mbuf_destroy_zcp is : %d\n",
1602                 rte_ring_count(vpool->ring));
1603 }
1604
1605 /*
1606  * This function updates the used ring and copies a virtio header for each zero copy RX packet.
1607  */
1608 static inline uint32_t __attribute__((always_inline))
1609 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1610         uint32_t count)
1611 {
1612         struct vhost_virtqueue *vq;
1613         struct vring_desc *desc;
1614         struct rte_mbuf *buff;
1615         /* The virtio_hdr is initialised to 0. */
1616         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1617                 = {{0, 0, 0, 0, 0, 0}, 0};
1618         uint64_t buff_hdr_addr = 0;
1619         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1620         uint32_t head_idx, packet_success = 0;
1621         uint16_t res_cur_idx;
1622
1623         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1624
1625         if (count == 0)
1626                 return 0;
1627
1628         vq = dev->virtqueue[VIRTIO_RXQ];
1629         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1630
1631         res_cur_idx = vq->last_used_idx;
1632         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1633                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1634
1635         /* Retrieve all of the head indexes first to avoid caching issues. */
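        /* For zero copy, the head index is the guest descriptor index that attach_rxmbuf_zcp() stored in the mbuf headroom. */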
1636         for (head_idx = 0; head_idx < count; head_idx++)
1637                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1638
1639         /*Prefetch descriptor index. */
1640         rte_prefetch0(&vq->desc[head[packet_success]]);
1641
1642         while (packet_success != count) {
1643                 /* Get descriptor from available ring */
1644                 desc = &vq->desc[head[packet_success]];
1645
1646                 buff = pkts[packet_success];
1647                 LOG_DEBUG(VHOST_DATA,
1648                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1649                         "pkt[%d] descriptor idx: %d\n",
1650                         dev->device_fh, packet_success,
1651                         MBUF_HEADROOM_UINT32(buff));
1652
1653                 PRINT_PACKET(dev,
1654                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1655                         + RTE_PKTMBUF_HEADROOM),
1656                         rte_pktmbuf_data_len(buff), 0);
1657
1658                 /* Buffer address translation for virtio header. */
1659                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1660                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1661
1662                 /*
1663                  * If the descriptors are chained the header and data are
1664                  * placed in separate buffers.
1665                  */
1666                 if (desc->flags & VRING_DESC_F_NEXT) {
1667                         desc->len = vq->vhost_hlen;
1668                         desc = &vq->desc[desc->next];
1669                         desc->len = rte_pktmbuf_data_len(buff);
1670                 } else {
1671                         desc->len = packet_len;
1672                 }
1673
1674                 /* Update used ring with desc information */
1675                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1676                         = head[packet_success];
1677                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1678                         = packet_len;
1679                 res_cur_idx++;
1680                 packet_success++;
1681
1682                 /* A header is required per buffer. */
1683                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1684                         (const void *)&virtio_hdr, vq->vhost_hlen);
1685
1686                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1687
1688                 if (likely(packet_success < count)) {
1689                         /* Prefetch descriptor index. */
1690                         rte_prefetch0(&vq->desc[head[packet_success]]);
1691                 }
1692         }
1693
1694         rte_compiler_barrier();
1695
1696         LOG_DEBUG(VHOST_DATA,
1697                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1698                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1699                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1700
1701         *(volatile uint16_t *)&vq->used->idx += count;
1702         vq->last_used_idx += count;
1703
1704         LOG_DEBUG(VHOST_DATA,
1705                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1706                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1707                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1708
1709         /* Kick the guest if necessary. */
1710         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1711                 eventfd_write((int)vq->kickfd, 1);
1712
1713         return count;
1714 }
1715
1716 /*
1717  * This function routes the TX packet to the correct interface.
1718  * This may be a local device or the physical port.
1719  */
1720 static inline void __attribute__((always_inline))
1721 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1722         uint32_t desc_idx, uint8_t need_copy)
1723 {
1724         struct mbuf_table *tx_q;
1725         struct rte_mbuf **m_table;
1726         struct rte_mbuf *mbuf = NULL;
1727         unsigned len, ret, offset = 0;
1728         struct vpool *vpool;
1729         struct virtio_net_data_ll *dev_ll = ll_root_used;
1730         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1731         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1732         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1733
1734         /*Add packet to the port tx queue*/
1735         tx_q = &tx_queue_zcp[vmdq_rx_q];
1736         len = tx_q->len;
1737
1738         /* Allocate an mbuf and populate the structure. */
1739         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
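        /* TX vpools are stored after the RX vpools in vpool_array, hence the MAX_QUEUES offset. */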
1740         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1741         if (unlikely(mbuf == NULL)) {
1742                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1743                 RTE_LOG(ERR, VHOST_DATA,
1744                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1745                         dev->device_fh);
1746                 put_desc_to_used_list_zcp(vq, desc_idx);
1747                 return;
1748         }
1749
1750         if (vm2vm_mode == VM2VM_HARDWARE) {
1751                 /* Avoid using a vlan tag from any VM for an external packet, such as
1752                  * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1753                  * selection: the MAC address marks it as an external packet that
1754                  * should go to the network, while the vlan tag marks it as a vm2vm
1755                  * packet to be forwarded to another VM. The hardware cannot resolve
1756                  * this ambiguity, so the packet would be lost.
1757                  */
1758                 vlan_tag = external_pkt_default_vlan_tag;
1759                 while (dev_ll != NULL) {
1760                         if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
1761                                 ether_addr_cmp(&(pkt_hdr->d_addr),
1762                                 &dev_ll->vdev->mac_address)) {
1763
1764                                 /*
1765                                  * Drop the packet if the TX packet is destined
1766                                  * for the TX device.
1767                                  */
1768                                 if (unlikely(dev_ll->vdev->dev->device_fh
1769                                         == dev->device_fh)) {
1770                                         LOG_DEBUG(VHOST_DATA,
1771                                         "(%"PRIu64") TX: Source and destination "
1772                                         "MAC addresses are the same. Dropping "
1773                                         "packet.\n",
1774                                         dev_ll->vdev->dev->device_fh);
1775                                         MBUF_HEADROOM_UINT32(mbuf)
1776                                                 = (uint32_t)desc_idx;
1777                                         __rte_mbuf_raw_free(mbuf);
1778                                         return;
1779                                 }
1780
1781                                 /*
1782                                  * Packet length offset 4 bytes for HW vlan
1783                                  * strip when L2 switch back.
1784                                  */
1785                                 offset = 4;
1786                                 vlan_tag =
1787                                 (uint16_t)
1788                                 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1789
1790                                 LOG_DEBUG(VHOST_DATA,
1791                                 "(%"PRIu64") TX: pkt to local VM device id:"
1792                                 "(%"PRIu64") vlan tag: %d.\n",
1793                                 dev->device_fh, dev_ll->vdev->dev->device_fh,
1794                                 vlan_tag);
1795
1796                                 break;
1797                         }
1798                         dev_ll = dev_ll->next;
1799                 }
1800         }
1801
1802         mbuf->nb_segs = m->nb_segs;
1803         mbuf->next = m->next;
1804         mbuf->data_len = m->data_len + offset;
1805         mbuf->pkt_len = mbuf->data_len;
1806         if (unlikely(need_copy)) {
1807                 /* Copy the packet contents to the mbuf. */
1808                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1809                         rte_pktmbuf_mtod(m, void *),
1810                         m->data_len);
1811         } else {
1812                 mbuf->data_off = m->data_off;
1813                 mbuf->buf_physaddr = m->buf_physaddr;
1814                 mbuf->buf_addr = m->buf_addr;
1815         }
1816         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1817         mbuf->vlan_tci = vlan_tag;
1818         mbuf->l2_len = sizeof(struct ether_hdr);
1819         mbuf->l3_len = sizeof(struct ipv4_hdr);
1820         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1821
1822         tx_q->m_table[len] = mbuf;
1823         len++;
1824
1825         LOG_DEBUG(VHOST_DATA,
1826                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1827                 dev->device_fh,
1828                 mbuf->nb_segs,
1829                 (mbuf->next == NULL) ? "null" : "non-null");
1830
1831         if (enable_stats) {
1832                 dev_statistics[dev->device_fh].tx_total++;
1833                 dev_statistics[dev->device_fh].tx++;
1834         }
1835
1836         if (unlikely(len == MAX_PKT_BURST)) {
1837                 m_table = (struct rte_mbuf **)tx_q->m_table;
1838                 ret = rte_eth_tx_burst(ports[0],
1839                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1840
1841                 /*
1842                  * Free any buffers not handled by TX and update
1843                  * the port stats.
1844                  */
1845                 if (unlikely(ret < len)) {
1846                         do {
1847                                 rte_pktmbuf_free(m_table[ret]);
1848                         } while (++ret < len);
1849                 }
1850
1851                 len = 0;
1852                 txmbuf_clean_zcp(dev, vpool);
1853         }
1854
1855         tx_q->len = len;
1856
1857         return;
1858 }
1859
1860 /*
1861  * This function transmits all available packets in the virtio TX queue for one
1862  * virtio-net device. If it is the first packet, it learns the MAC address and
1863  * sets up VMDQ.
1864  */
1865 static inline void __attribute__((always_inline))
1866 virtio_dev_tx_zcp(struct virtio_net *dev)
1867 {
1868         struct rte_mbuf m;
1869         struct vhost_virtqueue *vq;
1870         struct vring_desc *desc;
1871         uint64_t buff_addr = 0, phys_addr;
1872         uint32_t head[MAX_PKT_BURST];
1873         uint32_t i;
1874         uint16_t free_entries, packet_success = 0;
1875         uint16_t avail_idx;
1876         uint8_t need_copy = 0;
1877         hpa_type addr_type;
1878         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1879
1880         vq = dev->virtqueue[VIRTIO_TXQ];
1881         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1882
1883         /* If there are no available buffers then return. */
1884         if (vq->last_used_idx_res == avail_idx)
1885                 return;
1886
1887         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1888
1889         /* Prefetch available ring to retrieve head indexes. */
1890         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1891
1892         /* Get the number of free entries in the ring */
1893         free_entries = (avail_idx - vq->last_used_idx_res);
1894
1895         /* Limit to MAX_PKT_BURST. */
1896         free_entries
1897                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1898
1899         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1900                 dev->device_fh, free_entries);
1901
1902         /* Retrieve all of the head indexes first to avoid caching issues. */
1903         for (i = 0; i < free_entries; i++)
1904                 head[i]
1905                         = vq->avail->ring[(vq->last_used_idx_res + i)
1906                         & (vq->size - 1)];
1907
1908         vq->last_used_idx_res += free_entries;
1909
1910         /* Prefetch descriptor index. */
1911         rte_prefetch0(&vq->desc[head[packet_success]]);
1912         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1913
1914         while (packet_success < free_entries) {
1915                 desc = &vq->desc[head[packet_success]];
1916
1917                 /* Discard first buffer as it is the virtio header */
1918                 desc = &vq->desc[desc->next];
1919
1920                 /* Buffer address translation. */
1921                 buff_addr = gpa_to_vva(dev, desc->addr);
1922                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);
1923
1924                 if (likely(packet_success < (free_entries - 1)))
1925                         /* Prefetch descriptor index. */
1926                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1927
1928                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1929                         RTE_LOG(ERR, VHOST_DATA,
1930                                 "(%"PRIu64") Invalid frame buffer address found "
1931                                 "when TX packets!\n",
1932                                 dev->device_fh);
1933                         packet_success++;
1934                         continue;
1935                 }
1936
1937                 /* Prefetch buffer address. */
1938                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1939
1940                 /*
1941                  * Setup dummy mbuf. This is copied to a real mbuf if
1942                  * transmitted out the physical port.
1943                  */
1944                 m.data_len = desc->len;
1945                 m.nb_segs = 1;
1946                 m.next = NULL;
1947                 m.data_off = 0;
1948                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1949                 m.buf_physaddr = phys_addr;
1950
1951                 /*
1952                  * Check if the frame buffer address from guest crosses
1953                  * sub-region or not.
1954                  */
1955                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1956                         RTE_LOG(ERR, VHOST_DATA,
1957                                 "(%"PRIu64") Frame buffer address cross "
1958                                 "sub-region found when attaching TX frame "
1959                                 "buffer address!\n",
1960                                 dev->device_fh);
1961                         need_copy = 1;
1962                 } else
1963                         need_copy = 0;
1964
1965                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1966
1967                 /*
1968                  * If this is the first received packet we need to learn
1969                  * the MAC and setup VMDQ
1970                  */
1971                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1972                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1973                                 /*
1974                                  * Discard frame if device is scheduled for
1975                                  * removal or a duplicate MAC address is found.
1976                                  */
1977                                 packet_success += free_entries;
1978                                 vq->last_used_idx += packet_success;
1979                                 break;
1980                         }
1981                 }
1982
1983                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1984                 packet_success++;
1985         }
1986 }
1987
1988 /*
1989  * This function is called by each data core. It handles all RX/TX registered
1990  * with the core. For TX the specific lcore linked list is used. For RX, MAC
1991  * addresses are compared with all devices in the main linked list.
1992  */
1993 static int
1994 switch_worker_zcp(__attribute__((unused)) void *arg)
1995 {
1996         struct virtio_net *dev = NULL;
1997         struct vhost_dev  *vdev = NULL;
1998         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1999         struct virtio_net_data_ll *dev_ll;
2000         struct mbuf_table *tx_q;
2001         volatile struct lcore_ll_info *lcore_ll;
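        /* Convert the BURST_TX_DRAIN_US drain period from microseconds to TSC cycles. */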
2002         const uint64_t drain_tsc
2003                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2004                 * BURST_TX_DRAIN_US;
2005         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2006         unsigned ret;
2007         const uint16_t lcore_id = rte_lcore_id();
2008         uint16_t count_in_ring, rx_count = 0;
2009
2010         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2011
2012         lcore_ll = lcore_info[lcore_id].lcore_ll;
2013         prev_tsc = 0;
2014
2015         while (1) {
2016                 cur_tsc = rte_rdtsc();
2017
2018                 /* TX burst queue drain */
2019                 diff_tsc = cur_tsc - prev_tsc;
2020                 if (unlikely(diff_tsc > drain_tsc)) {
2021                         /*
2022                          * Get transmitted mbufs back from vpool.pool, detach them
2023                          * and put them back into vpool.ring.
2024                          */
2025                         dev_ll = lcore_ll->ll_root_used;
2026                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2027                                 /* Get virtio device ID */
2028                                 vdev = dev_ll->vdev;
2029                                 dev = vdev->dev;
2030
2031                                 if (likely(!vdev->remove)) {
2032                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2033                                         if (tx_q->len) {
2034                                                 LOG_DEBUG(VHOST_DATA,
2035                                                 "TX queue drained after timeout"
2036                                                 " with burst size %u\n",
2037                                                 tx_q->len);
2038
2039                                                 /*
2040                                                  * Tx any packets in the queue
2041                                                  */
2042                                                 ret = rte_eth_tx_burst(
2043                                                         ports[0],
2044                                                         (uint16_t)tx_q->txq_id,
2045                                                         (struct rte_mbuf **)
2046                                                         tx_q->m_table,
2047                                                         (uint16_t)tx_q->len);
2048                                                 if (unlikely(ret < tx_q->len)) {
2049                                                         do {
2050                                                                 rte_pktmbuf_free(
2051                                                                         tx_q->m_table[ret]);
2052                                                         } while (++ret < tx_q->len);
2053                                                 }
2054                                                 tx_q->len = 0;
2055
2056                                                 txmbuf_clean_zcp(dev,
2057                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2058                                         }
2059                                 }
2060                                 dev_ll = dev_ll->next;
2061                         }
2062                         prev_tsc = cur_tsc;
2063                 }
2064
2065                 rte_prefetch0(lcore_ll->ll_root_used);
2066
2067                 /*
2068                  * If requested, inform the configuration core that we have finished
2069                  * walking the linked list and that no devices are in use.
2070                  */
2071                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2072                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2073
2074                 /* Process devices */
2075                 dev_ll = lcore_ll->ll_root_used;
2076
2077                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2078                         vdev = dev_ll->vdev;
2079                         dev  = vdev->dev;
2080                         if (unlikely(vdev->remove)) {
2081                                 dev_ll = dev_ll->next;
2082                                 unlink_vmdq(vdev);
2083                                 vdev->ready = DEVICE_SAFE_REMOVE;
2084                                 continue;
2085                         }
2086
2087                         if (likely(vdev->ready == DEVICE_RX)) {
2088                                 uint32_t index = vdev->vmdq_rx_q;
2089                                 uint16_t i;
2090                                 count_in_ring
2091                                 = rte_ring_count(vpool_array[index].ring);
2092                                 uint16_t free_entries
2093                                 = (uint16_t)get_available_ring_num_zcp(dev);
2094
2095                                 /*
2096                                  * Attach up to MAX_PKT_BURST mbufs from vpool.ring to guest
2097                                  * RX descriptors and put them back into vpool.pool.
2098                                  */
2099                                 for (i = 0;
2100                                 i < RTE_MIN(free_entries,
2101                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2102                                 i++)
2103                                         attach_rxmbuf_zcp(dev);
2104
2105                                 /* Handle guest RX */
2106                                 rx_count = rte_eth_rx_burst(ports[0],
2107                                         vdev->vmdq_rx_q, pkts_burst,
2108                                         MAX_PKT_BURST);
2109
2110                                 if (rx_count) {
2111                                         ret_count = virtio_dev_rx_zcp(dev,
2112                                                         pkts_burst, rx_count);
2113                                         if (enable_stats) {
2114                                                 dev_statistics[dev->device_fh].rx_total
2115                                                         += rx_count;
2116                                                 dev_statistics[dev->device_fh].rx
2117                                                         += ret_count;
2118                                         }
2119                                         while (likely(rx_count)) {
2120                                                 rx_count--;
2121                                                 pktmbuf_detach_zcp(
2122                                                         pkts_burst[rx_count]);
2123                                                 rte_ring_sp_enqueue(
2124                                                         vpool_array[index].ring,
2125                                                         (void *)pkts_burst[rx_count]);
2126                                         }
2127                                 }
2128                         }
2129
2130                         if (likely(!vdev->remove))
2131                                 /* Handle guest TX */
2132                                 virtio_dev_tx_zcp(dev);
2133
2134                         /* Move to the next device in the list */
2135                         dev_ll = dev_ll->next;
2136                 }
2137         }
2138
2139         return 0;
2140 }
2141
2142
2143 /*
2144  * Add an entry to a used linked list. A free entry must first be found
2145  * in the free linked list using get_data_ll_free_entry();
2146  */
2147 static void
2148 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2149         struct virtio_net_data_ll *ll_dev)
2150 {
2151         struct virtio_net_data_ll *ll = *ll_root_addr;
2152
2153         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2154         ll_dev->next = NULL;
2155         rte_compiler_barrier();
2156
2157         /* If ll == NULL then this is the first device. */
2158         if (ll) {
2159                 /* Increment to the tail of the linked list. */
2160                 while ((ll->next != NULL) )
2161                         ll = ll->next;
2162
2163                 ll->next = ll_dev;
2164         } else {
2165                 *ll_root_addr = ll_dev;
2166         }
2167 }
2168
2169 /*
2170  * Remove an entry from a used linked list. The entry must then be added to
2171  * the free linked list using put_data_ll_free_entry().
2172  */
2173 static void
2174 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2175         struct virtio_net_data_ll *ll_dev,
2176         struct virtio_net_data_ll *ll_dev_last)
2177 {
2178         struct virtio_net_data_ll *ll = *ll_root_addr;
2179
2180         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2181                 return;
2182
2183         if (ll_dev == ll)
2184                 *ll_root_addr = ll_dev->next;
2185         else
2186                 if (likely(ll_dev_last != NULL))
2187                         ll_dev_last->next = ll_dev->next;
2188                 else
2189                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2190 }
2191
2192 /*
2193  * Find and return an entry from the free linked list.
2194  */
2195 static struct virtio_net_data_ll *
2196 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2197 {
2198         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2199         struct virtio_net_data_ll *ll_dev;
2200
2201         if (ll_free == NULL)
2202                 return NULL;
2203
2204         ll_dev = ll_free;
2205         *ll_root_addr = ll_free->next;
2206
2207         return ll_dev;
2208 }
2209
2210 /*
2211  * Place an entry back on to the free linked list.
2212  */
2213 static void
2214 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2215         struct virtio_net_data_ll *ll_dev)
2216 {
2217         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2218
2219         if (ll_dev == NULL)
2220                 return;
2221
2222         ll_dev->next = ll_free;
2223         *ll_root_addr = ll_dev;
2224 }
2225
2226 /*
2227  * Creates a linked list of a given size.
2228  */
2229 static struct virtio_net_data_ll *
2230 alloc_data_ll(uint32_t size)
2231 {
2232         struct virtio_net_data_ll *ll_new;
2233         uint32_t i;
2234
2235         /* Malloc and then chain the linked list. */
2236         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2237         if (ll_new == NULL) {
2238                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2239                 return NULL;
2240         }
2241
2242         for (i = 0; i < size - 1; i++) {
2243                 ll_new[i].vdev = NULL;
2244                 ll_new[i].next = &ll_new[i+1];
2245         }
2246         ll_new[i].next = NULL;
2247
2248         return (ll_new);
2249 }
2250
2251 /*
2252  * Create the main linked list along with each individual core's linked list. A used and a free list
2253  * are created to manage entries.
2254  */
2255 static int
2256 init_data_ll (void)
2257 {
2258         int lcore;
2259
2260         RTE_LCORE_FOREACH_SLAVE(lcore) {
2261                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2262                 if (lcore_info[lcore].lcore_ll == NULL) {
2263                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2264                         return -1;
2265                 }
2266
2267                 lcore_info[lcore].lcore_ll->device_num = 0;
2268                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2269                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2270                 if (num_devices % num_switching_cores)
2271                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2272                 else
2273                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2274         }
2275
2276         /* Allocate devices up to a maximum of MAX_DEVICES. */
2277         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2278
2279         return 0;
2280 }
2281
2282 /*
2283  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2284  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2285  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2286  */
2287 static void
2288 destroy_device (volatile struct virtio_net *dev)
2289 {
2290         struct virtio_net_data_ll *ll_lcore_dev_cur;
2291         struct virtio_net_data_ll *ll_main_dev_cur;
2292         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2293         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2294         struct vhost_dev *vdev;
2295         int lcore;
2296
2297         dev->flags &= ~VIRTIO_DEV_RUNNING;
2298
2299         vdev = (struct vhost_dev *)dev->priv;
2300         /*set the remove flag. */
2301         vdev->remove = 1;
2302         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2303                 rte_pause();
2304         }
2305
2306         /* Search for entry to be removed from lcore ll */
2307         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2308         while (ll_lcore_dev_cur != NULL) {
2309                 if (ll_lcore_dev_cur->vdev == vdev) {
2310                         break;
2311                 } else {
2312                         ll_lcore_dev_last = ll_lcore_dev_cur;
2313                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2314                 }
2315         }
2316
2317         if (ll_lcore_dev_cur == NULL) {
2318                 RTE_LOG(ERR, VHOST_CONFIG,
2319                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2320                         dev->device_fh);
2321                 return;
2322         }
2323
2324         /* Search for entry to be removed from main ll */
2325         ll_main_dev_cur = ll_root_used;
2326         ll_main_dev_last = NULL;
2327         while (ll_main_dev_cur != NULL) {
2328                 if (ll_main_dev_cur->vdev == vdev) {
2329                         break;
2330                 } else {
2331                         ll_main_dev_last = ll_main_dev_cur;
2332                         ll_main_dev_cur = ll_main_dev_cur->next;
2333                 }
2334         }
2335
2336         /* Remove entries from the lcore and main ll. */
2337         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2338         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2339
2340         /* Set the dev_removal_flag on each lcore. */
2341         RTE_LCORE_FOREACH_SLAVE(lcore) {
2342                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2343         }
2344
2345         /*
2346          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2347          * they can no longer access the device removed from the linked lists and that the devices
2348          * are no longer in use.
2349          */
2350         RTE_LCORE_FOREACH_SLAVE(lcore) {
2351                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2352                         rte_pause();
2353                 }
2354         }
2355
2356         /* Add the entries back to the lcore and main free ll.*/
2357         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2358         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2359
2360         /* Decrement number of device on the lcore. */
2361         lcore_info[vdev->coreid].lcore_ll->device_num--;
2362
2363         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2364
2365         if (zero_copy) {
2366                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2367
2368                 /* Stop the RX queue. */
2369                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2370                         LOG_DEBUG(VHOST_CONFIG,
2371                                 "(%"PRIu64") In destroy_device: Failed to stop "
2372                                 "rx queue:%d\n",
2373                                 dev->device_fh,
2374                                 vdev->vmdq_rx_q);
2375                 }
2376
2377                 LOG_DEBUG(VHOST_CONFIG,
2378                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2379                         "mempool back to ring for RX queue: %d\n",
2380                         dev->device_fh, vdev->vmdq_rx_q);
2381
2382                 mbuf_destroy_zcp(vpool);
2383
2384                 /* Stop the TX queue. */
2385                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2386                         LOG_DEBUG(VHOST_CONFIG,
2387                                 "(%"PRIu64") In destroy_device: Failed to "
2388                                 "stop tx queue:%d\n",
2389                                 dev->device_fh, vdev->vmdq_rx_q);
2390                 }
2391
2392                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2393
2394                 LOG_DEBUG(VHOST_CONFIG,
2395                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2396                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2397                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2398                         dev->device_fh);
2399
2400                 mbuf_destroy_zcp(vpool);
2401                 rte_free(vdev->regions_hpa);
2402         }
2403         rte_free(vdev);
2404
2405 }
2406
2407 /*
2408  * Calculate the number of physically contiguous sub-regions within one particular
2409  * region whose vhost virtual address range is contiguous. The particular region
2410  * starts at vva_start, with its size given by the 'size' argument.
2411  */
2412 static uint32_t
2413 check_hpa_regions(uint64_t vva_start, uint64_t size)
2414 {
2415         uint32_t i, nregions = 0, page_size = getpagesize();
2416         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2417         if (vva_start % page_size) {
2418                 LOG_DEBUG(VHOST_CONFIG,
2419                         "in check_continuous: vva start(%p) mod page_size(%d) "
2420                         "has remainder\n",
2421                         (void *)(uintptr_t)vva_start, page_size);
2422                 return 0;
2423         }
2424         if (size % page_size) {
2425                 LOG_DEBUG(VHOST_CONFIG,
2426                         "in check_continuous: "
2427                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2428                         size, page_size);
2429                 return 0;
2430         }
2431         for (i = 0; i < size - page_size; i = i + page_size) {
2432                 cur_phys_addr
2433                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2434                 next_phys_addr = rte_mem_virt2phy(
2435                         (void *)(uintptr_t)(vva_start + i + page_size));
2436                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2437                         ++nregions;
2438                         LOG_DEBUG(VHOST_CONFIG,
2439                                 "in check_continuous: hva addr:(%p) is not "
2440                                 "continuous with hva addr:(%p), diff:%d\n",
2441                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2442                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2443                                 + page_size), page_size);
2444                         LOG_DEBUG(VHOST_CONFIG,
2445                                 "in check_continuous: hpa addr:(%p) is not "
2446                                 "continuous with hpa addr:(%p), "
2447                                 "diff:(%"PRIu64")\n",
2448                                 (void *)(uintptr_t)cur_phys_addr,
2449                                 (void *)(uintptr_t)next_phys_addr,
2450                                 (next_phys_addr-cur_phys_addr));
2451                 }
2452         }
2453         return nregions;
2454 }
2455
2456 /*
2457  * Divide each region whose vhost virtual address range is contiguous into
2458  * sub-regions, making sure the physical addresses within each sub-region are
2459  * contiguous. Then fill the offset (to GPA), size and other information of each
2460  * sub-region into regions_hpa.
2461  */
2462 static uint32_t
2463 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2464 {
2465         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2466         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2467
2468         if (mem_region_hpa == NULL)
2469                 return 0;
2470
2471         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2472                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2473                         virtio_memory->regions[regionidx].address_offset;
2474                 mem_region_hpa[regionidx_hpa].guest_phys_address
2475                         = virtio_memory->regions[regionidx].guest_phys_address;
2476                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2477                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2478                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2479                 LOG_DEBUG(VHOST_CONFIG,
2480                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2481                         regionidx_hpa,
2482                         (void *)(uintptr_t)
2483                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2484                 LOG_DEBUG(VHOST_CONFIG,
2485                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2486                         regionidx_hpa,
2487                         (void *)(uintptr_t)
2488                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
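                /*
                 * Walk the region page by page; 'k' accumulates the size of the
                 * current physically contiguous sub-region and is reset whenever
                 * a discontinuity in host physical addresses is found.
                 */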
2489                 for (i = 0, k = 0;
2490                         i < virtio_memory->regions[regionidx].memory_size -
2491                                 page_size;
2492                         i += page_size) {
2493                         cur_phys_addr = rte_mem_virt2phy(
2494                                         (void *)(uintptr_t)(vva_start + i));
2495                         next_phys_addr = rte_mem_virt2phy(
2496                                         (void *)(uintptr_t)(vva_start +
2497                                         i + page_size));
2498                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2499                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2500                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2501                                         k + page_size;
2502                                 mem_region_hpa[regionidx_hpa].memory_size
2503                                         = k + page_size;
2504                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2505                                         "phys addr end  [%d]:(%p)\n",
2506                                         regionidx_hpa,
2507                                         (void *)(uintptr_t)
2508                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2509                                 LOG_DEBUG(VHOST_CONFIG,
2510                                         "in fill_hpa_regions: guest phys addr "
2511                                         "size [%d]:(%p)\n",
2512                                         regionidx_hpa,
2513                                         (void *)(uintptr_t)
2514                                         (mem_region_hpa[regionidx_hpa].memory_size));
2515                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2516                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2517                                 ++regionidx_hpa;
2518                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2519                                         next_phys_addr -
2520                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2521                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2522                                         " phys addr start[%d]:(%p)\n",
2523                                         regionidx_hpa,
2524                                         (void *)(uintptr_t)
2525                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2526                                 LOG_DEBUG(VHOST_CONFIG,
2527                                         "in fill_hpa_regions: host  phys addr "
2528                                         "start[%d]:(%p)\n",
2529                                         regionidx_hpa,
2530                                         (void *)(uintptr_t)
2531                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2532                                 k = 0;
2533                         } else {
2534                                 k += page_size;
2535                         }
2536                 }
2537                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2538                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2539                         + k + page_size;
2540                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2541                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2542                         "[%d]:(%p)\n", regionidx_hpa,
2543                         (void *)(uintptr_t)
2544                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2545                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2546                         "[%d]:(%p)\n", regionidx_hpa,
2547                         (void *)(uintptr_t)
2548                         (mem_region_hpa[regionidx_hpa].memory_size));
2549                 ++regionidx_hpa;
2550         }
2551         return regionidx_hpa;
2552 }
2553
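/*
 * A minimal sketch (not part of the original code) of how the regions_hpa
 * table built above could be consumed: translate a guest physical address
 * into a host physical address by locating its sub-region and adding that
 * sub-region's host_phys_addr_offset.
 */
static inline uint64_t
gpa_to_hpa_sketch(struct vhost_dev *vdev, uint64_t guest_pa)
{
        uint32_t i;

        for (i = 0; i < vdev->nregions_hpa; i++) {
                struct virtio_memory_regions_hpa *region = &vdev->regions_hpa[i];

                /* Sub-regions cover [guest_phys_address, guest_phys_address_end). */
                if ((guest_pa >= region->guest_phys_address) &&
                        (guest_pa < region->guest_phys_address_end))
                        return guest_pa + region->host_phys_addr_offset;
        }

        /* Translation failure; callers would need to treat 0 as "not found". */
        return 0;
}
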
2554 /*
2555  * A new device is added to a data core. First the device is added to the main linked list
2556  * and then allocated to a specific data core.
2557  */
2558 static int
2559 new_device (struct virtio_net *dev)
2560 {
2561         struct virtio_net_data_ll *ll_dev;
2562         int lcore, core_add = 0;
2563         uint32_t device_num_min = num_devices;
2564         struct vhost_dev *vdev;
2565         uint32_t regionidx;
2566
2567         vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2568         if (vdev == NULL) {
2569                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2570                         dev->device_fh);
2571                 return -1;
2572         }
2573         vdev->dev = dev;
2574         dev->priv = vdev;
2575
2576         if (zero_copy) {
2577                 vdev->nregions_hpa = dev->mem->nregions;
2578                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2579                         vdev->nregions_hpa
2580                                 += check_hpa_regions(
2581                                         dev->mem->regions[regionidx].guest_phys_address
2582                                         + dev->mem->regions[regionidx].address_offset,
2583                                         dev->mem->regions[regionidx].memory_size);
2584
2585                 }
2586
2587                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2588                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2589                         CACHE_LINE_SIZE);
2590                 if (vdev->regions_hpa == NULL) {
2591                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2592                         rte_free(vdev);
2593                         return -1;
2594                 }
2595
2596
2597                 if (fill_hpa_memory_regions(
2598                         vdev->regions_hpa, dev->mem
2599                         ) != vdev->nregions_hpa) {
2600
2601                         RTE_LOG(ERR, VHOST_CONFIG,
2602                                 "hpa memory regions number mismatch: "
2603                                 "[%d]\n", vdev->nregions_hpa);
2604                         rte_free(vdev->regions_hpa);
2605                         rte_free(vdev);
2606                         return -1;
2607                 }
2608         }
2609
2610
2611         /* Add device to main ll */
2612         ll_dev = get_data_ll_free_entry(&ll_root_free);
2613         if (ll_dev == NULL) {
2614                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2615                         "of %d devices per core has been reached\n",
2616                         dev->device_fh, num_devices);
2617                 if (vdev->regions_hpa)
2618                         rte_free(vdev->regions_hpa);
2619                 rte_free(vdev);
2620                 return -1;
2621         }
2622         ll_dev->vdev = vdev;
2623         add_data_ll_entry(&ll_root_used, ll_dev);
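        /*
         * VMDq RX queues are divided evenly among devices; the queue index is
         * derived from the device's file handle.
         */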
2624         vdev->vmdq_rx_q
2625                 = dev->device_fh * (num_queues / num_devices);
2626
2627         if (zero_copy) {
2628                 uint32_t index = vdev->vmdq_rx_q;
2629                 uint32_t count_in_ring, i;
2630                 struct mbuf_table *tx_q;
2631
2632                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2633
2634                 LOG_DEBUG(VHOST_CONFIG,
2635                         "(%"PRIu64") in new_device: mbuf count in mempool "
2636                         "before attach is: %d\n",
2637                         dev->device_fh,
2638                         rte_mempool_count(vpool_array[index].pool));
2639                 LOG_DEBUG(VHOST_CONFIG,
2640                         "(%"PRIu64") in new_device: mbuf count in  ring "
2641                         "before attach  is : %d\n",
2642                         dev->device_fh, count_in_ring);
2643
2644                 /*
2645                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2646                  */
2647                 for (i = 0; i < count_in_ring; i++)
2648                         attach_rxmbuf_zcp(dev);
2649
2650                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2651                         "mempool after attach is: %d\n",
2652                         dev->device_fh,
2653                         rte_mempool_count(vpool_array[index].pool));
2654                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2655                         "ring after attach  is : %d\n",
2656                         dev->device_fh,
2657                         rte_ring_count(vpool_array[index].ring));
2658
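                /* Reuse the device's VMDq RX queue index for its zero-copy TX queue. */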
2659                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2660                 tx_q->txq_id = vdev->vmdq_rx_q;
2661
2662                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2663                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2664
2665                         LOG_DEBUG(VHOST_CONFIG,
2666                                 "(%"PRIu64") In new_device: Failed to start "
2667                                 "tx queue:%d\n",
2668                                 dev->device_fh, vdev->vmdq_rx_q);
2669
2670                         mbuf_destroy_zcp(vpool);
2671                         rte_free(vdev->regions_hpa);
2672                         rte_free(vdev);
2673                         return -1;
2674                 }
2675
2676                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2677                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2678
2679                         LOG_DEBUG(VHOST_CONFIG,
2680                                 "(%"PRIu64") In new_device: Failed to start "
2681                                 "rx queue:%d\n",
2682                                 dev->device_fh, vdev->vmdq_rx_q);
2683
2684                         /* Stop the TX queue. */
2685                         if (rte_eth_dev_tx_queue_stop(ports[0],
2686                                 vdev->vmdq_rx_q) != 0) {
2687                                 LOG_DEBUG(VHOST_CONFIG,
2688                                         "(%"PRIu64") In new_device: Failed to "
2689                                         "stop tx queue:%d\n",
2690                                         dev->device_fh, vdev->vmdq_rx_q);
2691                         }
2692
2693                         mbuf_destroy_zcp(vpool);
2694                         rte_free(vdev->regions_hpa);
2695                         rte_free(vdev);
2696                         return -1;
2697                 }
2698
2699         }
2700
2701         /* Reset ready flag. */
2702         vdev->ready = DEVICE_MAC_LEARNING;
2703         vdev->remove = 0;
2704
2705         /* Find a suitable lcore to add the device: pick the one with the fewest devices. */
2706         RTE_LCORE_FOREACH_SLAVE(lcore) {
2707                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2708                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2709                         core_add = lcore;
2710                 }
2711         }
2712         /* Add device to lcore ll */
2713         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2714         if (ll_dev == NULL) {
2715                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2716                 vdev->ready = DEVICE_SAFE_REMOVE;
2717                 destroy_device(dev);
2718                 if (vdev->regions_hpa)
2719                         rte_free(vdev->regions_hpa);
2720                 rte_free(vdev);
2721                 return -1;
2722         }
2723         ll_dev->vdev = vdev;
2724         vdev->coreid = core_add;
2725
2726         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2727
2728         /* Initialize device stats */
2729         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2730
2731         /* Disable notifications. */
2732         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2733         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2734         lcore_info[vdev->coreid].lcore_ll->device_num++;
2735         dev->flags |= VIRTIO_DEV_RUNNING;
2736
2737         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2738
2739         return 0;
2740 }
2741
2742 /*
2743  * These callbacks allow devices to be added to the data core when configuration
2744  * has fully completed.
2745  */
2746 static const struct virtio_net_device_ops virtio_net_device_ops =
2747 {
2748         .new_device =  new_device,
2749         .destroy_device = destroy_device,
2750 };
2751
2752 /*
2753  * This is a thread that will wake up after a period to print stats if the user has
2754  * enabled them.
2755  */
2756 static void
2757 print_stats(void)
2758 {
2759         struct virtio_net_data_ll *dev_ll;
2760         uint64_t tx_dropped, rx_dropped;
2761         uint64_t tx, tx_total, rx, rx_total;
2762         uint32_t device_fh;
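        /* ANSI escape sequences to clear the screen and move the cursor to the top-left corner. */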
2763         const char clr[] = { 27, '[', '2', 'J', '\0' };
2764         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2765
2766         while(1) {
2767                 sleep(enable_stats);
2768
2769                 /* Clear screen and move to top left */
2770                 printf("%s%s", clr, top_left);
2771
2772                 printf("\nDevice statistics ====================================");
2773
2774                 dev_ll = ll_root_used;
2775                 while (dev_ll != NULL) {
2776                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2777                         tx_total = dev_statistics[device_fh].tx_total;
2778                         tx = dev_statistics[device_fh].tx;
2779                         tx_dropped = tx_total - tx;
2780                         if (zero_copy == 0) {
2781                                 rx_total = rte_atomic64_read(
2782                                         &dev_statistics[device_fh].rx_total_atomic);
2783                                 rx = rte_atomic64_read(
2784                                         &dev_statistics[device_fh].rx_atomic);
2785                         } else {
2786                                 rx_total = dev_statistics[device_fh].rx_total;
2787                                 rx = dev_statistics[device_fh].rx;
2788                         }
2789                         rx_dropped = rx_total - rx;
2790
2791                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2792                                         "\nTX total:            %"PRIu64""
2793                                         "\nTX dropped:          %"PRIu64""
2794                                         "\nTX successful:               %"PRIu64""
2795                                         "\nRX total:            %"PRIu64""
2796                                         "\nRX dropped:          %"PRIu64""
2797                                         "\nRX successful:               %"PRIu64"",
2798                                         device_fh,
2799                                         tx_total,
2800                                         tx_dropped,
2801                                         tx,
2802                                         rx_total,
2803                                         rx_dropped,
2804                                         rx);
2805
2806                         dev_ll = dev_ll->next;
2807                 }
2808                 printf("\n======================================================\n");
2809         }
2810 }
2811
2812 static void
2813 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2814         char *ring_name, uint32_t nb_mbuf)
2815 {
2816         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
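        /*
         * Each zero-copy queue gets a mempool/ring pair: the pool holds the
         * mbufs, while the ring stages mbufs that are not currently attached
         * to guest buffers.
         */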
2817         vpool_array[index].pool
2818                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2819                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2820                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2821                 rte_pktmbuf_init, NULL, socket, 0);
2822         if (vpool_array[index].pool != NULL) {
2823                 vpool_array[index].ring
2824                         = rte_ring_create(ring_name,
2825                                 rte_align32pow2(nb_mbuf + 1),
2826                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2827                 if (likely(vpool_array[index].ring != NULL)) {
2828                         LOG_DEBUG(VHOST_CONFIG,
2829                                 "in setup_mempool_tbl: mbuf count in "
2830                                 "mempool is: %d\n",
2831                                 rte_mempool_count(vpool_array[index].pool));
2832                         LOG_DEBUG(VHOST_CONFIG,
2833                                 "in setup_mempool_tbl: mbuf count in "
2834                                 "ring   is: %d\n",
2835                                 rte_ring_count(vpool_array[index].ring));
2836                 } else {
2837                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2838                                 ring_name);
2839                 }
2840
2841                 /* Need to take the headroom into account. */
2842                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2843         } else {
2844                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2845         }
2846 }
2847
2848
2849 /*
2850  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2851  * device is also registered here to handle the IOCTLs.
2852  */
2853 int
2854 MAIN(int argc, char *argv[])
2855 {
2856         struct rte_mempool *mbuf_pool = NULL;
2857         unsigned lcore_id, core_id = 0;
2858         unsigned nb_ports, valid_num_ports;
2859         int ret;
2860         uint8_t portid, queue_id = 0;
2861         static pthread_t tid;
2862
2863         /* init EAL */
2864         ret = rte_eal_init(argc, argv);
2865         if (ret < 0)
2866                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2867         argc -= ret;
2868         argv += ret;
2869
2870         /* parse app arguments */
2871         ret = us_vhost_parse_args(argc, argv);
2872         if (ret < 0)
2873                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2874
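        /* Record the IDs of all enabled lcores. */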
2875         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2876                 if (rte_lcore_is_enabled(lcore_id))
2877                         lcore_ids[core_id ++] = lcore_id;
2878
2879         if (rte_lcore_count() > RTE_MAX_LCORE)
2880                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
2881
2882         /* Set the number of switching cores available. */
2883         num_switching_cores = rte_lcore_count()-1;
2884
2885         /* Get the number of physical ports. */
2886         nb_ports = rte_eth_dev_count();
2887         if (nb_ports > RTE_MAX_ETHPORTS)
2888                 nb_ports = RTE_MAX_ETHPORTS;
2889
2890         /*
2891          * Update the global var NUM_PORTS and global array PORTS
2892          * and get the value of var VALID_NUM_PORTS according to the number of system ports
2893          */
2894         valid_num_ports = check_ports_num(nb_ports);
2895
2896         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2897                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
2898                         "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
2899                 return -1;
2900         }
2901
2902         if (zero_copy == 0) {
2903                 /* Create the mbuf pool. */
2904                 mbuf_pool = rte_mempool_create(
2905                                 "MBUF_POOL",
2906                                 NUM_MBUFS_PER_PORT
2907                                 * valid_num_ports,
2908                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2909                                 sizeof(struct rte_pktmbuf_pool_private),
2910                                 rte_pktmbuf_pool_init, NULL,
2911                                 rte_pktmbuf_init, NULL,
2912                                 rte_socket_id(), 0);
2913                 if (mbuf_pool == NULL)
2914                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2915
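                /* In non-zero-copy mode, all queues share the single mbuf pool. */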
2916                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2917                         vpool_array[queue_id].pool = mbuf_pool;
2918
2919                 if (vm2vm_mode == VM2VM_HARDWARE) {
2920                         /* Enable VT loopback so the L2 switch handles VM-to-VM traffic. */
2921                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2922                         LOG_DEBUG(VHOST_CONFIG,
2923                                 "Enable loop back for L2 switch in vmdq.\n");
2924                 }
2925         } else {
2926                 uint32_t nb_mbuf;
2927                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2928                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2929
2930                 /*
2931                  * Zero copy defers queue RX/TX start to the time when guest
2932                  * finishes its startup and packet buffers from that guest are
2933                  * available.
2934                  */
2935                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2936                 rx_conf_default.rx_drop_en = 0;
2937                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
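                /*
                 * Size each RX mempool for the RX descriptors plus per-core
                 * cache and burst headroom.
                 */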
2938                 nb_mbuf = num_rx_descriptor
2939                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2940                         + num_switching_cores * MAX_PKT_BURST;
2941
2942                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2943                         snprintf(pool_name, sizeof(pool_name),
2944                                 "rxmbuf_pool_%u", queue_id);
2945                         snprintf(ring_name, sizeof(ring_name),
2946                                 "rxmbuf_ring_%u", queue_id);
2947                         setup_mempool_tbl(rte_socket_id(), queue_id,
2948                                 pool_name, ring_name, nb_mbuf);
2949                 }
2950
2951                 nb_mbuf = num_tx_descriptor
2952                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2953                                 + num_switching_cores * MAX_PKT_BURST;
2954
2955                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2956                         snprintf(pool_name, sizeof(pool_name),
2957                                 "txmbuf_pool_%u", queue_id);
2958                         snprintf(ring_name, sizeof(ring_name),
2959                                 "txmbuf_ring_%u", queue_id);
2960                         setup_mempool_tbl(rte_socket_id(),
2961                                 (queue_id + MAX_QUEUES),
2962                                 pool_name, ring_name, nb_mbuf);
2963                 }
2964
2965                 if (vm2vm_mode == VM2VM_HARDWARE) {
2966                         /* Enable VT loopback so the L2 switch handles VM-to-VM traffic. */
2967                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2968                         LOG_DEBUG(VHOST_CONFIG,
2969                                 "Enable loop back for L2 switch in vmdq.\n");
2970                 }
2971         }
2972         /* Set log level. */
2973         rte_set_log_level(LOG_LEVEL);
2974
2975         /* initialize all ports */
2976         for (portid = 0; portid < nb_ports; portid++) {
2977                 /* skip ports that are not enabled */
2978                 if ((enabled_port_mask & (1 << portid)) == 0) {
2979                         RTE_LOG(INFO, VHOST_PORT,
2980                                 "Skipping disabled port %d\n", portid);
2981                         continue;
2982                 }
2983                 if (port_init(portid) != 0)
2984                         rte_exit(EXIT_FAILURE,
2985                                 "Cannot initialize network ports\n");
2986         }
2987
2988         /* Initialise all linked lists. */
2989         if (init_data_ll() == -1)
2990                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2991
2992         /* Initialize device stats */
2993         memset(&dev_statistics, 0, sizeof(dev_statistics));
2994
2995         /* Enable stats if the user option is set. */
2996         if (enable_stats)
2997                 pthread_create(&tid, NULL, (void*)print_stats, NULL );
2998
2999         /* Launch all data cores. */
3000         if (zero_copy == 0) {
3001                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3002                         rte_eal_remote_launch(switch_worker,
3003                                 mbuf_pool, lcore_id);
3004                 }
3005         } else {
3006                 uint32_t count_in_mempool, index, i;
3007                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3008                         /* For all RX and TX queues. */
3009                         count_in_mempool
3010                                 = rte_mempool_count(vpool_array[index].pool);
3011
3012                         /*
3013                          * Transfer all unattached mbufs from vpool.pool
3014                          * to vpool.ring.
3015                          */
3016                         for (i = 0; i < count_in_mempool; i++) {
3017                                 struct rte_mbuf *mbuf
3018                                         = __rte_mbuf_raw_alloc(
3019                                                 vpool_array[index].pool);
3020                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3021                                                 (void *)mbuf);
3022                         }
3023
3024                         LOG_DEBUG(VHOST_CONFIG,
3025                                 "in MAIN: mbuf count in mempool at initial "
3026                                 "is: %d\n", count_in_mempool);
3027                         LOG_DEBUG(VHOST_CONFIG,
3028                                 "in MAIN: mbuf count in  ring at initial  is :"
3029                                 " %d\n",
3030                                 rte_ring_count(vpool_array[index].ring));
3031                 }
3032
3033                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3034                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3035                                 lcore_id);
3036         }
3037
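        /* If mergeable RX buffers were not requested, mask out the feature bit. */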
3038         if (mergeable == 0)
3039                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3040
3041         /* Register CUSE device to handle IOCTLs. */
3042         ret = rte_vhost_driver_register((char *)&dev_basename);
3043         if (ret != 0)
3044                 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3045
3046         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3047
3048         /* Start CUSE session. */
3049         rte_vhost_driver_session_start();
3050         return 0;
3051
3052 }
3053