[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52
53 #include "main.h"
54 #include "virtio-net.h"
55 #include "vhost-net-cdev.h"
56
57 #define MAX_QUEUES 128
58
59 /* the maximum number of external ports supported */
60 #define MAX_SUP_PORTS 1
61
62 /*
63  * Calculate the number of buffers needed per port
64  */
65 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +             \
66                                                         (num_switching_cores*MAX_PKT_BURST) +                   \
67                                                         (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
68                                                         (num_switching_cores*MBUF_CACHE_SIZE))
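/*
 * For example, with the defaults below (MAX_QUEUES = 128,
 * RTE_TEST_RX_DESC_DEFAULT = 1024, MAX_PKT_BURST = 32,
 * RTE_TEST_TX_DESC_DEFAULT = 512, MBUF_CACHE_SIZE = 128) and an assumed
 * num_switching_cores of 4, this works out to roughly
 * (128 * 1024) + (4 * 32) + (4 * 512) + (4 * 128) = 133760 mbufs per port;
 * the actual value depends on the core count given on the command line.
 */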
69
70 #define MBUF_CACHE_SIZE 128
71 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
72
73 /*
74  * No frame data buffers allocated from the host are required for the
75  * zero copy implementation: the guest allocates the frame data buffers
76  * and vhost uses them directly.
77  */
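/*
 * 1518 bytes is the largest standard (non-jumbo) Ethernet frame:
 * 1500 bytes of payload plus a 14-byte header and a 4-byte FCS.
 */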
78 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
79 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
80         + RTE_PKTMBUF_HEADROOM)
81 #define MBUF_CACHE_SIZE_ZCP 0
82
83 /*
84  * RX and TX Prefetch, Host, and Write-back threshold values should be
85  * carefully set for optimal performance. Consult the network
86  * controller's datasheet and supporting DPDK documentation for guidance
87  * on how these parameters should be set.
88  */
89 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
90 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
91 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
92
93 /*
94  * These default values are optimized for use with the Intel(R) 82599 10 GbE
95  * Controller and the DPDK ixgbe PMD. Consider using other values for other
96  * network controllers and/or network drivers.
97  */
98 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
99 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
100 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
101
102 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
103 #define MAX_MRG_PKT_BURST 16    /* Max burst for merge buffers. Set to 1 due to performance issue. */
104 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
105
106 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
107 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
108
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX                       1
112 #define DEVICE_SAFE_REMOVE      2
113
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121
122 /*
123  * These two macros need refinement for the legacy and DPDK-based front ends:
124  * take the maximum number of available vring descriptors/entries from the
125  * guest, subtract MAX_PKT_BURST, and then round to a power of 2.
126  */
127 /*
128  * For legacy front end, 128 descriptors,
129  * half for virtio header, another half for mbuf.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136                 + sizeof(struct rte_mbuf)))
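/*
 * The first word of the headroom (which starts immediately after the
 * rte_mbuf structure) gives the zero-copy path a place to stash per-mbuf
 * bookkeeping, such as the guest descriptor index an mbuf is attached to,
 * without touching the packet data itself.
 */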
137
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
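/*
 * For example, 64 is 0b1000000 and 63 is 0b0111111, so ((64-1) & 64) == 0;
 * for a non-power-of-two such as 6 (0b110), ((6-1) & 6) == 0b100 != 0.
 * Note that 0 also passes this test.
 */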
140
141 #define INVALID_PORT_ID 0xFF
142
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145
146 /* Size of buffers used for rte_snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
157
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
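/*
 * With a 64-byte cache line and a 16-byte struct vring_desc this evaluates
 * to 4, i.e. four descriptors share one cache line.
 */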
160
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163
164 /*Number of switching cores enabled*/
165 static uint32_t num_switching_cores = 0;
166
167 /* number of devices/queues to support*/
168 static uint32_t num_queues = 0;
169 uint32_t num_devices = 0;
170
171 /*
172  * Enable zero copy: packet buffers are DMA'd directly to/from the guest's
173  * buffers via the hardware descriptors. Disabled by default.
174  */
175 static uint32_t zero_copy;
176
177 /* Number of descriptors to apply */
178 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
179 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
180
181 /* Max number of ring descriptors: ixgbe, i40e and e1000 all support 4096. */
182 #define MAX_RING_DESC 4096
183
184 struct vpool {
185         struct rte_mempool *pool;
186         struct rte_ring *ring;
187         uint32_t buf_size;
188 } vpool_array[MAX_QUEUES+MAX_QUEUES];
189
190 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
191 typedef enum {
192         VM2VM_DISABLED = 0,
193         VM2VM_SOFTWARE = 1,
194         VM2VM_HARDWARE = 2,
195         VM2VM_LAST
196 } vm2vm_type;
197 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
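/*
 * VM2VM_SOFTWARE forwards guest-to-guest traffic inside this application by
 * copying straight into the destination device's RX virtqueue, while
 * VM2VM_HARDWARE tags the packet with the destination's VLAN and sends it
 * out the port, relying on the NIC's VMDQ loopback to deliver it to the
 * other pool.
 */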
198
199 /* The type of host physical address translated from guest physical address. */
200 typedef enum {
201         PHYS_ADDR_CONTINUOUS = 0,
202         PHYS_ADDR_CROSS_SUBREG = 1,
203         PHYS_ADDR_INVALID = 2,
204         PHYS_ADDR_LAST
205 } hpa_type;
206
207 /* Enable stats. */
208 static uint32_t enable_stats = 0;
209 /* Enable retries on RX. */
210 static uint32_t enable_retry = 1;
211 /* Specify timeout (in useconds) between retries on RX. */
212 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
213 /* Specify the number of retries on RX. */
214 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
215
216 /* Character device basename. Can be set by user. */
217 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
218
219 /* Character device index. Can be set by user. */
220 static uint32_t dev_index = 0;
221
222 /* This can be set by the user so it is made available here. */
223 extern uint64_t VHOST_FEATURES;
224
225 /* Default configuration for rx and tx thresholds etc. */
226 static struct rte_eth_rxconf rx_conf_default = {
227         .rx_thresh = {
228                 .pthresh = RX_PTHRESH,
229                 .hthresh = RX_HTHRESH,
230                 .wthresh = RX_WTHRESH,
231         },
232         .rx_drop_en = 1,
233 };
234
235 /*
236  * These default values are optimized for use with the Intel(R) 82599 10 GbE
237  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
238  * network controllers and/or network drivers.
239  */
240 static struct rte_eth_txconf tx_conf_default = {
241         .tx_thresh = {
242                 .pthresh = TX_PTHRESH,
243                 .hthresh = TX_HTHRESH,
244                 .wthresh = TX_WTHRESH,
245         },
246         .tx_free_thresh = 0, /* Use PMD default values */
247         .tx_rs_thresh = 0, /* Use PMD default values */
248 };
249
250 /* Empty VMDQ configuration structure. Filled in programmatically. */
251 static struct rte_eth_conf vmdq_conf_default = {
252         .rxmode = {
253                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
254                 .split_hdr_size = 0,
255                 .header_split   = 0, /**< Header Split disabled */
256                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
257                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
258                 /*
259                  * Necessary for 1G NICs such as the I350: this fixes a
260                  * bug where IPv4 forwarding in the guest could not
261                  * forward packets from one virtio device to another.
262                  */
263                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
264                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
265                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
266         },
267
268         .txmode = {
269                 .mq_mode = ETH_MQ_TX_NONE,
270         },
271         .rx_adv_conf = {
272                 /*
273                  * should be overridden separately in code with
274                  * appropriate values
275                  */
276                 .vmdq_rx_conf = {
277                         .nb_queue_pools = ETH_8_POOLS,
278                         .enable_default_pool = 0,
279                         .default_pool = 0,
280                         .nb_pool_maps = 0,
281                         .pool_map = {{0, 0},},
282                 },
283         },
284 };
285
286 static unsigned lcore_ids[RTE_MAX_LCORE];
287 static uint8_t ports[RTE_MAX_ETHPORTS];
288 static unsigned num_ports = 0; /**< The number of ports specified in command line */
289
290 static const uint16_t external_pkt_default_vlan_tag = 2000;
291 const uint16_t vlan_tags[] = {
292         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
293         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
294         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
295         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
296         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
297         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
298         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
299         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
300 };
301
302 /* ethernet addresses of ports */
303 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
304
305 /* heads for the main used and free linked lists for the data path. */
306 static struct virtio_net_data_ll *ll_root_used = NULL;
307 static struct virtio_net_data_ll *ll_root_free = NULL;
308
309 /* Array of data core structures containing information on individual core linked lists. */
310 static struct lcore_info lcore_info[RTE_MAX_LCORE];
311
312 /* Used for queueing bursts of TX packets. */
313 struct mbuf_table {
314         unsigned len;
315         unsigned txq_id;
316         struct rte_mbuf *m_table[MAX_PKT_BURST];
317 };
318
319 /* TX queue for each data core. */
320 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
321
322 /* TX queue for each virtio device for zero copy. */
323 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
324
325 /* Vlan header struct used to insert vlan tags on TX. */
326 struct vlan_ethhdr {
327         unsigned char   h_dest[ETH_ALEN];
328         unsigned char   h_source[ETH_ALEN];
329         __be16          h_vlan_proto;
330         __be16          h_vlan_TCI;
331         __be16          h_vlan_encapsulated_proto;
332 };
333
334 /* IPv4 Header */
335 struct ipv4_hdr {
336         uint8_t  version_ihl;           /**< version and header length */
337         uint8_t  type_of_service;       /**< type of service */
338         uint16_t total_length;          /**< length of packet */
339         uint16_t packet_id;             /**< packet ID */
340         uint16_t fragment_offset;       /**< fragmentation offset */
341         uint8_t  time_to_live;          /**< time to live */
342         uint8_t  next_proto_id;         /**< protocol ID */
343         uint16_t hdr_checksum;          /**< header checksum */
344         uint32_t src_addr;              /**< source address */
345         uint32_t dst_addr;              /**< destination address */
346 } __attribute__((__packed__));
347
348 /* Header lengths. */
349 #define VLAN_HLEN       4
350 #define VLAN_ETH_HLEN   18
351
352 /* Per-device statistics struct */
353 struct device_statistics {
354         uint64_t tx_total;
355         rte_atomic64_t rx_total_atomic;
356         uint64_t rx_total;
357         uint64_t tx;
358         rte_atomic64_t rx_atomic;
359         uint64_t rx;
360 } __rte_cache_aligned;
361 struct device_statistics dev_statistics[MAX_DEVICES];
362
363 /*
364  * Builds up the correct configuration for VMDQ VLAN pool map
365  * according to the pool & queue limits.
366  */
367 static inline int
368 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
369 {
370         struct rte_eth_vmdq_rx_conf conf;
371         unsigned i;
372
373         memset(&conf, 0, sizeof(conf));
374         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
375         conf.nb_pool_maps = num_devices;
376         conf.enable_loop_back =
377                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
378
379         for (i = 0; i < conf.nb_pool_maps; i++) {
380                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
381                 conf.pool_map[i].pools = (1UL << i);
382         }
383
384         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
385         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
386                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
387         return 0;
388 }
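/*
 * With num_devices equal to 8, for instance, the resulting map assigns
 * VLAN 1000 to pool 0, VLAN 1001 to pool 1, and so on up to VLAN 1007 for
 * pool 7, so each virtio device only receives traffic carrying its own tag.
 */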
389
390 /*
391  * Validate the device number against the max pool number obtained from
392  * dev_info. If the device number is invalid, print an error message and
393  * return -1. Each device must have its own pool.
394  */
395 static inline int
396 validate_num_devices(uint32_t max_nb_devices)
397 {
398         if (num_devices > max_nb_devices) {
399                 RTE_LOG(ERR, PORT, "invalid number of devices\n");
400                 return -1;
401         }
402         return 0;
403 }
404
405 /*
406  * Initialises a given port using global settings and with the rx buffers
407  * coming from the mbuf_pool passed as parameter
408  */
409 static inline int
410 port_init(uint8_t port)
411 {
412         struct rte_eth_dev_info dev_info;
413         struct rte_eth_conf port_conf;
414         uint16_t rx_rings, tx_rings;
415         uint16_t rx_ring_size, tx_ring_size;
416         int retval;
417         uint16_t q;
418
419         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
420         rte_eth_dev_info_get (port, &dev_info);
421
422         /*configure the number of supported virtio devices based on VMDQ limits */
423         num_devices = dev_info.max_vmdq_pools;
424         num_queues = dev_info.max_rx_queues;
425
426         if (zero_copy) {
427                 rx_ring_size = num_rx_descriptor;
428                 tx_ring_size = num_tx_descriptor;
429                 tx_rings = dev_info.max_tx_queues;
430         } else {
431                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
432                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
433                 tx_rings = (uint16_t)rte_lcore_count();
434         }
435
436         retval = validate_num_devices(MAX_DEVICES);
437         if (retval < 0)
438                 return retval;
439
440         /* Get port configuration. */
441         retval = get_eth_conf(&port_conf, num_devices);
442         if (retval < 0)
443                 return retval;
444
445         if (port >= rte_eth_dev_count()) return -1;
446
447         rx_rings = (uint16_t)num_queues;
448         /* Configure ethernet device. */
449         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
450         if (retval != 0)
451                 return retval;
452
453         /* Setup the queues. */
454         for (q = 0; q < rx_rings; q ++) {
455                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
456                                                 rte_eth_dev_socket_id(port), &rx_conf_default,
457                                                 vpool_array[q].pool);
458                 if (retval < 0)
459                         return retval;
460         }
461         for (q = 0; q < tx_rings; q ++) {
462                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
463                                                 rte_eth_dev_socket_id(port), &tx_conf_default);
464                 if (retval < 0)
465                         return retval;
466         }
467
468         /* Start the device. */
469         retval  = rte_eth_dev_start(port);
470         if (retval < 0) {
471                 RTE_LOG(ERR, DATA, "Failed to start the device.\n");
472                 return retval;
473         }
474
475         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
476         RTE_LOG(INFO, PORT, "Max virtio devices supported: %u\n", num_devices);
477         RTE_LOG(INFO, PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
478                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
479                         (unsigned)port,
480                         vmdq_ports_eth_addr[port].addr_bytes[0],
481                         vmdq_ports_eth_addr[port].addr_bytes[1],
482                         vmdq_ports_eth_addr[port].addr_bytes[2],
483                         vmdq_ports_eth_addr[port].addr_bytes[3],
484                         vmdq_ports_eth_addr[port].addr_bytes[4],
485                         vmdq_ports_eth_addr[port].addr_bytes[5]);
486
487         return 0;
488 }
489
490 /*
491  * Set character device basename.
492  */
493 static int
494 us_vhost_parse_basename(const char *q_arg)
495 {
496         /* parse number string */
497
498         if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
499                 return -1;
500         else
501                 rte_snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
502
503         return 0;
504 }
505
506 /*
507  * Parse the portmask provided at run time.
508  */
509 static int
510 parse_portmask(const char *portmask)
511 {
512         char *end = NULL;
513         unsigned long pm;
514
515         errno = 0;
516
517         /* parse hexadecimal string */
518         pm = strtoul(portmask, &end, 16);
519         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
520                 return -1;
521
522         if (pm == 0)
523                 return -1;
524
525         return pm;
526
527 }
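/*
 * The mask is given in hex with one bit per port: for example "-p 0x1"
 * enables only port 0 and "-p 0x3" enables ports 0 and 1.
 */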
528
529 /*
530  * Parse num options at run time.
531  */
532 static int
533 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
534 {
535         char *end = NULL;
536         unsigned long num;
537
538         errno = 0;
539
540         /* parse unsigned int string */
541         num = strtoul(q_arg, &end, 10);
542         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
543                 return -1;
544
545         if (num > max_valid_value)
546                 return -1;
547
548         return num;
549
550 }
551
552 /*
553  * Display usage
554  */
555 static void
556 us_vhost_usage(const char *prgname)
557 {
558         RTE_LOG(INFO, CONFIG, "%s [EAL options] -- -p PORTMASK\n"
559         "               --vm2vm [0|1|2]\n"
560         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
561         "               --dev-basename <name> --dev-index [0-N]\n"
562         "               --nb-devices ND\n"
563         "               -p PORTMASK: Set mask for ports to be used by application\n"
564         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
565         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
566         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
567         "               --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
568         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
569         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
570         "               --dev-basename: The basename to be used for the character device.\n"
571         "               --dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
572         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
573                         "zero copy\n"
574         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
575                         "used only when zero copy is enabled.\n"
576         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
577                         "used only when zero copy is enabled.\n",
578                prgname);
579 }
580
581 /*
582  * Parse the arguments given in the command line of the application.
583  */
584 static int
585 us_vhost_parse_args(int argc, char **argv)
586 {
587         int opt, ret;
588         int option_index;
589         unsigned i;
590         const char *prgname = argv[0];
591         static struct option long_option[] = {
592                 {"vm2vm", required_argument, NULL, 0},
593                 {"rx-retry", required_argument, NULL, 0},
594                 {"rx-retry-delay", required_argument, NULL, 0},
595                 {"rx-retry-num", required_argument, NULL, 0},
596                 {"mergeable", required_argument, NULL, 0},
597                 {"stats", required_argument, NULL, 0},
598                 {"dev-basename", required_argument, NULL, 0},
599                 {"dev-index", required_argument, NULL, 0},
600                 {"zero-copy", required_argument, NULL, 0},
601                 {"rx-desc-num", required_argument, NULL, 0},
602                 {"tx-desc-num", required_argument, NULL, 0},
603                 {NULL, 0, 0, 0},
604         };
605
606         /* Parse command line */
607         while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
608                 switch (opt) {
609                 /* Portmask */
610                 case 'p':
611                         enabled_port_mask = parse_portmask(optarg);
612                         if (enabled_port_mask == 0) {
613                                 RTE_LOG(INFO, CONFIG, "Invalid portmask\n");
614                                 us_vhost_usage(prgname);
615                                 return -1;
616                         }
617                         break;
618
619                 case 0:
620                         /* Enable/disable vm2vm comms. */
621                         if (!strncmp(long_option[option_index].name, "vm2vm",
622                                 MAX_LONG_OPT_SZ)) {
623                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
624                                 if (ret == -1) {
625                                         RTE_LOG(INFO, CONFIG,
626                                                 "Invalid argument for "
627                                                 "vm2vm [0|1|2]\n");
628                                         us_vhost_usage(prgname);
629                                         return -1;
630                                 } else {
631                                         vm2vm_mode = (vm2vm_type)ret;
632                                 }
633                         }
634
635                         /* Enable/disable retries on RX. */
636                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
637                                 ret = parse_num_opt(optarg, 1);
638                                 if (ret == -1) {
639                                         RTE_LOG(INFO, CONFIG, "Invalid argument for rx-retry [0|1]\n");
640                                         us_vhost_usage(prgname);
641                                         return -1;
642                                 } else {
643                                         enable_retry = ret;
644                                 }
645                         }
646
647                         /* Specify the retries delay time (in useconds) on RX. */
648                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
649                                 ret = parse_num_opt(optarg, INT32_MAX);
650                                 if (ret == -1) {
651                                         RTE_LOG(INFO, CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
652                                         us_vhost_usage(prgname);
653                                         return -1;
654                                 } else {
655                                         burst_rx_delay_time = ret;
656                                 }
657                         }
658
659                         /* Specify the retries number on RX. */
660                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
661                                 ret = parse_num_opt(optarg, INT32_MAX);
662                                 if (ret == -1) {
663                                         RTE_LOG(INFO, CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
664                                         us_vhost_usage(prgname);
665                                         return -1;
666                                 } else {
667                                         burst_rx_retry_num = ret;
668                                 }
669                         }
670
671                         /* Enable/disable RX mergeable buffers. */
672                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
673                                 ret = parse_num_opt(optarg, 1);
674                                 if (ret == -1) {
675                                         RTE_LOG(INFO, CONFIG, "Invalid argument for mergeable [0|1]\n");
676                                         us_vhost_usage(prgname);
677                                         return -1;
678                                 } else {
679                                         if (ret)
680                                                 VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
681                                 }
682                         }
683
684                         /* Enable/disable stats. */
685                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
686                                 ret = parse_num_opt(optarg, INT32_MAX);
687                                 if (ret == -1) {
688                                         RTE_LOG(INFO, CONFIG, "Invalid argument for stats [0..N]\n");
689                                         us_vhost_usage(prgname);
690                                         return -1;
691                                 } else {
692                                         enable_stats = ret;
693                                 }
694                         }
695
696                         /* Set character device basename. */
697                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
698                                 if (us_vhost_parse_basename(optarg) == -1) {
699                                         RTE_LOG(INFO, CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
700                                         us_vhost_usage(prgname);
701                                         return -1;
702                                 }
703                         }
704
705                         /* Set character device index. */
706                         if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
707                                 ret = parse_num_opt(optarg, INT32_MAX);
708                                 if (ret == -1) {
709                                         RTE_LOG(INFO, CONFIG, "Invalid argument for character device index [0..N]\n");
710                                         us_vhost_usage(prgname);
711                                         return -1;
712                                 } else
713                                         dev_index = ret;
714                         }
715
716                         /* Enable/disable rx/tx zero copy. */
717                         if (!strncmp(long_option[option_index].name,
718                                 "zero-copy", MAX_LONG_OPT_SZ)) {
719                                 ret = parse_num_opt(optarg, 1);
720                                 if (ret == -1) {
721                                         RTE_LOG(INFO, CONFIG,
722                                                 "Invalid argument"
723                                                 " for zero-copy [0|1]\n");
724                                         us_vhost_usage(prgname);
725                                         return -1;
726                                 } else
727                                         zero_copy = ret;
728
729                                 if (zero_copy) {
730 #ifdef RTE_MBUF_SCATTER_GATHER
731                                         RTE_LOG(ERR, CONFIG, "Before running "
732                                         "zero copy vhost APP, please "
733                                         "disable RTE_MBUF_SCATTER_GATHER\n"
734                                         "in config file and then rebuild DPDK "
735                                         "core lib!\n"
736                                         "Otherwise please disable zero copy "
737                                         "flag in command line!\n");
738                                         return -1;
739 #endif
740                                 }
741                         }
742
743                         /* Specify the descriptor number on RX. */
744                         if (!strncmp(long_option[option_index].name,
745                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
746                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
747                                 if ((ret == -1) || (!POWEROF2(ret))) {
748                                         RTE_LOG(INFO, CONFIG,
749                                         "Invalid argument for rx-desc-num[0-N],"
750                                         "power of 2 required.\n");
751                                         us_vhost_usage(prgname);
752                                         return -1;
753                                 } else {
754                                         num_rx_descriptor = ret;
755                                 }
756                         }
757
758                         /* Specify the descriptor number on TX. */
759                         if (!strncmp(long_option[option_index].name,
760                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
761                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
762                                 if ((ret == -1) || (!POWEROF2(ret))) {
763                                         RTE_LOG(INFO, CONFIG,
764                                         "Invalid argument for tx-desc-num [0-N],"
765                                         "power of 2 required.\n");
766                                         us_vhost_usage(prgname);
767                                         return -1;
768                                 } else {
769                                         num_tx_descriptor = ret;
770                                 }
771                         }
772
773                         break;
774
775                         /* Invalid option - print options. */
776                 default:
777                         us_vhost_usage(prgname);
778                         return -1;
779                 }
780         }
781
782         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
783                 if (enabled_port_mask & (1 << i))
784                         ports[num_ports++] = (uint8_t)i;
785         }
786
787         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
788                 RTE_LOG(INFO, PORT, "Current enabled port number is %u, "
789                         "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
790                 return -1;
791         }
792
793         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
794                 RTE_LOG(INFO, PORT,
795                         "Vhost zero copy doesn't support software vm2vm, "
796                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
797                 return -1;
798         }
799
800         return 0;
801 }
802
803 /*
804  * Update the global variable num_ports and the ports array according to the
805  * number of ports in the system, and return the number of valid ports.
806  */
807 static unsigned check_ports_num(unsigned nb_ports)
808 {
809         unsigned valid_num_ports = num_ports;
810         unsigned portid;
811
812         if (num_ports > nb_ports) {
813                 RTE_LOG(INFO, PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
814                         num_ports, nb_ports);
815                 num_ports = nb_ports;
816         }
817
818         for (portid = 0; portid < num_ports; portid ++) {
819                 if (ports[portid] >= nb_ports) {
820                         RTE_LOG(INFO, PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
821                                 ports[portid], (nb_ports - 1));
822                         ports[portid] = INVALID_PORT_ID;
823                         valid_num_ports--;
824                 }
825         }
826         return valid_num_ports;
827 }
828
829 /*
830  * Macro to print out packet contents. Wrapped in debug define so that the
831  * data path is not affected when debug is disabled.
832  */
833 #ifdef DEBUG
834 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
835         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
836         unsigned int index;                                                                                                                                                                                             \
837         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
838                                                                                                                                                                                                                                         \
839         if ((header))                                                                                                                                                                                                   \
840                 rte_snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                              \
841         else                                                                                                                                                                                                                    \
842                 rte_snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                              \
843         for (index = 0; index < (size); index++) {                                                                                                                                              \
844                 rte_snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),        \
845                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
846         }                                                                                                                                                                                                                               \
847         rte_snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
848                                                                                                                                                                                                                                         \
849         LOG_DEBUG(DATA, "%s", packet);                                                                                                                                                                  \
850 } while(0)
851 #else
852 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
853 #endif
854
855 /*
856  * Function to convert guest physical addresses to vhost virtual addresses. This
857  * is used to convert virtio buffer addresses.
858  */
859 static inline uint64_t __attribute__((always_inline))
860 gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
861 {
862         struct virtio_memory_regions *region;
863         uint32_t regionidx;
864         uint64_t vhost_va = 0;
865
866         for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
867                 region = &dev->mem->regions[regionidx];
868                 if ((guest_pa >= region->guest_phys_address) &&
869                         (guest_pa <= region->guest_phys_address_end)) {
870                         vhost_va = region->address_offset + guest_pa;
871                         break;
872                 }
873         }
874         LOG_DEBUG(DATA, "(%"PRIu64") GPA %p| VVA %p\n",
875                 dev->device_fh, (void*)(uintptr_t)guest_pa, (void*)(uintptr_t)vhost_va);
876
877         return vhost_va;
878 }
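/*
 * Each memory region records the offset between its guest-physical range and
 * the address at which vhost mapped that range, so the translation is a
 * single addition once the covering region is found; a guest physical
 * address outside every region yields 0.
 */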
879
880 /*
881  * Function to convert guest physical addresses to vhost physical addresses.
882  * This is used to convert virtio buffer addresses.
883  */
884 static inline uint64_t __attribute__((always_inline))
885 gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
886         uint32_t buf_len, hpa_type *addr_type)
887 {
888         struct virtio_memory_regions_hpa *region;
889         uint32_t regionidx;
890         uint64_t vhost_pa = 0;
891
892         *addr_type = PHYS_ADDR_INVALID;
893
894         for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) {
895                 region = &dev->mem->regions_hpa[regionidx];
896                 if ((guest_pa >= region->guest_phys_address) &&
897                         (guest_pa <= region->guest_phys_address_end)) {
898                         vhost_pa = region->host_phys_addr_offset + guest_pa;
899                         if (likely((guest_pa + buf_len - 1)
900                                 <= region->guest_phys_address_end))
901                                 *addr_type = PHYS_ADDR_CONTINUOUS;
902                         else
903                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
904                         break;
905                 }
906         }
907
908         LOG_DEBUG(DATA, "(%"PRIu64") GPA %p| HPA %p\n",
909                 dev->device_fh, (void *)(uintptr_t)guest_pa,
910                 (void *)(uintptr_t)vhost_pa);
911
912         return vhost_pa;
913 }
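/*
 * addr_type tells the zero-copy caller whether the translated buffer is
 * physically contiguous (PHYS_ADDR_CONTINUOUS), crosses a sub-region
 * boundary (PHYS_ADDR_CROSS_SUBREG) and so should not be handed to the NIC
 * as a single DMA target, or could not be translated at all
 * (PHYS_ADDR_INVALID).
 */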
914
915 /*
916  * This function adds buffers to the virtio devices RX virtqueue. Buffers can
917  * be received from the physical port or from another virtio device. A packet
918  * count is returned to indicate the number of packets that were successfully
919  * added to the RX queue.
920  */
921 static inline uint32_t __attribute__((always_inline))
922 virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
923 {
924         struct vhost_virtqueue *vq;
925         struct vring_desc *desc;
926         struct rte_mbuf *buff;
927         /* The virtio_hdr is initialised to 0. */
928         struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0};
929         uint64_t buff_addr = 0;
930         uint64_t buff_hdr_addr = 0;
931         uint32_t head[MAX_PKT_BURST], packet_len = 0;
932         uint32_t head_idx, packet_success = 0;
933         uint32_t mergeable, mrg_count = 0;
934         uint32_t retry = 0;
935         uint16_t avail_idx, res_cur_idx;
936         uint16_t res_base_idx, res_end_idx;
937         uint16_t free_entries;
938         uint8_t success = 0;
939
940         LOG_DEBUG(DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
941         vq = dev->virtqueue[VIRTIO_RXQ];
942         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
943         /* As many data cores may want access to available buffers, they need to be reserved. */
944         do {
945                 res_base_idx = vq->last_used_idx_res;
946                 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
947
948                 free_entries = (avail_idx - res_base_idx);
949                 /* If retry is enabled and the queue is full then we wait and retry to avoid packet loss. */
950                 if (enable_retry && unlikely(count > free_entries)) {
951                         for (retry = 0; retry < burst_rx_retry_num; retry++) {
952                                 rte_delay_us(burst_rx_delay_time);
953                                 avail_idx =
954                                         *((volatile uint16_t *)&vq->avail->idx);
955                                 free_entries = (avail_idx - res_base_idx);
956                                 if (count <= free_entries)
957                                         break;
958                         }
959                 }
960
961                 /*check that we have enough buffers*/
962                 if (unlikely(count > free_entries))
963                         count = free_entries;
964
965                 if (count == 0)
966                         return 0;
967
968                 res_end_idx = res_base_idx + count;
969                 /* vq->last_used_idx_res is atomically updated. */
970                 success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
971                                                                         res_end_idx);
972         } while (unlikely(success == 0));
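        /*
         * The compare-and-set above reserves the range [res_base_idx,
         * res_end_idx) for this core; other cores loop until they can claim
         * the entries after ours, so buffers can be filled in parallel while
         * the used ring is still published in order further below.
         */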
973         res_cur_idx = res_base_idx;
974         LOG_DEBUG(DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
975
976         /* Prefetch available ring to retrieve indexes. */
977         rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
978
979         /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
980         mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
981
982         /* Retrieve all of the head indexes first to avoid caching issues. */
983         for (head_idx = 0; head_idx < count; head_idx++)
984                 head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
985
986         /*Prefetch descriptor index. */
987         rte_prefetch0(&vq->desc[head[packet_success]]);
988
989         while (res_cur_idx != res_end_idx) {
990                 /* Get descriptor from available ring */
991                 desc = &vq->desc[head[packet_success]];
992
993                 buff = pkts[packet_success];
994
995                 /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
996                 buff_addr = gpa_to_vva(dev, desc->addr);
997                 /* Prefetch buffer address. */
998                 rte_prefetch0((void*)(uintptr_t)buff_addr);
999
1000                 if (mergeable && (mrg_count != 0)) {
1001                         desc->len = packet_len = rte_pktmbuf_data_len(buff);
1002                 } else {
1003                         /* Copy virtio_hdr to packet and increment buffer address */
1004                         buff_hdr_addr = buff_addr;
1005                         packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1006
1007                         /*
1008                          * If the descriptors are chained the header and data are placed in
1009                          * separate buffers.
1010                          */
1011                         if (desc->flags & VRING_DESC_F_NEXT) {
1012                                 desc->len = vq->vhost_hlen;
1013                                 desc = &vq->desc[desc->next];
1014                                 /* Buffer address translation. */
1015                                 buff_addr = gpa_to_vva(dev, desc->addr);
1016                                 desc->len = rte_pktmbuf_data_len(buff);
1017                         } else {
1018                                 buff_addr += vq->vhost_hlen;
1019                                 desc->len = packet_len;
1020                         }
1021                 }
1022
1023                 PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0);
1024
1025                 /* Update used ring with desc information */
1026                 vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
1027                 vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
1028
1029                 /* Copy mbuf data to buffer */
1030                 rte_memcpy((void *)(uintptr_t)buff_addr, (const void*)buff->pkt.data, rte_pktmbuf_data_len(buff));
1031
1032                 res_cur_idx++;
1033                 packet_success++;
1034
1035                 /* If mergeable is disabled then a header is required per buffer. */
1036                 if (!mergeable) {
1037                         rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
1038                         PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1039                 } else {
1040                         mrg_count++;
1041                         /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */
1042                         if ((mrg_count == MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
1043                                 virtio_hdr.num_buffers = mrg_count;
1044                                 LOG_DEBUG(DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
1045                                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen);
1046                                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1047                                 mrg_count = 0;
1048                         }
1049                 }
1050                 if (res_cur_idx < res_end_idx) {
1051                         /* Prefetch descriptor index. */
1052                         rte_prefetch0(&vq->desc[head[packet_success]]);
1053                 }
1054         }
1055
1056         rte_compiler_barrier();
1057
1058         /* Wait until it's our turn to add our buffer to the used ring. */
1059         while (unlikely(vq->last_used_idx != res_base_idx))
1060                 rte_pause();
1061
1062         *(volatile uint16_t *)&vq->used->idx += count;
1063         vq->last_used_idx = res_end_idx;
1064
1065         /* Kick the guest if necessary. */
1066         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1067                 eventfd_write((int)vq->kickfd, 1);
1068         return count;
1069 }
1070
1071 /*
1072  * Compares a packet destination MAC address to a device MAC address.
1073  */
1074 static inline int __attribute__((always_inline))
1075 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
1076 {
1077         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
1078 }
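/*
 * The compare loads 8 bytes from each 6-byte address and masks with
 * MAC_ADDR_CMP (0xFFFFFFFFFFFF) so that, on the little-endian targets this
 * example runs on, only the 48 address bits take part in the compare. This
 * assumes the two bytes following each ether_addr are readable, which holds
 * when the addresses sit inside the larger packet and device structures
 * used here.
 */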
1079
1080 /*
1081  * This function learns the MAC address of the device and registers this along with a
1082  * vlan tag to a VMDQ.
1083  */
1084 static int
1085 link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
1086 {
1087         struct ether_hdr *pkt_hdr;
1088         struct virtio_net_data_ll *dev_ll;
1089         int i, ret;
1090
1091         /* Learn MAC address of guest device from packet */
1092         pkt_hdr = (struct ether_hdr *)m->pkt.data;
1093
1094         dev_ll = ll_root_used;
1095
1096         while (dev_ll != NULL) {
1097                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) {
1098                         RTE_LOG(INFO, DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
1099                         return -1;
1100                 }
1101                 dev_ll = dev_ll->next;
1102         }
1103
1104         for (i = 0; i < ETHER_ADDR_LEN; i++)
1105                 dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
1106
1107         /* vlan_tag currently uses the device_id. */
1108         dev->vlan_tag = vlan_tags[dev->device_fh];
1109
1110         /* Print out VMDQ registration info. */
1111         RTE_LOG(INFO, DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
1112                 dev->device_fh,
1113                 dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
1114                 dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
1115                 dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
1116                 dev->vlan_tag);
1117
1118         /* Register the MAC address. */
1119         ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
1120         if (ret)
1121                 RTE_LOG(ERR, DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
1122                                         dev->device_fh);
1123
1124         /* Enable stripping of the vlan tag as we handle routing. */
1125         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1);
1126
1127         /* Set device as ready for RX. */
1128         dev->ready = DEVICE_RX;
1129
1130         return 0;
1131 }
1132
1133 /*
1134  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1135  * queue before disabling RX on the device.
1136  */
1137 static inline void
1138 unlink_vmdq(struct virtio_net *dev)
1139 {
1140         unsigned i = 0;
1141         unsigned rx_count;
1142         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1143
1144         if (dev->ready == DEVICE_RX) {
1145                 /*clear MAC and VLAN settings*/
1146                 rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
1147                 for (i = 0; i < 6; i++)
1148                         dev->mac_address.addr_bytes[i] = 0;
1149
1150                 dev->vlan_tag = 0;
1151
1152                 /*Clear out the receive buffers*/
1153                 rx_count = rte_eth_rx_burst(ports[0],
1154                                         (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1155
1156                 while (rx_count) {
1157                         for (i = 0; i < rx_count; i++)
1158                                 rte_pktmbuf_free(pkts_burst[i]);
1159
1160                         rx_count = rte_eth_rx_burst(ports[0],
1161                                         (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1162                 }
1163
1164                 dev->ready = DEVICE_MAC_LEARNING;
1165         }
1166 }
1167
1168 /*
1169  * Check if the packet destination MAC address is for a local device. If so then put
1170  * the packet on that device's RX queue. If not then return.
1171  */
1172 static inline unsigned __attribute__((always_inline))
1173 virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
1174 {
1175         struct virtio_net_data_ll *dev_ll;
1176         struct ether_hdr *pkt_hdr;
1177         uint64_t ret = 0;
1178
1179         pkt_hdr = (struct ether_hdr *)m->pkt.data;
1180
1181         /*get the used devices list*/
1182         dev_ll = ll_root_used;
1183
1184         while (dev_ll != NULL) {
1185                 if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1186                                           &dev_ll->dev->mac_address)) {
1187
1188                         /* Drop the packet if the TX packet is destined for the TX device. */
1189                         if (dev_ll->dev->device_fh == dev->device_fh) {
1190                                 LOG_DEBUG(DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1191                                                         dev_ll->dev->device_fh);
1192                                 return 0;
1193                         }
1194
1195
1196                         LOG_DEBUG(DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);
1197
1198                         if (dev_ll->dev->remove) {
1199                                 /*drop the packet if the device is marked for removal*/
1200                                 LOG_DEBUG(DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
1201                         } else {
1202                                 /*send the packet to the local virtio device*/
1203                                 ret = virtio_dev_rx(dev_ll->dev, &m, 1);
1204                                 if (enable_stats) {
1205                                         rte_atomic64_add(
1206                                         &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
1207                                         1);
1208                                         rte_atomic64_add(
1209                                         &dev_statistics[dev_ll->dev->device_fh].rx_atomic,
1210                                         ret);
1211                                         dev_statistics[dev->device_fh].tx_total++;
1212                                         dev_statistics[dev->device_fh].tx += ret;
1213                                 }
1214                         }
1215
1216                         return 0;
1217                 }
1218                 dev_ll = dev_ll->next;
1219         }
1220
1221         return -1;
1222 }
1223
1224 /*
1225  * This function routes the TX packet to the correct interface. This may be a local device
1226  * or the physical port.
1227  */
1228 static inline void __attribute__((always_inline))
1229 virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1230 {
1231         struct mbuf_table *tx_q;
1232         struct vlan_ethhdr *vlan_hdr;
1233         struct rte_mbuf **m_table;
1234         struct rte_mbuf *mbuf;
1235         unsigned len, ret, offset = 0;
1236         const uint16_t lcore_id = rte_lcore_id();
1237         struct virtio_net_data_ll *dev_ll = ll_root_used;
1238         struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data;
1239
1240         /*check if destination is local VM*/
1241         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
1242                 return;
1243
1244         if (vm2vm_mode == VM2VM_HARDWARE) {
1245                 while (dev_ll != NULL) {
1246                         if ((dev_ll->dev->ready == DEVICE_RX)
1247                                 && ether_addr_cmp(&(pkt_hdr->d_addr),
1248                                 &dev_ll->dev->mac_address)) {
1249                                 /*
1250                                  * Drop the packet if the TX packet is
1251                                  * destined for the TX device.
1252                                  */
1253                                 if (dev_ll->dev->device_fh == dev->device_fh) {
1254                                         LOG_DEBUG(DATA,
1255                                         "(%"PRIu64") TX: Source and destination"
1256                                         " MAC addresses are the same. Dropping "
1257                                         "packet.\n",
1258                                         dev_ll->dev->device_fh);
1259                                         return;
1260                                 }
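                                /*
                                 * Reserve 4 extra bytes for the VLAN tag that
                                 * the hardware strips when the packet is L2
                                 * switched back to a local VM (see the same
                                 * handling in virtio_tx_route_zcp()).
                                 */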
1261                                 offset = 4;
1262                                 vlan_tag =
1263                                 (uint16_t)
1264                                 vlan_tags[(uint16_t)dev_ll->dev->device_fh];
1265
1266                                 LOG_DEBUG(DATA,
1267                                 "(%"PRIu64") TX: pkt to local VM device id:"
1268                                 "(%"PRIu64") vlan tag: %d.\n",
1269                                 dev->device_fh, dev_ll->dev->device_fh,
1270                                 vlan_tag);
1271
1272                                 break;
1273                         }
1274                         dev_ll = dev_ll->next;
1275                 }
1276         }
1277
1278         LOG_DEBUG(DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1279
1280         /*Add packet to the port tx queue*/
1281         tx_q = &lcore_tx_queue[lcore_id];
1282         len = tx_q->len;
1283
1284         /* Allocate an mbuf and populate the structure. */
1285         mbuf = rte_pktmbuf_alloc(mbuf_pool);
1286         if (unlikely(mbuf == NULL)) {
1287                 RTE_LOG(ERR, DATA, "Failed to allocate memory for mbuf.\n");
1288                 return;
1289         }
1290
1291         mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset;
1292         mbuf->pkt.pkt_len = mbuf->pkt.data_len;
1293
1294         /* Copy ethernet header to mbuf. */
1295         rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, ETH_HLEN);
1296
1297
1298         /* Set up the VLAN header. Multi-byte fields are converted to network byte order with htons(). */
1299         vlan_hdr = (struct vlan_ethhdr *) mbuf->pkt.data;
1300         vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1301         vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1302         vlan_hdr->h_vlan_TCI = htons(vlan_tag);
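        /*
         * Resulting 802.1Q frame layout:
         *   dst MAC | src MAC | 0x8100 | TCI (PCP/DEI/VID) | original
         *   ethertype | payload
         */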
1303
1304         /* Copy the remaining packet contents to the mbuf. */
1305         rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN),
1306                 (const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN));
1307         tx_q->m_table[len] = mbuf;
1308         len++;
1309         if (enable_stats) {
1310                 dev_statistics[dev->device_fh].tx_total++;
1311                 dev_statistics[dev->device_fh].tx++;
1312         }
1313
1314         if (unlikely(len == MAX_PKT_BURST)) {
1315                 m_table = (struct rte_mbuf **)tx_q->m_table;
1316                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1317                 /* Free any buffers not handled by TX and update the port stats. */
1318                 if (unlikely(ret < len)) {
1319                         do {
1320                                 rte_pktmbuf_free(m_table[ret]);
1321                         } while (++ret < len);
1322                 }
1323
1324                 len = 0;
1325         }
1326
1327         tx_q->len = len;
1328         return;
1329 }
1330
1331 static inline void __attribute__((always_inline))
1332 virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
1333 {
1334         struct rte_mbuf m;
1335         struct vhost_virtqueue *vq;
1336         struct vring_desc *desc;
1337         uint64_t buff_addr = 0;
1338         uint32_t head[MAX_PKT_BURST];
1339         uint32_t used_idx;
1340         uint32_t i;
1341         uint16_t free_entries, packet_success = 0;
1342         uint16_t avail_idx;
1343
1344         vq = dev->virtqueue[VIRTIO_TXQ];
1345         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1346
1347         /* If there are no available buffers then return. */
1348         if (vq->last_used_idx == avail_idx)
1349                 return;
1350
1351         LOG_DEBUG(DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1352
1353         /* Prefetch available ring to retrieve head indexes. */
1354         rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
1355
1356         /*get the number of free entries in the ring*/
1357         free_entries = (avail_idx - vq->last_used_idx);
1358
1359         /* Limit to MAX_PKT_BURST. */
1360         if (free_entries > MAX_PKT_BURST)
1361                 free_entries = MAX_PKT_BURST;
1362
1363         LOG_DEBUG(DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
1364         /* Retrieve all of the head indexes first to avoid caching issues. */
1365         for (i = 0; i < free_entries; i++)
1366                 head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
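        /*
         * Note: the virtio ring size is a power of two, so masking with
         * (vq->size - 1) implements the ring index wrap-around.
         */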
1367
1368         /* Prefetch descriptor index. */
1369         rte_prefetch0(&vq->desc[head[packet_success]]);
1370         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1371
1372         while (packet_success < free_entries) {
1373                 desc = &vq->desc[head[packet_success]];
1374
1375                 /* Discard first buffer as it is the virtio header */
1376                 desc = &vq->desc[desc->next];
1377
1378                 /* Buffer address translation. */
1379                 buff_addr = gpa_to_vva(dev, desc->addr);
1380                 /* Prefetch buffer address. */
1381                 rte_prefetch0((void*)(uintptr_t)buff_addr);
1382
1383                 used_idx = vq->last_used_idx & (vq->size - 1);
1384
1385                 if (packet_success < (free_entries - 1)) {
1386                         /* Prefetch descriptor index. */
1387                         rte_prefetch0(&vq->desc[head[packet_success+1]]);
1388                         rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
1389                 }
1390
1391                 /* Update used index buffer information. */
1392                 vq->used->ring[used_idx].id = head[packet_success];
1393                 vq->used->ring[used_idx].len = 0;
1394
1395                 /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
1396                 m.pkt.data_len = desc->len;
1397                 m.pkt.data = (void*)(uintptr_t)buff_addr;
1398
1399                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1400
1401                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1402                 if (dev->ready == DEVICE_MAC_LEARNING) {
1403                         if (dev->remove || (link_vmdq(dev, &m) == -1)) {
1404                                 /*discard frame if device is scheduled for removal or a duplicate MAC address is found. */
1405                                 packet_success += free_entries;
1406                                 vq->last_used_idx += packet_success;
1407                                 break;
1408                         }
1409                 }
1410                 virtio_tx_route(dev, &m, mbuf_pool, (uint16_t)dev->device_fh);
1411
1412                 vq->last_used_idx++;
1413                 packet_success++;
1414         }
1415
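        /*
         * The compiler barrier ensures the used ring entries written above
         * are visible before the new used index is published to the guest.
         */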
1416         rte_compiler_barrier();
1417         vq->used->idx += packet_success;
1418         /* Kick guest if required. */
1419         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1420                 eventfd_write((int)vq->kickfd, 1);
1421 }
1422
1423 /*
1424  * This function is called by each data core. It handles all RX/TX for the devices
1425  * registered on the core via the per-lcore linked list. When routing guest TX packets,
1426  * destination MAC addresses are compared against all devices in the main linked list.
1427  */
1428 static int
1429 switch_worker(void *arg)
1430 {
1431         struct rte_mempool *mbuf_pool = arg;
1432         struct virtio_net *dev = NULL;
1433         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1434         struct virtio_net_data_ll *dev_ll;
1435         struct mbuf_table *tx_q;
1436         volatile struct lcore_ll_info *lcore_ll;
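        /*
         * drain_tsc below is the number of TSC cycles in BURST_TX_DRAIN_US
         * microseconds. For example (hypothetical numbers), a 2 GHz TSC and a
         * 100 us drain interval give roughly 200,000 cycles between forced
         * TX queue drains.
         */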
1437         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1438         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1439         unsigned ret, i;
1440         const uint16_t lcore_id = rte_lcore_id();
1441         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1442         uint16_t rx_count = 0;
1443
1444         RTE_LOG(INFO, DATA, "Processing on Core %u started\n", lcore_id);
1445         lcore_ll = lcore_info[lcore_id].lcore_ll;
1446         prev_tsc = 0;
1447
1448         tx_q = &lcore_tx_queue[lcore_id];
1449         for (i = 0; i < num_cores; i ++) {
1450                 if (lcore_ids[i] == lcore_id) {
1451                         tx_q->txq_id = i;
1452                         break;
1453                 }
1454         }
1455
1456         while(1) {
1457                 cur_tsc = rte_rdtsc();
1458                 /*
1459                  * TX burst queue drain
1460                  */
1461                 diff_tsc = cur_tsc - prev_tsc;
1462                 if (unlikely(diff_tsc > drain_tsc)) {
1463
1464                         if (tx_q->len) {
1465                                 LOG_DEBUG(DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);
1466
1467                                 /*Tx any packets in the queue*/
1468                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1469                                                                            (struct rte_mbuf **)tx_q->m_table,
1470                                                                            (uint16_t)tx_q->len);
1471                                 if (unlikely(ret < tx_q->len)) {
1472                                         do {
1473                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1474                                         } while (++ret < tx_q->len);
1475                                 }
1476
1477                                 tx_q->len = 0;
1478                         }
1479
1480                         prev_tsc = cur_tsc;
1481
1482                 }
1483
1484                 rte_prefetch0(lcore_ll->ll_root_used);
1485                 /*
1486                  * If requested, inform the configuration core that we have exited the linked
1487                  * list and that no devices are in use on this core.
1488                  */
1489                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1490                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1491
1492                 /*
1493                  * Process devices
1494                  */
1495                 dev_ll = lcore_ll->ll_root_used;
1496
1497                 while (dev_ll != NULL) {
1498                         /*get virtio device ID*/
1499                         dev = dev_ll->dev;
1500
1501                         if (dev->remove) {
1502                                 dev_ll = dev_ll->next;
1503                                 unlink_vmdq(dev);
1504                                 dev->ready = DEVICE_SAFE_REMOVE;
1505                                 continue;
1506                         }
1507                         if (likely(dev->ready == DEVICE_RX)) {
1508                                 /*Handle guest RX*/
1509                                 rx_count = rte_eth_rx_burst(ports[0],
1510                                         (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1511
1512                                 if (rx_count) {
1513                                         ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
1514                                         if (enable_stats) {
1515                                                 rte_atomic64_add(
1516                                                 &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
1517                                                 rx_count);
1518                                                 rte_atomic64_add(
1519                                                 &dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count);
1520                                         }
1521                                         while (likely(rx_count)) {
1522                                                 rx_count--;
1523                                                 rte_pktmbuf_free_seg(pkts_burst[rx_count]);
1524                                         }
1525
1526                                 }
1527                         }
1528
1529                         if (!dev->remove)
1530                                 /*Handle guest TX*/
1531                                 virtio_dev_tx(dev, mbuf_pool);
1532
1533                         /*move to the next device in the list*/
1534                         dev_ll = dev_ll->next;
1535                 }
1536         }
1537
1538         return 0;
1539 }
1540
1541 /*
1542  * This function gets the number of available ring entries for zero copy RX.
1543  * Only one thread will call this function for a particular virtio device,
1544  * so it is designed as a non-thread-safe function.
1545  */
1546 static inline uint32_t __attribute__((always_inline))
1547 get_available_ring_num_zcp(struct virtio_net *dev)
1548 {
1549         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1550         uint16_t avail_idx;
1551
1552         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1553         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1554 }
1555
1556 /*
1557  * This function gets available ring indexes for zero copy RX; it will retry
1558  * up to 'burst_rx_retry_num' times until it gets enough ring entries.
1559  * Only one thread will call this function for a particular virtio device,
1560  * so it is designed as a non-thread-safe function.
1561  */
1562 static inline uint32_t __attribute__((always_inline))
1563 get_available_ring_index_zcp(struct virtio_net *dev,
1564         uint16_t *res_base_idx, uint32_t count)
1565 {
1566         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1567         uint16_t avail_idx;
1568         uint32_t retry = 0;
1569         uint16_t free_entries;
1570
1571         *res_base_idx = vq->last_used_idx_res;
1572         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1573         free_entries = (avail_idx - *res_base_idx);
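        /*
         * free_entries is a uint16_t, so the subtraction remains correct
         * across wrap-around of the 16-bit ring indexes.
         */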
1574
1575         LOG_DEBUG(DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1576                         "avail idx: %d, "
1577                         "res base idx:%d, free entries:%d\n",
1578                         dev->device_fh, avail_idx, *res_base_idx,
1579                         free_entries);
1580
1581         /*
1582          * If retry is enabled and the queue is full then we wait
1583          * and retry to avoid packet loss.
1584          */
1585         if (enable_retry && unlikely(count > free_entries)) {
1586                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1587                         rte_delay_us(burst_rx_delay_time);
1588                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1589                         free_entries = (avail_idx - *res_base_idx);
1590                         if (count <= free_entries)
1591                                 break;
1592                 }
1593         }
1594
1595         /*check that we have enough buffers*/
1596         if (unlikely(count > free_entries))
1597                 count = free_entries;
1598
1599         if (unlikely(count == 0)) {
1600                 LOG_DEBUG(DATA,
1601                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1602                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1603                         dev->device_fh, avail_idx,
1604                         *res_base_idx, free_entries);
1605                 return 0;
1606         }
1607
1608         vq->last_used_idx_res = *res_base_idx + count;
1609
1610         return count;
1611 }
1612
1613 /*
1614  * This function puts a descriptor back on the used list.
1615  */
1616 static inline void __attribute__((always_inline))
1617 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1618 {
1619         uint16_t res_cur_idx = vq->last_used_idx;
1620         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1621         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1622         rte_compiler_barrier();
1623         *(volatile uint16_t *)&vq->used->idx += 1;
1624         vq->last_used_idx += 1;
1625
1626         /* Kick the guest if necessary. */
1627         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1628                 eventfd_write((int)vq->kickfd, 1);
1629 }
1630
1631 /*
1632  * This function gets an available descriptor from the virtio vring and an
1633  * unattached mbuf from vpool->ring, then attaches them together. It must adjust
1634  * the offset of buff_addr and phys_addr according to the PMD implementation,
1635  * otherwise the frame data may be placed at the wrong location in the mbuf.
1636  */
1637 static inline void __attribute__((always_inline))
1638 attach_rxmbuf_zcp(struct virtio_net *dev)
1639 {
1640         uint16_t res_base_idx, desc_idx;
1641         uint64_t buff_addr, phys_addr;
1642         struct vhost_virtqueue *vq;
1643         struct vring_desc *desc;
1644         struct rte_mbuf *mbuf = NULL;
1645         struct vpool *vpool;
1646         hpa_type addr_type;
1647
1648         vpool = &vpool_array[dev->vmdq_rx_q];
1649         vq = dev->virtqueue[VIRTIO_RXQ];
1650
1651         do {
1652                 if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx,
1653                                 1) != 1))
1654                         return;
1655                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1656
1657                 desc = &vq->desc[desc_idx];
1658                 if (desc->flags & VRING_DESC_F_NEXT) {
1659                         desc = &vq->desc[desc->next];
1660                         buff_addr = gpa_to_vva(dev, desc->addr);
1661                         phys_addr = gpa_to_hpa(dev, desc->addr, desc->len,
1662                                         &addr_type);
1663                 } else {
1664                         buff_addr = gpa_to_vva(dev,
1665                                         desc->addr + vq->vhost_hlen);
1666                         phys_addr = gpa_to_hpa(dev,
1667                                         desc->addr + vq->vhost_hlen,
1668                                         desc->len, &addr_type);
1669                 }
1670
1671                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1672                         RTE_LOG(ERR, DATA, "(%"PRIu64") Invalid frame buffer"
1673                                 " address found when attaching RX frame buffer"
1674                                 " address!\n", dev->device_fh);
1675                         put_desc_to_used_list_zcp(vq, desc_idx);
1676                         continue;
1677                 }
1678
1679                 /*
1680                  * Check if the frame buffer address from guest crosses
1681                  * sub-region or not.
1682                  */
1683                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1684                         RTE_LOG(ERR, DATA,
1685                                 "(%"PRIu64") Frame buffer address crosses a "
1686                                 "sub-region boundary when attaching RX frame "
1687                                 "buffer address!\n",
1688                                 dev->device_fh);
1689                         put_desc_to_used_list_zcp(vq, desc_idx);
1690                         continue;
1691                 }
1692         } while (unlikely(phys_addr == 0));
1693
1694         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1695         if (unlikely(mbuf == NULL)) {
1696                 LOG_DEBUG(DATA,
1697                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1698                         "ring_sc_dequeue fail.\n",
1699                         dev->device_fh);
1700                 put_desc_to_used_list_zcp(vq, desc_idx);
1701                 return;
1702         }
1703
1704         if (unlikely(vpool->buf_size > desc->len)) {
1705                 LOG_DEBUG(DATA,
1706                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1707                         "length(%d) of descriptor idx: %d less than room "
1708                         "size required: %d\n",
1709                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1710                 put_desc_to_used_list_zcp(vq, desc_idx);
1711                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1712                 return;
1713         }
1714
1715         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1716         mbuf->pkt.data = (void *)(uintptr_t)(buff_addr);
1717         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1718         mbuf->pkt.data_len = desc->len;
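        /*
         * Stash the descriptor index in the mbuf headroom so it can be
         * recovered later (e.g. by virtio_dev_rx_zcp() or txmbuf_clean_zcp())
         * when the descriptor is returned to the used ring.
         */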
1719         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1720
1721         LOG_DEBUG(DATA,
1722                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1723                 "descriptor idx:%d\n",
1724                 dev->device_fh, res_base_idx, desc_idx);
1725
1726         __rte_mbuf_raw_free(mbuf);
1727
1728         return;
1729 }
1730
1731 /*
1732  * Detach an attached packet mbuf:
1733  *  - restore original mbuf address and length values.
1734  *  - reset pktmbuf data and data_len to their default values.
1735  *  All other fields of the given packet mbuf will be left intact.
1736  *
1737  * @param m
1738  *   The attached packet mbuf.
1739  */
1740 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1741 {
1742         const struct rte_mempool *mp = m->pool;
1743         void *buf = RTE_MBUF_TO_BADDR(m);
1744         uint32_t buf_ofs;
1745         uint32_t buf_len = mp->elt_size - sizeof(*m);
1746         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1747
1748         m->buf_addr = buf;
1749         m->buf_len = (uint16_t)buf_len;
1750
1751         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1752                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1753         m->pkt.data = (char *) m->buf_addr + buf_ofs;
1754
1755         m->pkt.data_len = 0;
1756 }
1757
1758 /*
1759  * This function is called after packets have been transmitted. It fetches mbufs
1760  * from vpool->pool, detaches them and puts them back into vpool->ring. It also
1761  * updates the used index and kicks the guest if necessary.
1762  */
1763 static inline uint32_t __attribute__((always_inline))
1764 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1765 {
1766         struct rte_mbuf *mbuf;
1767         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1768         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1769         uint32_t index = 0;
1770         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1771
1772         LOG_DEBUG(DATA,
1773                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1774                 "clean is: %d\n",
1775                 dev->device_fh, mbuf_count);
1776         LOG_DEBUG(DATA,
1777                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1778                 "clean is: %d\n",
1779                 dev->device_fh, rte_ring_count(vpool->ring));
1780
1781         for (index = 0; index < mbuf_count; index++) {
1782                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1783                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1784                         pktmbuf_detach_zcp(mbuf);
1785                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1786
1787                 /* Update used index buffer information. */
1788                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1789                 vq->used->ring[used_idx].len = 0;
1790
1791                 used_idx = (used_idx + 1) & (vq->size - 1);
1792         }
1793
1794         LOG_DEBUG(DATA,
1795                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1796                 "clean is: %d\n",
1797                 dev->device_fh, rte_mempool_count(vpool->pool));
1798         LOG_DEBUG(DATA,
1799                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1800                 "clean is: %d\n",
1801                 dev->device_fh, rte_ring_count(vpool->ring));
1802         LOG_DEBUG(DATA,
1803                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1804                 "vq->last_used_idx:%d\n",
1805                 dev->device_fh, vq->last_used_idx);
1806
1807         vq->last_used_idx += mbuf_count;
1808
1809         LOG_DEBUG(DATA,
1810                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1811                 "vq->last_used_idx:%d\n",
1812                 dev->device_fh, vq->last_used_idx);
1813
1814         rte_compiler_barrier();
1815
1816         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1817
1818         /* Kick guest if required. */
1819         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1820                 eventfd_write((int)vq->kickfd, 1);
1821
1822         return 0;
1823 }
1824
1825 /*
1826  * This function is called when a virtio device is destroyed.
1827  * It fetches mbufs from vpool->pool, detaches them, and puts them back into vpool->ring.
1828  */
1829 static void mbuf_destroy_zcp(struct vpool *vpool)
1830 {
1831         struct rte_mbuf *mbuf = NULL;
1832         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1833
1834         LOG_DEBUG(CONFIG,
1835                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1836                 "mbuf_destroy_zcp is: %d\n",
1837                 mbuf_count);
1838         LOG_DEBUG(CONFIG,
1839                 "in mbuf_destroy_zcp: mbuf count in ring before "
1840                 "mbuf_destroy_zcp is: %d\n",
1841                 rte_ring_count(vpool->ring));
1842
1843         for (index = 0; index < mbuf_count; index++) {
1844                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1845                 if (likely(mbuf != NULL)) {
1846                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1847                                 pktmbuf_detach_zcp(mbuf);
1848                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1849                 }
1850         }
1851
1852         LOG_DEBUG(CONFIG,
1853                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1854                 "mbuf_destroy_zcp is: %d\n",
1855                 rte_mempool_count(vpool->pool));
1856         LOG_DEBUG(CONFIG,
1857                 "in mbuf_destroy_zcp: mbuf count in ring after "
1858                 "mbuf_destroy_zcp is: %d\n",
1859                 rte_ring_count(vpool->ring));
1860 }
1861
1862 /*
1863  * This function enqueues received packets to the guest virtio RX ring for zero copy and updates the used ring and index.
1864  */
1865 static inline uint32_t __attribute__((always_inline))
1866 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1867         uint32_t count)
1868 {
1869         struct vhost_virtqueue *vq;
1870         struct vring_desc *desc;
1871         struct rte_mbuf *buff;
1872         /* The virtio_hdr is initialised to 0. */
1873         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1874                 = {{0, 0, 0, 0, 0, 0}, 0};
1875         uint64_t buff_hdr_addr = 0;
1876         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1877         uint32_t head_idx, packet_success = 0;
1878         uint16_t res_cur_idx;
1879
1880         LOG_DEBUG(DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1881
1882         if (count == 0)
1883                 return 0;
1884
1885         vq = dev->virtqueue[VIRTIO_RXQ];
1886         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1887
1888         res_cur_idx = vq->last_used_idx;
1889         LOG_DEBUG(DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1890                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1891
1892         /* Retrieve all of the head indexes first to avoid caching issues. */
1893         for (head_idx = 0; head_idx < count; head_idx++)
1894                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1895
1896         /*Prefetch descriptor index. */
1897         rte_prefetch0(&vq->desc[head[packet_success]]);
1898
1899         while (packet_success != count) {
1900                 /* Get descriptor from available ring */
1901                 desc = &vq->desc[head[packet_success]];
1902
1903                 buff = pkts[packet_success];
1904                 LOG_DEBUG(DATA,
1905                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1906                         "pkt[%d] descriptor idx: %d\n",
1907                         dev->device_fh, packet_success,
1908                         MBUF_HEADROOM_UINT32(buff));
1909
1910                 PRINT_PACKET(dev,
1911                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1912                         + RTE_PKTMBUF_HEADROOM),
1913                         rte_pktmbuf_data_len(buff), 0);
1914
1915                 /* Buffer address translation for virtio header. */
1916                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1917                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1918
1919                 /*
1920                  * If the descriptors are chained the header and data are
1921                  * placed in separate buffers.
1922                  */
1923                 if (desc->flags & VRING_DESC_F_NEXT) {
1924                         desc->len = vq->vhost_hlen;
1925                         desc = &vq->desc[desc->next];
1926                         desc->len = rte_pktmbuf_data_len(buff);
1927                 } else {
1928                         desc->len = packet_len;
1929                 }
1930
1931                 /* Update used ring with desc information */
1932                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1933                         = head[packet_success];
1934                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1935                         = packet_len;
1936                 res_cur_idx++;
1937                 packet_success++;
1938
1939                 /* A header is required per buffer. */
1940                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1941                         (const void *)&virtio_hdr, vq->vhost_hlen);
1942
1943                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1944
1945                 if (likely(packet_success < count)) {
1946                         /* Prefetch descriptor index. */
1947                         rte_prefetch0(&vq->desc[head[packet_success]]);
1948                 }
1949         }
1950
1951         rte_compiler_barrier();
1952
1953         LOG_DEBUG(DATA,
1954                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1955                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1956                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1957
1958         *(volatile uint16_t *)&vq->used->idx += count;
1959         vq->last_used_idx += count;
1960
1961         LOG_DEBUG(DATA,
1962                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1963                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1964                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1965
1966         /* Kick the guest if necessary. */
1967         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1968                 eventfd_write((int)vq->kickfd, 1);
1969
1970         return count;
1971 }
1972
1973 /*
1974  * This function routes the TX packet to the correct interface.
1975  * This may be a local device or the physical port.
1976  */
1977 static inline void __attribute__((always_inline))
1978 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1979         uint32_t desc_idx, uint8_t need_copy)
1980 {
1981         struct mbuf_table *tx_q;
1982         struct rte_mbuf **m_table;
1983         struct rte_mbuf *mbuf = NULL;
1984         unsigned len, ret, offset = 0;
1985         struct vpool *vpool;
1986         struct virtio_net_data_ll *dev_ll = ll_root_used;
1987         struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data;
1988         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1989
1990         /*Add packet to the port tx queue*/
1991         tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
1992         len = tx_q->len;
1993
1994         /* Allocate an mbuf and populate the structure. */
1995         vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
1996         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1997         if (unlikely(mbuf == NULL)) {
1998                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1999                 RTE_LOG(ERR, DATA,
2000                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
2001                         dev->device_fh);
2002                 put_desc_to_used_list_zcp(vq, desc_idx);
2003                 return;
2004         }
2005
2006         if (vm2vm_mode == VM2VM_HARDWARE) {
2007                 /* Avoid using a VLAN tag from any VM for an external packet, such
2008                  * as vlan_tags[dev->device_fh]; otherwise it conflicts with pool
2009                  * selection: the MAC address identifies it as an external packet
2010                  * that should go to the network, while the VLAN tag identifies it
2011                  * as a VM-to-VM packet that should be forwarded to another VM. The
2012                  * hardware cannot resolve this ambiguity, so the packet would be lost.
2013                  */
2014                 vlan_tag = external_pkt_default_vlan_tag;
2015                 while (dev_ll != NULL) {
2016                         if (likely(dev_ll->dev->ready == DEVICE_RX) &&
2017                                 ether_addr_cmp(&(pkt_hdr->d_addr),
2018                                 &dev_ll->dev->mac_address)) {
2019
2020                                 /*
2021                                  * Drop the packet if the TX packet is destined
2022                                  * for the TX device.
2023                                  */
2024                                 if (unlikely(dev_ll->dev->device_fh
2025                                         == dev->device_fh)) {
2026                                         LOG_DEBUG(DATA,
2027                                         "(%"PRIu64") TX: Source and destination"
2028                                         " MAC addresses are the same. Dropping "
2029                                         "packet.\n",
2030                                         dev_ll->dev->device_fh);
2031                                         MBUF_HEADROOM_UINT32(mbuf)
2032                                                 = (uint32_t)desc_idx;
2033                                         __rte_mbuf_raw_free(mbuf);
2034                                         return;
2035                                 }
2036
2037                                 /*
2038                                  * Offset the packet length by 4 bytes for the
2039                                  * VLAN tag stripped by HW when L2-switched back.
2040                                  */
2041                                 offset = 4;
2042                                 vlan_tag =
2043                                 (uint16_t)
2044                                 vlan_tags[(uint16_t)dev_ll->dev->device_fh];
2045
2046                                 LOG_DEBUG(DATA,
2047                                 "(%"PRIu64") TX: pkt to local VM device id:"
2048                                 "(%"PRIu64") vlan tag: %d.\n",
2049                                 dev->device_fh, dev_ll->dev->device_fh,
2050                                 vlan_tag);
2051
2052                                 break;
2053                         }
2054                         dev_ll = dev_ll->next;
2055                 }
2056         }
2057
2058         mbuf->pkt.nb_segs = m->pkt.nb_segs;
2059         mbuf->pkt.next = m->pkt.next;
2060         mbuf->pkt.data_len = m->pkt.data_len + offset;
2061         mbuf->pkt.pkt_len = mbuf->pkt.data_len;
2062         if (unlikely(need_copy)) {
2063                 /* Copy the packet contents to the mbuf. */
2064                 rte_memcpy((void *)((uint8_t *)mbuf->pkt.data),
2065                         (const void *) ((uint8_t *)m->pkt.data),
2066                         m->pkt.data_len);
2067         } else {
2068                 mbuf->pkt.data = m->pkt.data;
2069                 mbuf->buf_physaddr = m->buf_physaddr;
2070                 mbuf->buf_addr = m->buf_addr;
2071         }
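        /*
         * Request hardware VLAN tag insertion on TX; the PMD takes the tag
         * from the vlan_tci field set below.
         */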
2072         mbuf->ol_flags = PKT_TX_VLAN_PKT;
2073         mbuf->pkt.vlan_macip.f.vlan_tci = vlan_tag;
2074         mbuf->pkt.vlan_macip.f.l2_len = sizeof(struct ether_hdr);
2075         mbuf->pkt.vlan_macip.f.l3_len = sizeof(struct ipv4_hdr);
2076         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2077
2078         tx_q->m_table[len] = mbuf;
2079         len++;
2080
2081         LOG_DEBUG(DATA,
2082                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
2083                 dev->device_fh,
2084                 mbuf->pkt.nb_segs,
2085                 (mbuf->pkt.next == NULL) ? "null" : "non-null");
2086
2087         if (enable_stats) {
2088                 dev_statistics[dev->device_fh].tx_total++;
2089                 dev_statistics[dev->device_fh].tx++;
2090         }
2091
2092         if (unlikely(len == MAX_PKT_BURST)) {
2093                 m_table = (struct rte_mbuf **)tx_q->m_table;
2094                 ret = rte_eth_tx_burst(ports[0],
2095                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
2096
2097                 /*
2098                  * Free any buffers not handled by TX and update
2099                  * the port stats.
2100                  */
2101                 if (unlikely(ret < len)) {
2102                         do {
2103                                 rte_pktmbuf_free(m_table[ret]);
2104                         } while (++ret < len);
2105                 }
2106
2107                 len = 0;
2108                 txmbuf_clean_zcp(dev, vpool);
2109         }
2110
2111         tx_q->len = len;
2112
2113         return;
2114 }
2115
2116 /*
2117  * This function transmits all available packets in the virtio TX queue for one
2118  * virtio-net device. If it is the first packet, it learns the MAC address and
2119  * sets up VMDQ.
2120  */
2121 static inline void __attribute__((always_inline))
2122 virtio_dev_tx_zcp(struct virtio_net *dev)
2123 {
2124         struct rte_mbuf m;
2125         struct vhost_virtqueue *vq;
2126         struct vring_desc *desc;
2127         uint64_t buff_addr = 0, phys_addr;
2128         uint32_t head[MAX_PKT_BURST];
2129         uint32_t i;
2130         uint16_t free_entries, packet_success = 0;
2131         uint16_t avail_idx;
2132         uint8_t need_copy = 0;
2133         hpa_type addr_type;
2134
2135         vq = dev->virtqueue[VIRTIO_TXQ];
2136         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
2137
2138         /* If there are no available buffers then return. */
2139         if (vq->last_used_idx_res == avail_idx)
2140                 return;
2141
2142         LOG_DEBUG(DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
2143
2144         /* Prefetch available ring to retrieve head indexes. */
2145         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
2146
2147         /* Get the number of free entries in the ring */
2148         free_entries = (avail_idx - vq->last_used_idx_res);
2149
2150         /* Limit to MAX_PKT_BURST. */
2151         free_entries
2152                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2153
2154         LOG_DEBUG(DATA, "(%"PRIu64") Buffers available %d\n",
2155                 dev->device_fh, free_entries);
2156
2157         /* Retrieve all of the head indexes first to avoid caching issues. */
2158         for (i = 0; i < free_entries; i++)
2159                 head[i]
2160                         = vq->avail->ring[(vq->last_used_idx_res + i)
2161                         & (vq->size - 1)];
2162
2163         vq->last_used_idx_res += free_entries;
2164
2165         /* Prefetch descriptor index. */
2166         rte_prefetch0(&vq->desc[head[packet_success]]);
2167         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2168
2169         while (packet_success < free_entries) {
2170                 desc = &vq->desc[head[packet_success]];
2171
2172                 /* Discard first buffer as it is the virtio header */
2173                 desc = &vq->desc[desc->next];
2174
2175                 /* Buffer address translation. */
2176                 buff_addr = gpa_to_vva(dev, desc->addr);
2177                 phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);
2178
2179                 if (likely(packet_success < (free_entries - 1)))
2180                         /* Prefetch descriptor index. */
2181                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2182
2183                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2184                         RTE_LOG(ERR, DATA,
2185                                 "(%"PRIu64") Invalid frame buffer address found "
2186                                 "when transmitting packets!\n",
2187                                 dev->device_fh);
2188                         packet_success++;
2189                         continue;
2190                 }
2191
2192                 /* Prefetch buffer address. */
2193                 rte_prefetch0((void *)(uintptr_t)buff_addr);
2194
2195                 /*
2196                  * Setup dummy mbuf. This is copied to a real mbuf if
2197                  * transmitted out the physical port.
2198                  */
2199                 m.pkt.data_len = desc->len;
2200                 m.pkt.nb_segs = 1;
2201                 m.pkt.next = NULL;
2202                 m.pkt.data = (void *)(uintptr_t)buff_addr;
2203                 m.buf_addr = m.pkt.data;
2204                 m.buf_physaddr = phys_addr;
2205
2206                 /*
2207                  * Check if the frame buffer address from guest crosses
2208                  * sub-region or not.
2209                  */
2210                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2211                         RTE_LOG(ERR, DATA,
2212                                 "(%"PRIu64") Frame buffer address crosses a "
2213                                 "sub-region boundary when attaching TX frame "
2214                                 "buffer address!\n",
2215                                 dev->device_fh);
2216                         need_copy = 1;
2217                 } else
2218                         need_copy = 0;
2219
2220                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2221
2222                 /*
2223                  * If this is the first received packet we need to learn
2224                  * the MAC and setup VMDQ
2225                  */
2226                 if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) {
2227                         if (dev->remove || (link_vmdq(dev, &m) == -1)) {
2228                                 /*
2229                                  * Discard frame if device is scheduled for
2230                                  * removal or a duplicate MAC address is found.
2231                                  */
2232                                 packet_success += free_entries;
2233                                 vq->last_used_idx += packet_success;
2234                                 break;
2235                         }
2236                 }
2237
2238                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2239                 packet_success++;
2240         }
2241 }
2242
2243 /*
2244  * This function is called by each data core. It handles all RX/TX for the
2245  * devices registered on the core via the per-lcore linked list. When routing
2246  * guest TX packets, MAC addresses are compared against the main linked list.
2247  */
2248 static int
2249 switch_worker_zcp(__attribute__((unused)) void *arg)
2250 {
2251         struct virtio_net *dev = NULL;
2252         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2253         struct virtio_net_data_ll *dev_ll;
2254         struct mbuf_table *tx_q;
2255         volatile struct lcore_ll_info *lcore_ll;
2256         const uint64_t drain_tsc
2257                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2258                 * BURST_TX_DRAIN_US;
2259         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2260         unsigned ret;
2261         const uint16_t lcore_id = rte_lcore_id();
2262         uint16_t count_in_ring, rx_count = 0;
2263
2264         RTE_LOG(INFO, DATA, "Processing on Core %u started\n", lcore_id);
2265
2266         lcore_ll = lcore_info[lcore_id].lcore_ll;
2267         prev_tsc = 0;
2268
2269         while (1) {
2270                 cur_tsc = rte_rdtsc();
2271
2272                 /* TX burst queue drain */
2273                 diff_tsc = cur_tsc - prev_tsc;
2274                 if (unlikely(diff_tsc > drain_tsc)) {
2275                         /*
2276                          * Get mbuf from vpool.pool and detach mbuf and
2277                          * put back into vpool.ring.
2278                          */
2279                         dev_ll = lcore_ll->ll_root_used;
2280                         while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2281                                 /* Get virtio device ID */
2282                                 dev = dev_ll->dev;
2283
2284                                 if (likely(!dev->remove)) {
2285                                         tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2286                                         if (tx_q->len) {
2287                                                 LOG_DEBUG(DATA,
2288                                                 "TX queue drained after timeout"
2289                                                 " with burst size %u\n",
2290                                                 tx_q->len);
2291
2292                                                 /*
2293                                                  * Tx any packets in the queue
2294                                                  */
2295                                                 ret = rte_eth_tx_burst(
2296                                                         ports[0],
2297                                                         (uint16_t)tx_q->txq_id,
2298                                                         (struct rte_mbuf **)
2299                                                         tx_q->m_table,
2300                                                         (uint16_t)tx_q->len);
2301                                                 if (unlikely(ret < tx_q->len)) {
2302                                                         do {
2303                                                                 rte_pktmbuf_free(
2304                                                                         tx_q->m_table[ret]);
2305                                                         } while (++ret < tx_q->len);
2306                                                 }
2307                                                 tx_q->len = 0;
2308
2309                                                 txmbuf_clean_zcp(dev,
2310                                                         &vpool_array[MAX_QUEUES+dev->vmdq_rx_q]);
2311                                         }
2312                                 }
2313                                 dev_ll = dev_ll->next;
2314                         }
2315                         prev_tsc = cur_tsc;
2316                 }
2317
2318                 rte_prefetch0(lcore_ll->ll_root_used);
2319
2320                 /*
2321                  * If requested, inform the configuration core that we have exited
2322                  * the linked list and that no devices are in use on this core.
2323                  */
2324                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2325                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2326
2327                 /* Process devices */
2328                 dev_ll = lcore_ll->ll_root_used;
2329
2330                 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2331                         dev = dev_ll->dev;
2332                         if (unlikely(dev->remove)) {
2333                                 dev_ll = dev_ll->next;
2334                                 unlink_vmdq(dev);
2335                                 dev->ready = DEVICE_SAFE_REMOVE;
2336                                 continue;
2337                         }
2338
2339                         if (likely(dev->ready == DEVICE_RX)) {
2340                                 uint32_t index = dev->vmdq_rx_q;
2341                                 uint16_t i;
2342                                 count_in_ring
2343                                 = rte_ring_count(vpool_array[index].ring);
2344                                 uint16_t free_entries
2345                                 = (uint16_t)get_available_ring_num_zcp(dev);
2346
2347                                 /*
2348                                  * Attach all mbufs in vpool.ring and put back
2349                                  * into vpool.pool.
2350                                  */
2351                                 for (i = 0;
2352                                 i < RTE_MIN(free_entries,
2353                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2354                                 i++)
2355                                         attach_rxmbuf_zcp(dev);
2356
2357                                 /* Handle guest RX */
2358                                 rx_count = rte_eth_rx_burst(ports[0],
2359                                         (uint16_t)dev->vmdq_rx_q, pkts_burst,
2360                                         MAX_PKT_BURST);
2361
2362                                 if (rx_count) {
2363                                         ret_count = virtio_dev_rx_zcp(dev,
2364                                                         pkts_burst, rx_count);
2365                                         if (enable_stats) {
2366                                                 dev_statistics[dev->device_fh].rx_total
2367                                                         += rx_count;
2368                                                 dev_statistics[dev->device_fh].rx
2369                                                         += ret_count;
2370                                         }
2371                                         while (likely(rx_count)) {
2372                                                 rx_count--;
2373                                                 pktmbuf_detach_zcp(
2374                                                         pkts_burst[rx_count]);
2375                                                 rte_ring_sp_enqueue(
2376                                                         vpool_array[index].ring,
2377                                                         (void *)pkts_burst[rx_count]);
2378                                         }
2379                                 }
2380                         }
2381
2382                         if (likely(!dev->remove))
2383                                 /* Handle guest TX */
2384                                 virtio_dev_tx_zcp(dev);
2385
2386                         /* Move to the next device in the list */
2387                         dev_ll = dev_ll->next;
2388                 }
2389         }
2390
2391         return 0;
2392 }
2393
2394
2395 /*
2396  * Add an entry to a used linked list. A free entry must first be found
2397  * in the free linked list using get_data_ll_free_entry();
2398  */
2399 static void
2400 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2401         struct virtio_net_data_ll *ll_dev)
2402 {
2403         struct virtio_net_data_ll *ll = *ll_root_addr;
2404
2405         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2406         ll_dev->next = NULL;
2407         rte_compiler_barrier();
2408
2409         /* If ll == NULL then this is the first device. */
2410         if (ll) {
2411                 /* Increment to the tail of the linked list. */
2412                 while (ll->next != NULL)
2413                         ll = ll->next;
2414
2415                 ll->next = ll_dev;
2416         } else {
2417                 *ll_root_addr = ll_dev;
2418         }
2419 }
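/*
 * Typical usage of the linked-list helpers (illustrative sketch only):
 *
 *   struct virtio_net_data_ll *entry = get_data_ll_free_entry(&ll_root_free);
 *   if (entry != NULL) {
 *           entry->dev = dev;
 *           add_data_ll_entry(&ll_root_used, entry);
 *   }
 */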
2420
2421 /*
2422  * Remove an entry from a used linked list. The entry must then be added to
2423  * the free linked list using put_data_ll_free_entry().
2424  */
2425 static void
2426 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2427         struct virtio_net_data_ll *ll_dev,
2428         struct virtio_net_data_ll *ll_dev_last)
2429 {
2430         struct virtio_net_data_ll *ll = *ll_root_addr;
2431
2432         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2433                 return;
2434
2435         if (ll_dev == ll)
2436                 *ll_root_addr = ll_dev->next;
2437         else
2438                 if (likely(ll_dev_last != NULL))
2439                         ll_dev_last->next = ll_dev->next;
2440                 else
2441                         RTE_LOG(ERR, CONFIG, "Remove entry from ll failed.\n");
2442 }
2443
2444 /*
2445  * Find and return an entry from the free linked list.
2446  */
2447 static struct virtio_net_data_ll *
2448 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2449 {
2450         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2451         struct virtio_net_data_ll *ll_dev;
2452
2453         if (ll_free == NULL)
2454                 return NULL;
2455
2456         ll_dev = ll_free;
2457         *ll_root_addr = ll_free->next;
2458
2459         return ll_dev;
2460 }
2461
2462 /*
2463  * Place an entry back on to the free linked list.
2464  */
2465 static void
2466 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2467         struct virtio_net_data_ll *ll_dev)
2468 {
2469         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2470
2471         if (ll_dev == NULL)
2472                 return;
2473
2474         ll_dev->next = ll_free;
2475         *ll_root_addr = ll_dev;
2476 }
2477
2478 /*
2479  * Creates a linked list of a given size.
2480  */
2481 static struct virtio_net_data_ll *
2482 alloc_data_ll(uint32_t size)
2483 {
2484         struct virtio_net_data_ll *ll_new;
2485         uint32_t i;
2486
2487         /* Malloc and then chain the linked list. */
2488         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2489         if (ll_new == NULL) {
2490                 RTE_LOG(ERR, CONFIG, "Failed to allocate memory for ll_new.\n");
2491                 return NULL;
2492         }
2493
2494         for (i = 0; i < size - 1; i++) {
2495                 ll_new[i].dev = NULL;
2496                 ll_new[i].next = &ll_new[i+1];
2497         }
2498         ll_new[i].next = NULL;
2499
2500         return (ll_new);
2501 }
2502
2503 /*
2504  * Create the main linked list along with each individual core's linked list. A used and a free list
2505  * are created to manage entries.
2506  */
2507 static int
2508 init_data_ll(void)
2509 {
2510         int lcore;
2511
2512         RTE_LCORE_FOREACH_SLAVE(lcore) {
2513                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2514                 if (lcore_info[lcore].lcore_ll == NULL) {
2515                         RTE_LOG(ERR, CONFIG, "Failed to allocate memory for lcore_ll.\n");
2516                         return -1;
2517                 }
2518
2519                 lcore_info[lcore].lcore_ll->device_num = 0;
2520                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2521                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2522                 if (num_devices % num_switching_cores)
2523                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2524                 else
2525                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2526         }
2527
2528         /* Allocate devices up to a maximum of MAX_DEVICES. */
2529         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2530
2531         return 0;
2532 }
2533
2534 /*
2535  * Set virtqueue flags so that we do not receive interrupts.
2536  */
2537 static void
2538 set_irq_status(struct virtio_net *dev)
2539 {
2540         dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2541         dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2542 }
2543
2544 /*
2545  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2546  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid reordering
2547  * of dev->remove=1, which could cause an infinite loop in the rte_pause loop.
2548  */
2549 static void
2550 destroy_device(volatile struct virtio_net *dev)
2551 {
2552         struct virtio_net_data_ll *ll_lcore_dev_cur;
2553         struct virtio_net_data_ll *ll_main_dev_cur;
2554         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2555         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2556         int lcore;
2557
2558         dev->flags &= ~VIRTIO_DEV_RUNNING;
2559
2560         /* Set the remove flag. */
2561         dev->remove = 1;
2562
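             /* Wait until the device has been flagged as safe to remove. */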
2563         while (dev->ready != DEVICE_SAFE_REMOVE) {
2564                 rte_pause();
2565         }
2566
2567         /* Search for entry to be removed from lcore ll */
2568         ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
2569         while (ll_lcore_dev_cur != NULL) {
2570                 if (ll_lcore_dev_cur->dev == dev) {
2571                         break;
2572                 } else {
2573                         ll_lcore_dev_last = ll_lcore_dev_cur;
2574                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2575                 }
2576         }
2577
2578         if (ll_lcore_dev_cur == NULL) {
2579                 RTE_LOG(ERR, CONFIG,
2580                         "(%"PRIu64") Failed to find the device to be destroyed.\n",
2581                         dev->device_fh);
2582                 return;
2583         }
2584
2585         /* Search for entry to be removed from main ll */
2586         ll_main_dev_cur = ll_root_used;
2587         ll_main_dev_last = NULL;
2588         while (ll_main_dev_cur != NULL) {
2589                 if (ll_main_dev_cur->dev == dev) {
2590                         break;
2591                 } else {
2592                         ll_main_dev_last = ll_main_dev_cur;
2593                         ll_main_dev_cur = ll_main_dev_cur->next;
2594                 }
2595         }
2596
2597         /* Remove entries from the lcore and main ll. */
2598         rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2599         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2600
2601         /* Set the dev_removal_flag on each lcore. */
2602         RTE_LCORE_FOREACH_SLAVE(lcore) {
2603                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2604         }
2605
2606         /*
2607          * Once each core has set its dev_removal_flag back to ACK_DEV_REMOVAL we can be sure that
2608          * it can no longer access the device removed from the linked lists and that the device
2609          * is no longer in use.
2610          */
2611         RTE_LCORE_FOREACH_SLAVE(lcore) {
2612                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2613                         rte_pause();
2614                 }
2615         }
2616
2617         /* Add the entries back to the lcore and main free ll.*/
2618         put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2619         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2620
2621         /* Decrement number of device on the lcore. */
2622         lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
2623
2624         RTE_LOG(INFO, DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2625
2626         if (zero_copy) {
2627                 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2628
2629                 /* Stop the RX queue. */
2630                 if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
2631                         LOG_DEBUG(CONFIG,
2632                                 "(%"PRIu64") In destroy_device: Failed to stop "
2633                                 "rx queue:%d\n",
2634                                 dev->device_fh,
2635                                 dev->vmdq_rx_q);
2636                 }
2637
2638                 LOG_DEBUG(CONFIG,
2639                         "(%"PRIu64") in destroy_device: Start putting mbufs from "
2640                         "mempool back to the ring for RX queue: %d\n",
2641                         dev->device_fh, dev->vmdq_rx_q);
2642
2643                 mbuf_destroy_zcp(vpool);
2644
2645                 /* Stop the TX queue. */
2646                 if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
2647                         LOG_DEBUG(CONFIG,
2648                                 "(%"PRIu64") In destroy_device: Failed to "
2649                                 "stop tx queue:%d\n",
2650                                 dev->device_fh, dev->vmdq_rx_q);
2651                 }
2652
2653                 vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES];
2654
2655                 LOG_DEBUG(CONFIG,
2656                         "(%"PRIu64") destroy_device: Start putting mbufs from mempool "
2657                         "back to the ring for TX queue: %d, dev:(%"PRIu64")\n",
2658                         dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES),
2659                         dev->device_fh);
2660
2661                 mbuf_destroy_zcp(vpool);
2662         }
2663
2664 }
2665
2666 /*
2667  * A new device is added to a data core. First the device is added to the main linked list
2668  * and then allocated to a specific data core.
2669  */
2670 static int
2671 new_device(struct virtio_net *dev)
2672 {
2673         struct virtio_net_data_ll *ll_dev;
2674         int lcore, core_add = 0;
2675         uint32_t device_num_min = num_devices;
2676
2677         /* Add device to main ll */
2678         ll_dev = get_data_ll_free_entry(&ll_root_free);
2679         if (ll_dev == NULL) {
2680                 RTE_LOG(INFO, DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2681                         "of %d devices has been reached\n",
2682                         dev->device_fh, num_devices);
2683                 return -1;
2684         }
2685         ll_dev->dev = dev;
2686         add_data_ll_entry(&ll_root_used, ll_dev);
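             /* Map the device to a VMDQ RX queue based on its device handle. */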
2687         ll_dev->dev->vmdq_rx_q
2688                 = ll_dev->dev->device_fh * (num_queues / num_devices);
2689
2690         if (zero_copy) {
2691                 uint32_t index = ll_dev->dev->vmdq_rx_q;
2692                 uint32_t count_in_ring, i;
2693                 struct mbuf_table *tx_q;
2694
2695                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2696
2697                 LOG_DEBUG(CONFIG,
2698                         "(%"PRIu64") in new_device: mbuf count in mempool "
2699                         "before attach is: %d\n",
2700                         dev->device_fh,
2701                         rte_mempool_count(vpool_array[index].pool));
2702                 LOG_DEBUG(CONFIG,
2703                         "(%"PRIu64") in new_device: mbuf count in ring "
2704                         "before attach is: %d\n",
2705                         dev->device_fh, count_in_ring);
2706
2707                 /*
2708                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2709                  */
2710                 for (i = 0; i < count_in_ring; i++)
2711                         attach_rxmbuf_zcp(dev);
2712
2713                 LOG_DEBUG(CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2714                         "mempool after attach is: %d\n",
2715                         dev->device_fh,
2716                         rte_mempool_count(vpool_array[index].pool));
2717                 LOG_DEBUG(CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2718                         "ring after attach is: %d\n",
2719                         dev->device_fh,
2720                         rte_ring_count(vpool_array[index].ring));
2721
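                     /* Use the device's VMDQ RX queue index for its zero-copy TX queue as well. */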
2722                 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2723                 tx_q->txq_id = dev->vmdq_rx_q;
2724
2725                 if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
2726                         struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2727
2728                         LOG_DEBUG(CONFIG,
2729                                 "(%"PRIu64") In new_device: Failed to start "
2730                                 "tx queue:%d\n",
2731                                 dev->device_fh, dev->vmdq_rx_q);
2732
2733                         mbuf_destroy_zcp(vpool);
2734                         return -1;
2735                 }
2736
2737                 if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
2738                         struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2739
2740                         LOG_DEBUG(CONFIG,
2741                                 "(%"PRIu64") In new_device: Failed to start "
2742                                 "rx queue:%d\n",
2743                                 dev->device_fh, dev->vmdq_rx_q);
2744
2745                         /* Stop the TX queue. */
2746                         if (rte_eth_dev_tx_queue_stop(ports[0],
2747                                 dev->vmdq_rx_q) != 0) {
2748                                 LOG_DEBUG(CONFIG,
2749                                         "(%"PRIu64") In new_device: Failed to "
2750                                         "stop tx queue:%d\n",
2751                                         dev->device_fh, dev->vmdq_rx_q);
2752                         }
2753
2754                         mbuf_destroy_zcp(vpool);
2755                         return -1;
2756                 }
2757
2758         }
2759
2760         /* Reset the ready flag. */
2761         dev->ready = DEVICE_MAC_LEARNING;
2762         dev->remove = 0;
2763
2764         /* Find a suitable lcore to add the device. */
2765         RTE_LCORE_FOREACH_SLAVE(lcore) {
2766                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2767                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2768                         core_add = lcore;
2769                 }
2770         }
2771         /* Add device to lcore ll */
2772         ll_dev->dev->coreid = core_add;
2773         ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2774         if (ll_dev == NULL) {
2775                 RTE_LOG(INFO, DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2776                 dev->ready = DEVICE_SAFE_REMOVE;
2777                 destroy_device(dev);
2778                 return -1;
2779         }
2780         ll_dev->dev = dev;
2781         add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2782
2783         /* Initialize device stats */
2784         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2785
2786         /* Disable notifications. */
2787         set_irq_status(dev);
2788         lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
2789         dev->flags |= VIRTIO_DEV_RUNNING;
2790
2791         RTE_LOG(INFO, DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
2792
2793         return 0;
2794 }
2795
2796 /*
2797  * These callbacks allow devices to be added to the data core when configuration
2798  * has fully completed.
2799  */
2800 static const struct virtio_net_device_ops virtio_net_device_ops =
2801 {
2802         .new_device = new_device,
2803         .destroy_device = destroy_device,
2804 };
2805
2806 /*
2807  * This thread wakes up periodically to print statistics if the user has
2808  * enabled them.
2809  */
2810 static void
2811 print_stats(void)
2812 {
2813         struct virtio_net_data_ll *dev_ll;
2814         uint64_t tx_dropped, rx_dropped;
2815         uint64_t tx, tx_total, rx, rx_total;
2816         uint32_t device_fh;
2817         const char clr[] = { 27, '[', '2', 'J', '\0' };
2818         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2819
2820         while (1) {
2821                 sleep(enable_stats);
2822
2823                 /* Clear screen and move to top left */
2824                 printf("%s%s", clr, top_left);
2825
2826                 printf("\nDevice statistics ====================================");
2827
2828                 dev_ll = ll_root_used;
2829                 while (dev_ll != NULL) {
2830                         device_fh = (uint32_t)dev_ll->dev->device_fh;
2831                         tx_total = dev_statistics[device_fh].tx_total;
2832                         tx = dev_statistics[device_fh].tx;
2833                         tx_dropped = tx_total - tx;
2834                         if (zero_copy == 0) {
2835                                 rx_total = rte_atomic64_read(
2836                                         &dev_statistics[device_fh].rx_total_atomic);
2837                                 rx = rte_atomic64_read(
2838                                         &dev_statistics[device_fh].rx_atomic);
2839                         } else {
2840                                 rx_total = dev_statistics[device_fh].rx_total;
2841                                 rx = dev_statistics[device_fh].rx;
2842                         }
2843                         rx_dropped = rx_total - rx;
2844
2845                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2846                                         "\nTX total:            %"PRIu64""
2847                                         "\nTX dropped:          %"PRIu64""
2848                                         "\nTX successful:               %"PRIu64""
2849                                         "\nRX total:            %"PRIu64""
2850                                         "\nRX dropped:          %"PRIu64""
2851                                         "\nRX successful:               %"PRIu64"",
2852                                         device_fh,
2853                                         tx_total,
2854                                         tx_dropped,
2855                                         tx,
2856                                         rx_total,
2857                                         rx_dropped,
2858                                         rx);
2859
2860                         dev_ll = dev_ll->next;
2861                 }
2862                 printf("\n======================================================\n");
2863         }
2864 }
2865
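     /*
      * Create a mempool of zero-copy mbufs and a companion ring for the
      * given vpool_array slot. Exits the application if either allocation
      * fails.
      */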
2866 static void
2867 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2868         char *ring_name, uint32_t nb_mbuf)
2869 {
2870         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2871         vpool_array[index].pool
2872                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2873                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2874                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2875                 rte_pktmbuf_init, NULL, socket, 0);
2876         if (vpool_array[index].pool != NULL) {
2877                 vpool_array[index].ring
2878                         = rte_ring_create(ring_name,
2879                                 rte_align32pow2(nb_mbuf + 1),
2880                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2881                 if (likely(vpool_array[index].ring != NULL)) {
2882                         LOG_DEBUG(CONFIG,
2883                                 "in setup_mempool_tbl: mbuf count in "
2884                                 "mempool is: %d\n",
2885                                 rte_mempool_count(vpool_array[index].pool));
2886                         LOG_DEBUG(CONFIG,
2887                                 "in setup_mempool_tbl: mbuf count in "
2888                                 "ring is: %d\n",
2889                                 rte_ring_count(vpool_array[index].ring));
2890                 } else {
2891                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2892                                 ring_name);
2893                 }
2894
2895                 /* The headroom must be excluded from the usable buffer size. */
2896                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2897         } else {
2898                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2899         }
2900 }
2901
2902
2903 /*
2904  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2905  * device is also registered here to handle the IOCTLs.
2906  */
2907 int
2908 MAIN(int argc, char *argv[])
2909 {
2910         struct rte_mempool *mbuf_pool = NULL;
2911         unsigned lcore_id, core_id = 0;
2912         unsigned nb_ports, valid_num_ports;
2913         int ret;
2914         uint8_t portid, queue_id = 0;
2915         static pthread_t tid;
2916
2917         /* init EAL */
2918         ret = rte_eal_init(argc, argv);
2919         if (ret < 0)
2920                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2921         argc -= ret;
2922         argv += ret;
2923
2924         /* parse app arguments */
2925         ret = us_vhost_parse_args(argc, argv);
2926         if (ret < 0)
2927                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2928
2929         if (rte_eal_pci_probe() != 0)
2930                 rte_exit(EXIT_FAILURE, "Error with NIC driver initialization\n");
2931
2932         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2933                 if (rte_lcore_is_enabled(lcore_id))
2934                         lcore_ids[core_id++] = lcore_id;
2935
2936         if (rte_lcore_count() > RTE_MAX_LCORE)
2937                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2938
2939         /* Set the number of switching cores available. */
2940         num_switching_cores = rte_lcore_count()-1;
2941
2942         /* Get the number of physical ports. */
2943         nb_ports = rte_eth_dev_count();
2944         if (nb_ports > RTE_MAX_ETHPORTS)
2945                 nb_ports = RTE_MAX_ETHPORTS;
2946
2947         /*
2948          * Update the global variable num_ports and the global array ports,
2949          * and derive valid_num_ports from the number of ports on the system.
2950          */
2951         valid_num_ports = check_ports_num(nb_ports);
2952
2953         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2954                 RTE_LOG(INFO, PORT, "Current enabled port number is %u, "
2955                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2956                 return -1;
2957         }
2958
2959         if (zero_copy == 0) {
2960                 /* Create the mbuf pool. */
2961                 mbuf_pool = rte_mempool_create(
2962                                 "MBUF_POOL",
2963                                 NUM_MBUFS_PER_PORT
2964                                 * valid_num_ports,
2965                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2966                                 sizeof(struct rte_pktmbuf_pool_private),
2967                                 rte_pktmbuf_pool_init, NULL,
2968                                 rte_pktmbuf_init, NULL,
2969                                 rte_socket_id(), 0);
2970                 if (mbuf_pool == NULL)
2971                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2972
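                     /* Without zero copy, all queues share the single mbuf pool. */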
2973                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2974                         vpool_array[queue_id].pool = mbuf_pool;
2975
2976                 if (vm2vm_mode == VM2VM_HARDWARE) {
2977                         /* Enable VT loopback so the NIC's L2 switch can forward VM-to-VM traffic. */
2978                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2979                         LOG_DEBUG(CONFIG,
2980                                 "Enable loop back for L2 switch in vmdq.\n");
2981                 }
2982         } else {
2983                 uint32_t nb_mbuf;
2984                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2985                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2986
2987                 rx_conf_default.start_rx_per_q = (uint8_t)zero_copy;
2988                 rx_conf_default.rx_drop_en = 0;
2989                 tx_conf_default.start_tx_per_q = (uint8_t)zero_copy;
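                     /* Per-queue RX pool size: descriptors plus per-core cache and burst headroom. */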
2990                 nb_mbuf = num_rx_descriptor
2991                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2992                         + num_switching_cores * MAX_PKT_BURST;
2993
2994                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2995                         rte_snprintf(pool_name, sizeof(pool_name),
2996                                 "rxmbuf_pool_%u", queue_id);
2997                         rte_snprintf(ring_name, sizeof(ring_name),
2998                                 "rxmbuf_ring_%u", queue_id);
2999                         setup_mempool_tbl(rte_socket_id(), queue_id,
3000                                 pool_name, ring_name, nb_mbuf);
3001                 }
3002
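                     /* Per-queue TX pool size, computed the same way from the TX descriptor count. */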
3003                 nb_mbuf = num_tx_descriptor
3004                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3005                                 + num_switching_cores * MAX_PKT_BURST;
3006
3007                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3008                         rte_snprintf(pool_name, sizeof(pool_name),
3009                                 "txmbuf_pool_%u", queue_id);
3010                         rte_snprintf(ring_name, sizeof(ring_name),
3011                                 "txmbuf_ring_%u", queue_id);
3012                         setup_mempool_tbl(rte_socket_id(),
3013                                 (queue_id + MAX_QUEUES),
3014                                 pool_name, ring_name, nb_mbuf);
3015                 }
3016
3017                 if (vm2vm_mode == VM2VM_HARDWARE) {
3018                         /* Enable VT loopback so the NIC's L2 switch can forward VM-to-VM traffic. */
3019                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3020                         LOG_DEBUG(CONFIG,
3021                                 "Enable loop back for L2 switch in vmdq.\n");
3022                 }
3023         }
3024         /* Set log level. */
3025         rte_set_log_level(LOG_LEVEL);
3026
3027         /* initialize all ports */
3028         for (portid = 0; portid < nb_ports; portid++) {
3029                 /* skip ports that are not enabled */
3030                 if ((enabled_port_mask & (1 << portid)) == 0) {
3031                         RTE_LOG(INFO, PORT,
3032                                 "Skipping disabled port %d\n", portid);
3033                         continue;
3034                 }
3035                 if (port_init(portid) != 0)
3036                         rte_exit(EXIT_FAILURE,
3037                                 "Cannot initialize network ports\n");
3038         }
3039
3040         /* Initialise all linked lists. */
3041         if (init_data_ll() == -1)
3042                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3043
3044         /* Initialize device stats */
3045         memset(&dev_statistics, 0, sizeof(dev_statistics));
3046
3047         /* Enable stats if the user option is set. */
3048         if (enable_stats)
3049                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3050
3051         /* Launch all data cores. */
3052         if (zero_copy == 0) {
3053                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3054                         rte_eal_remote_launch(switch_worker,
3055                                 mbuf_pool, lcore_id);
3056                 }
3057         } else {
3058                 uint32_t count_in_mempool, index, i;
3059                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3060                         /* For all RX and TX queues. */
3061                         count_in_mempool
3062                                 = rte_mempool_count(vpool_array[index].pool);
3063
3064                         /*
3065                          * Transfer all unattached mbufs from vpool.pool
3066                          * to vpool.ring.
3067                          */
3068                         for (i = 0; i < count_in_mempool; i++) {
3069                                 struct rte_mbuf *mbuf
3070                                         = __rte_mbuf_raw_alloc(
3071                                                 vpool_array[index].pool);
3072                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3073                                                 (void *)mbuf);
3074                         }
3075
3076                         LOG_DEBUG(CONFIG,
3077                                 "in MAIN: initial mbuf count in mempool "
3078                                 "is: %d\n", count_in_mempool);
3079                         LOG_DEBUG(CONFIG,
3080                                 "in MAIN: initial mbuf count in ring is:"
3081                                 " %d\n",
3082                                 rte_ring_count(vpool_array[index].ring));
3083                 }
3084
3085                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3086                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3087                                 lcore_id);
3088         }
3089
3090         /* Register CUSE device to handle IOCTLs. */
3091         ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks());
3092         if (ret != 0)
3093                 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3094
3095         init_virtio_net(&virtio_net_device_ops);
3096
3097         /* Start CUSE session. */
3098         start_cuse_session_loop();
3099         return 0;
3100
3101 }
3102