examples/vhost: remove the non-working zero copy code
[dpdk.git] / examples / vhost / main.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +       \
                            (num_switching_cores * MAX_PKT_BURST) +         \
                            (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
                            ((num_switching_cores + 1) * MBUF_CACHE_SIZE))
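
/*
 * The four terms above account for, in order: mbufs resident in the NIC RX
 * rings, one burst in flight per switching core, mbufs held in the NIC TX
 * rings per core, and the slack kept in each core's mempool cache.
 */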

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

#define MAX_PKT_BURST 32        /* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING     0
#define DEVICE_RX               1
#define DEVICE_SAFE_REMOVE      2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
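
/*
 * struct vring_desc is 16 bytes (u64 addr, u32 len, u16 flags, u16 next),
 * so with 64-byte cache lines this evaluates to 4 descriptors per line.
 */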

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Do vlan strip on host, enabled by default */
static uint32_t vlan_strip = 1;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty vmdq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * VLAN strip is necessary for 1G NICs such as the I350;
                 * it fixes a bug where IPv4 forwarding in the guest could
                 * not forward packets from one virtio device to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * should be overridden separately in code with
                 * appropriate values
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

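/*
 * VLAN tags handed out to guest devices: device_fh N gets tag 1000 + N
 * (see link_vmdq()); the same tags index the VMDQ pool map built in
 * get_eth_conf().
 */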
const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
        unsigned char   h_dest[ETH_ALEN];
        unsigned char   h_source[ETH_ALEN];
        __be16          h_vlan_proto;
        __be16          h_vlan_TCI;
        __be16          h_vlan_encapsulated_proto;
};

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
        uint64_t tx_total;
        rte_atomic64_t rx_total_atomic;
        uint64_t tx;
        rte_atomic64_t rx_atomic;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        struct rte_eth_vmdq_rx_conf *def_conf =
                &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back = def_conf->enable_loop_back;
        conf.rx_mode = def_conf->rx_mode;

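        /*
         * Steer VLAN vlan_tags[i] to pool i: a packet tagged 1000 + i lands
         * in the RX queue(s) of pool i, i.e. of guest device i.
         */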
        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        struct rte_eth_rxconf *rxconf;
        struct rte_eth_txconf *txconf;
        int16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
        rte_eth_dev_info_get(port, &dev_info);

        if (dev_info.max_rx_queues > MAX_QUEUES) {
                rte_exit(EXIT_FAILURE,
                        "please define MAX_QUEUES no less than %u in %s\n",
                        dev_info.max_rx_queues, __FILE__);
        }

        rxconf = &dev_info.default_rxconf;
        txconf = &dev_info.default_txconf;
        rxconf->rx_drop_en = 1;

        /* Enable vlan offload */
        txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

        /* Configure the number of supported virtio devices based on VMDQ limits */
        num_devices = dev_info.max_vmdq_pools;

        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
        tx_rings = (uint16_t)rte_lcore_count();

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;
        /* NIC queues are divided into pf queues and vmdq queues. */
        num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
        queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
        num_vmdq_queues = num_devices * queues_per_pool;
        num_queues = num_pf_queues + num_vmdq_queues;
        vmdq_queue_base = dev_info.vmdq_queue_base;
        vmdq_pool_base  = dev_info.vmdq_pool_base;
        printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
                num_pf_queues, num_devices, queues_per_pool);

        if (port >= rte_eth_dev_count())
                return -1;

        if (enable_tx_csum == 0)
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

        if (enable_tso == 0) {
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
        }

        rx_rings = (uint16_t)dev_info.max_rx_queues;
        /* Configure ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0)
                return retval;

        /* Setup the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                rxconf,
                                                mbuf_pool);
                if (retval < 0)
                        return retval;
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                txconf);
                if (retval < 0)
                        return retval;
        }

        /* Start the device. */
        retval = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
                return retval;
        }

        if (promiscuous)
                rte_eth_promiscuous_enable(port);

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        (unsigned)port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
        /* The basename must fit in dev_basename, including the NUL. */
        if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
                return -1;
        else
                snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

        return 0;
}

/*
 * Parse the portmask provided at run time.
 * Returns the mask, or 0 if the string is not a valid non-zero hex mask.
 */
static uint32_t
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return 0;

        return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --dev-basename <name>\n"
        "               --nb-devices ND\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if rx retries are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if rx retries are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --dev-basename: The basename to be used for the character device.\n"
        "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
        "               --tso [0|1] disable/enable TCP segmentation offload.\n",
               prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"vlan-strip", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"dev-basename", required_argument, NULL, 0},
                {"tx-csum", required_argument, NULL, 0},
                {"tso", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:P",
                        long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        enabled_port_mask = parse_portmask(optarg);
                        if (enabled_port_mask == 0) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        break;

                case 'P':
                        promiscuous = 1;
                        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
                                ETH_VMDQ_ACCEPT_BROADCAST |
                                ETH_VMDQ_ACCEPT_MULTICAST;
                        rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Enable/disable TX checksum offload. */
                        if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tx_csum = ret;
                        }

                        /* Enable/disable TSO offload. */
                        if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tso = ret;
                        }

                        /* Specify the retry delay time (in microseconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        mergeable = !!ret;
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                        }
                                }
                        }

                        /* Enable/disable RX VLAN strip on host. */
                        if (!strncmp(long_option[option_index].name,
                                "vlan-strip", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for VLAN strip [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vlan_strip = !!ret;
                                        vmdq_conf_default.rxmode.hw_vlan_strip =
                                                vlan_strip;
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set character device basename. */
                        if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_basename(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        break;

                        /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = (uint8_t)i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        return 0;
}

/*
 * Update the global var num_ports and array ports according to the number
 * of ports present on the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
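/*
 * Note: this loads 8 bytes from each 6-byte address and masks off the two
 * extra bytes with MAC_ADDR_CMP (the low six bytes of the u64 on a
 * little-endian host). Both operands must therefore stay readable 2 bytes
 * past the struct; here they point into an ether_hdr in a packet buffer or
 * into a larger vhost_dev structure.
 */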
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
        return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct virtio_net_data_ll *dev_ll;
        struct virtio_net *dev = vdev->dev;
        int i, ret;

        /* Learn MAC address of guest device from packet */
        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
                        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
                        return -1;
                }
                dev_ll = dev_ll->next;
        }

        for (i = 0; i < ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* vlan_tag currently uses the device_id. */
        vdev->vlan_tag = vlan_tags[dev->device_fh];

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
                dev->device_fh,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
                                (uint32_t)dev->device_fh + vmdq_pool_base);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
                                        dev->device_fh);

        /* Enable stripping of the vlan tag as we handle routing. */
        if (vlan_strip)
                rte_eth_dev_set_vlan_strip_on_queue(ports[0],
                        (uint16_t)vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* Clear MAC and VLAN settings */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < 6; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers */
                rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct virtio_net_data_ll *dev_ll;
        struct ether_hdr *pkt_hdr;
        uint64_t ret = 0;
        struct virtio_net *dev = vdev->dev;
        struct virtio_net *tdev; /* destination virtio device */

        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        /* Get the used devices list */
        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
                                          &dev_ll->vdev->mac_address)) {

                        /* Drop the packet if the TX packet is destined for the TX device. */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
                                        "Source and destination MAC addresses are the same. "
                                        "Dropping packet.\n",
                                        dev->device_fh);
                                return 0;
                        }
                        tdev = dev_ll->vdev->dev;

                        RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
                                "MAC address is local\n", tdev->device_fh);

                        if (unlikely(dev_ll->vdev->remove)) {
                                /* Drop the packet if the device is marked for removal */
                                RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
                                        "Device is marked for removal\n", tdev->device_fh);
                        } else {
                                /* Send the packet to the local virtio device */
                                ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
                                if (enable_stats) {
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_total_atomic,
                                        1);
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_atomic,
                                        ret);
                                        dev_statistics[dev->device_fh].tx_total++;
                                        dev_statistics[dev->device_fh].tx += ret;
                                }
                        }

                        return 0;
                }
                dev_ll = dev_ll->next;
        }

        return -1;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
        uint32_t *offset, uint16_t *vlan_tag)
{
        struct virtio_net_data_ll *dev_ll = ll_root_used;
        struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX)
                        && ether_addr_cmp(&(pkt_hdr->d_addr),
                                &dev_ll->vdev->mac_address)) {
                        /*
                         * Drop the packet if the TX packet is
                         * destined for the TX device.
                         */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                RTE_LOG(DEBUG, VHOST_DATA,
                                "(%"PRIu64") TX: Source and destination"
                                " MAC addresses are the same. Dropping "
                                "packet.\n",
                                dev_ll->vdev->dev->device_fh);
                                return -1;
                        }

                        /*
                         * HW vlan strip shortens the packet by the length
                         * of the vlan tag, so the packet length needs to
                         * be restored by adding it back.
                         */
                        *offset = VLAN_HLEN;
                        *vlan_tag =
                        (uint16_t)
                        vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

                        RTE_LOG(DEBUG, VHOST_DATA,
                        "(%"PRIu64") TX: pkt to local VM device id:"
                        "(%"PRIu64") vlan tag: %d.\n",
                        dev->device_fh, dev_ll->vdev->dev->device_fh,
                        (int)*vlan_tag);

                        break;
                }
                dev_ll = dev_ll->next;
        }
        return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
        if (ol_flags & PKT_TX_IPV4)
                return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
        else /* assume ethertype == ETHER_TYPE_IPv6 */
                return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

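/*
 * For hardware TSO/checksum offload, the TCP checksum field must be seeded
 * with the pseudo-header checksum (computed over addresses, protocol and
 * length but no payload); the NIC then completes it per segment. The IPv4
 * header checksum is handled by the PKT_TX_IP_CKSUM offload. This assumes
 * l2_len/l3_len were set when the packet was dequeued from the guest.
 */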
static void virtio_tx_offload(struct rte_mbuf *m)
{
        void *l3_hdr;
        struct ipv4_hdr *ipv4_hdr = NULL;
        struct tcp_hdr *tcp_hdr = NULL;
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        l3_hdr = (char *)eth_hdr + m->l2_len;

        if (m->ol_flags & PKT_TX_IPV4) {
                ipv4_hdr = l3_hdr;
                ipv4_hdr->hdr_checksum = 0;
                m->ol_flags |= PKT_TX_IP_CKSUM;
        }

        tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
        tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
        struct mbuf_table *tx_q;
        struct rte_mbuf **m_table;
        unsigned len, ret, offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct virtio_net *dev = vdev->dev;
        struct ether_hdr *nh;

        /* Check if destination is local VM */
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
                rte_pktmbuf_free(m);
                return;
        }

        if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
                        rte_pktmbuf_free(m);
                        return;
                }
        }

        RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
                "MAC address is external\n", dev->device_fh);

        /* Add packet to the port tx queue */
        tx_q = &lcore_tx_queue[lcore_id];
        len = tx_q->len;

        nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
        if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
                /* Guest has inserted the vlan tag. */
                struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
                uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
                if ((vm2vm_mode == VM2VM_HARDWARE) &&
                        (vh->vlan_tci != vlan_tag_be))
                        vh->vlan_tci = vlan_tag_be;
        } else {
                m->ol_flags |= PKT_TX_VLAN_PKT;

                /*
                 * Find the right seg to adjust the data len when offset is
                 * bigger than tail room size.
                 */
                if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                        if (likely(offset <= rte_pktmbuf_tailroom(m)))
                                m->data_len += offset;
                        else {
                                struct rte_mbuf *seg = m;

                                while ((seg->next != NULL) &&
                                        (offset > rte_pktmbuf_tailroom(seg)))
                                        seg = seg->next;

                                seg->data_len += offset;
                        }
                        m->pkt_len += offset;
                }

                m->vlan_tci = vlan_tag;
        }

        if (m->ol_flags & PKT_TX_TCP_SEG)
                virtio_tx_offload(m);

        tx_q->m_table[len] = m;
        len++;
        if (enable_stats) {
                dev_statistics[dev->device_fh].tx_total++;
                dev_statistics[dev->device_fh].tx++;
        }

        if (unlikely(len == MAX_PKT_BURST)) {
                m_table = (struct rte_mbuf **)tx_q->m_table;
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
                /* Free any buffers not handled by TX and update the port stats. */
                if (unlikely(ret < len)) {
                        do {
                                rte_pktmbuf_free(m_table[ret]);
                        } while (++ret < len);
                }

                len = 0;
        }

        tx_q->len = len;
        return;
}

/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
        struct virtio_net *dev = NULL;
        struct vhost_dev *vdev = NULL;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
        struct virtio_net_data_ll *dev_ll;
        struct mbuf_table *tx_q;
        volatile struct lcore_ll_info *lcore_ll;
        const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
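        /*
         * drain_tsc: TSC ticks per microsecond (rounded up), multiplied by
         * BURST_TX_DRAIN_US, i.e. the ~100us TX drain period in TSC ticks.
         */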
        uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
        unsigned ret, i;
        const uint16_t lcore_id = rte_lcore_id();
        const uint16_t num_cores = (uint16_t)rte_lcore_count();
        uint16_t rx_count = 0;
        uint16_t tx_count;
        uint32_t retry = 0;

        RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
        lcore_ll = lcore_info[lcore_id].lcore_ll;
        prev_tsc = 0;

        tx_q = &lcore_tx_queue[lcore_id];
        for (i = 0; i < num_cores; i++) {
                if (lcore_ids[i] == lcore_id) {
                        tx_q->txq_id = i;
                        break;
                }
        }

        while (1) {
                cur_tsc = rte_rdtsc();
                /*
                 * TX burst queue drain
                 */
                diff_tsc = cur_tsc - prev_tsc;
                if (unlikely(diff_tsc > drain_tsc)) {

                        if (tx_q->len) {
                                RTE_LOG(DEBUG, VHOST_DATA,
                                        "TX queue drained after timeout with burst size %u\n",
                                        tx_q->len);

                                /* Tx any packets in the queue */
                                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                                                       (struct rte_mbuf **)tx_q->m_table,
                                                       (uint16_t)tx_q->len);
                                if (unlikely(ret < tx_q->len)) {
                                        do {
                                                rte_pktmbuf_free(tx_q->m_table[ret]);
                                        } while (++ret < tx_q->len);
                                }

                                tx_q->len = 0;
                        }

                        prev_tsc = cur_tsc;

                }

                rte_prefetch0(lcore_ll->ll_root_used);
                /*
                 * Inform the configuration core that we have exited the linked list and that no devices are
                 * in use if requested.
                 */
                if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
                        lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

                /*
                 * Process devices
                 */
                dev_ll = lcore_ll->ll_root_used;

                while (dev_ll != NULL) {
                        /* Get virtio device ID */
                        vdev = dev_ll->vdev;
                        dev = vdev->dev;

                        if (unlikely(vdev->remove)) {
                                dev_ll = dev_ll->next;
                                unlink_vmdq(vdev);
                                vdev->ready = DEVICE_SAFE_REMOVE;
                                continue;
                        }
                        if (likely(vdev->ready == DEVICE_RX)) {
                                /* Handle guest RX */
                                rx_count = rte_eth_rx_burst(ports[0],
                                        vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                                if (rx_count) {
                                        /*
                                         * If retry is enabled and the queue is full then we wait
                                         * and retry to avoid packet loss. Here MAX_PKT_BURST must
                                         * be less than the virtio queue size.
                                         */
                                        if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
                                                for (retry = 0; retry < burst_rx_retry_num; retry++) {
                                                        rte_delay_us(burst_rx_delay_time);
                                                        if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
                                                                break;
                                                }
                                        }
                                        ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
                                        if (enable_stats) {
                                                rte_atomic64_add(
                                                &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
                                                rx_count);
                                                rte_atomic64_add(
                                                &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
                                        }
                                        while (likely(rx_count)) {
                                                rx_count--;
                                                rte_pktmbuf_free(pkts_burst[rx_count]);
                                        }

                                }
                        }

                        if (likely(!vdev->remove)) {
                                /* Handle guest TX */
                                tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
                                /* If this is the first received packet we need to learn the MAC and setup VMDQ */
                                if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
                                        if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
                                                while (tx_count)
                                                        rte_pktmbuf_free(pkts_burst[--tx_count]);
                                        }
                                }
                                for (i = 0; i < tx_count; ++i) {
                                        virtio_tx_route(vdev, pkts_burst[i],
                                                vlan_tags[(uint16_t)dev->device_fh]);
                                }
                        }

                        /* Move to the next device in the list */
                        dev_ll = dev_ll->next;
                }
        }

        return 0;
}

/*
 * Add an entry to a used linked list. A free entry must first be found
 * in the free linked list using get_data_ll_free_entry();
 */
static void
add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
        struct virtio_net_data_ll *ll_dev)
{
        struct virtio_net_data_ll *ll = *ll_root_addr;

        /* Set next as NULL and use a compiler barrier to avoid reordering. */
        ll_dev->next = NULL;
        rte_compiler_barrier();
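
        /*
         * The data cores walk these lists concurrently without locks, so the
         * entry must be fully initialised (next == NULL) in memory before it
         * becomes reachable from the tail of the list.
         */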
1225
1226         /* If ll == NULL then this is the first device. */
1227         if (ll) {
1228                 /* Increment to the tail of the linked list. */
1229                 while ((ll->next != NULL) )
1230                         ll = ll->next;
1231
1232                 ll->next = ll_dev;
1233         } else {
1234                 *ll_root_addr = ll_dev;
1235         }
1236 }
1237
1238 /*
1239  * Remove an entry from a used linked list. The entry must then be added to
1240  * the free linked list using put_data_ll_free_entry().
1241  */
1242 static void
1243 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
1244         struct virtio_net_data_ll *ll_dev,
1245         struct virtio_net_data_ll *ll_dev_last)
1246 {
1247         struct virtio_net_data_ll *ll = *ll_root_addr;
1248
1249         if (unlikely((ll == NULL) || (ll_dev == NULL)))
1250                 return;
1251
1252         if (ll_dev == ll)
1253                 *ll_root_addr = ll_dev->next;
1254         else
1255                 if (likely(ll_dev_last != NULL))
1256                         ll_dev_last->next = ll_dev->next;
1257                 else
1258                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
1259 }
1260
1261 /*
1262  * Find and return an entry from the free linked list.
1263  */
1264 static struct virtio_net_data_ll *
1265 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
1266 {
1267         struct virtio_net_data_ll *ll_free = *ll_root_addr;
1268         struct virtio_net_data_ll *ll_dev;
1269
1270         if (ll_free == NULL)
1271                 return NULL;
1272
1273         ll_dev = ll_free;
1274         *ll_root_addr = ll_free->next;
1275
1276         return ll_dev;
1277 }
1278
1279 /*
1280  * Place an entry back on to the free linked list.
1281  */
1282 static void
1283 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
1284         struct virtio_net_data_ll *ll_dev)
1285 {
1286         struct virtio_net_data_ll *ll_free = *ll_root_addr;
1287
1288         if (ll_dev == NULL)
1289                 return;
1290
1291         ll_dev->next = ll_free;
1292         *ll_root_addr = ll_dev;
1293 }
1294
1295 /*
1296  * Creates a linked list of a given size.
1297  */
1298 static struct virtio_net_data_ll *
1299 alloc_data_ll(uint32_t size)
1300 {
1301         struct virtio_net_data_ll *ll_new;
1302         uint32_t i;
1303
1304         /* Malloc and then chain the linked list. */
1305         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
1306         if (ll_new == NULL) {
1307                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
1308                 return NULL;
1309         }
1310
        for (i = 0; i < size - 1; i++) {
                ll_new[i].vdev = NULL;
                ll_new[i].next = &ll_new[i+1];
        }
        ll_new[i].vdev = NULL;
        ll_new[i].next = NULL;

        return ll_new;
}

/*
 * Create the main linked list along with each individual core's linked list.
 * A used and a free list are created to manage entries.
 */
static int
init_data_ll(void)
{
        int lcore;

        RTE_LCORE_FOREACH_SLAVE(lcore) {
                lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
                if (lcore_info[lcore].lcore_ll == NULL) {
                        RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
                        return -1;
                }

                lcore_info[lcore].lcore_ll->device_num = 0;
                lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
                lcore_info[lcore].lcore_ll->ll_root_used = NULL;
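                /*
                 * Size each core's free list as num_devices divided by the
                 * number of switching cores, rounded up.
                 */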
                if (num_devices % num_switching_cores)
                        lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
                else
                        lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
        }

        /* Allocate devices up to a maximum of MAX_DEVICES. */
        ll_root_free = alloc_data_ll(MIN(num_devices, MAX_DEVICES));

        return 0;
}

/*
 * Remove a device from the specific data core linked list and from the main
 * linked list. Synchronization occurs through the use of the lcore
 * dev_removal_flag. The device is made volatile here to avoid re-ordering of
 * dev->remove=1, which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
        struct virtio_net_data_ll *ll_lcore_dev_cur;
        struct virtio_net_data_ll *ll_main_dev_cur;
        struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
        struct virtio_net_data_ll *ll_main_dev_last = NULL;
        struct vhost_dev *vdev;
        int lcore;

        dev->flags &= ~VIRTIO_DEV_RUNNING;

        vdev = (struct vhost_dev *)dev->priv;
        /* Set the remove flag. */
        vdev->remove = 1;
        while (vdev->ready != DEVICE_SAFE_REMOVE) {
                rte_pause();
        }

        /* Search for the entry to be removed from the lcore ll. */
        ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
        while (ll_lcore_dev_cur != NULL) {
                if (ll_lcore_dev_cur->vdev == vdev) {
                        break;
                } else {
                        ll_lcore_dev_last = ll_lcore_dev_cur;
                        ll_lcore_dev_cur = ll_lcore_dev_cur->next;
                }
        }

        if (ll_lcore_dev_cur == NULL) {
                RTE_LOG(ERR, VHOST_CONFIG,
                        "(%"PRIu64") Failed to find the device to be destroyed.\n",
                        dev->device_fh);
                return;
        }

        /* Search for the entry to be removed from the main ll. */
        ll_main_dev_cur = ll_root_used;
        ll_main_dev_last = NULL;
        while (ll_main_dev_cur != NULL) {
                if (ll_main_dev_cur->vdev == vdev) {
                        break;
                } else {
                        ll_main_dev_last = ll_main_dev_cur;
                        ll_main_dev_cur = ll_main_dev_cur->next;
                }
        }

        /* Remove the entries from the lcore and main ll. */
        rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
        rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);

        /* Set the dev_removal_flag on each lcore. */
        RTE_LCORE_FOREACH_SLAVE(lcore) {
                lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
        }

        /*
         * Once each core has set the dev_removal_flag back to ACK_DEV_REMOVAL,
         * we can be sure that it can no longer access the device removed from
         * the linked lists and that the device is no longer in use.
         */
        RTE_LCORE_FOREACH_SLAVE(lcore) {
                while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
                        rte_pause();
                }
        }

        /* Add the entries back to the lcore and main free ll. */
        put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
        put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);

        /* Decrement the number of devices on the lcore. */
        lcore_info[vdev->coreid].lcore_ll->device_num--;

        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);

        rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main
 * linked list and then allocated to a specific data core.
 */
static int
new_device(struct virtio_net *dev)
{
        struct virtio_net_data_ll *ll_dev;
        int lcore, core_add = 0;
        uint32_t device_num_min = num_devices;
        struct vhost_dev *vdev;

        vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
        if (vdev == NULL) {
                RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
                        dev->device_fh);
                return -1;
        }
        vdev->dev = dev;
        dev->priv = vdev;

        /* Add device to the main ll. */
        ll_dev = get_data_ll_free_entry(&ll_root_free);
        if (ll_dev == NULL) {
                RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
                        "of %d devices has been reached\n",
                        dev->device_fh, num_devices);
                rte_free(vdev);
                return -1;
        }
        ll_dev->vdev = vdev;
        add_data_ll_entry(&ll_root_used, ll_dev);
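        /*
         * Each device owns the first RX queue of its own VMDq pool:
         * device_fh selects the pool, offset from the VMDq queue base.
         */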
        vdev->vmdq_rx_q
                = dev->device_fh * queues_per_pool + vmdq_queue_base;

        /* Reset the ready flag. */
        vdev->ready = DEVICE_MAC_LEARNING;
        vdev->remove = 0;

        /* Find a suitable lcore (the one with the fewest devices) to add the device to. */
        RTE_LCORE_FOREACH_SLAVE(lcore) {
                if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
                        device_num_min = lcore_info[lcore].lcore_ll->device_num;
                        core_add = lcore;
                }
        }
        /* Add device to the lcore ll. */
        ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
        if (ll_dev == NULL) {
                RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
                vdev->ready = DEVICE_SAFE_REMOVE;
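                /*
                 * The device was never added to an lcore list, so
                 * destroy_device() returns before freeing it; vdev must
                 * therefore be freed explicitly here.
                 */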
                destroy_device(dev);
                rte_free(vdev);
                return -1;
        }
        ll_dev->vdev = vdev;
        vdev->coreid = core_add;

        add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);

        /* Initialize device stats */
        memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));

        /* Disable notifications. */
        rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
        rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
        lcore_info[vdev->coreid].lcore_ll->device_num++;
        dev->flags |= VIRTIO_DEV_RUNNING;

        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);

        return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
        .new_device = new_device,
        .destroy_device = destroy_device,
};

/*
 * This is a thread that wakes up periodically to print statistics if the
 * user has enabled them.
 */
static void
print_stats(void)
{
        struct virtio_net_data_ll *dev_ll;
        uint64_t tx_dropped, rx_dropped;
        uint64_t tx, tx_total, rx, rx_total;
        uint32_t device_fh;
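        /* ANSI escape sequences to clear the screen and move the cursor to the top left. */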
        const char clr[] = { 27, '[', '2', 'J', '\0' };
        const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

        while (1) {
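                /* enable_stats holds the stats print interval in seconds. */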
                sleep(enable_stats);

                /* Clear screen and move to top left */
                printf("%s%s", clr, top_left);

                printf("\nDevice statistics ====================================");

                dev_ll = ll_root_used;
                while (dev_ll != NULL) {
                        device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
                        tx_total = dev_statistics[device_fh].tx_total;
                        tx = dev_statistics[device_fh].tx;
                        tx_dropped = tx_total - tx;
                        rx_total = rte_atomic64_read(
                                &dev_statistics[device_fh].rx_total_atomic);
                        rx = rte_atomic64_read(
                                &dev_statistics[device_fh].rx_atomic);
                        rx_dropped = rx_total - rx;

                        printf("\nStatistics for device %"PRIu32" ------------------------------"
                                        "\nTX total:            %"PRIu64""
                                        "\nTX dropped:          %"PRIu64""
                                        "\nTX successful:       %"PRIu64""
                                        "\nRX total:            %"PRIu64""
                                        "\nRX dropped:          %"PRIu64""
                                        "\nRX successful:       %"PRIu64"",
                                        device_fh,
                                        tx_total,
                                        tx_dropped,
                                        tx,
                                        rx_total,
                                        rx_dropped,
                                        rx);

                        dev_ll = dev_ll->next;
                }
                printf("\n======================================================\n");
        }
}

/* When we receive a SIGINT, unregister the vhost driver. */
static void
sigint_handler(__rte_unused int signum)
{
        /* Unregister vhost driver. */
        int ret = rte_vhost_driver_unregister((char *)&dev_basename);
        if (ret != 0)
                rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
        exit(0);
}

/*
 * Main function, does initialisation and calls the per-lcore functions. The
 * CUSE device is also registered here to handle the IOCTLs.
 */
int
main(int argc, char *argv[])
{
        unsigned lcore_id, core_id = 0;
        unsigned nb_ports, valid_num_ports;
        int ret;
        uint8_t portid;
        static pthread_t tid;
        char thread_name[RTE_MAX_THREAD_NAME_LEN];

        signal(SIGINT, sigint_handler);

        /* init EAL */
        ret = rte_eal_init(argc, argv);
        if (ret < 0)
                rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
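        /* Skip past the arguments consumed by rte_eal_init() before parsing app arguments. */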
        argc -= ret;
        argv += ret;

        /* parse app arguments */
        ret = us_vhost_parse_args(argc, argv);
        if (ret < 0)
                rte_exit(EXIT_FAILURE, "Invalid argument\n");

        for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
                if (rte_lcore_is_enabled(lcore_id))
                        lcore_ids[core_id++] = lcore_id;

        if (rte_lcore_count() > RTE_MAX_LCORE)
                rte_exit(EXIT_FAILURE, "Too many cores\n");

        /* Set the number of switching cores available. */
        num_switching_cores = rte_lcore_count() - 1;

        /* Get the number of physical ports. */
        nb_ports = rte_eth_dev_count();
        if (nb_ports > RTE_MAX_ETHPORTS)
                nb_ports = RTE_MAX_ETHPORTS;

        /*
         * Update the global var NUM_PORTS and global array PORTS
         * and get value of var VALID_NUM_PORTS according to system ports number.
         */
        valid_num_ports = check_ports_num(nb_ports);

        if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        /* Create the mbuf pool. */
        mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
                NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
                0, MBUF_DATA_SIZE, rte_socket_id());
        if (mbuf_pool == NULL)
                rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

        if (vm2vm_mode == VM2VM_HARDWARE) {
                /* Enable VT loopback so the hardware L2 switch forwards VM-to-VM traffic. */
                vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
                RTE_LOG(DEBUG, VHOST_CONFIG,
                        "Enable loop back for L2 switch in vmdq.\n");
        }

        /* initialize all ports */
        for (portid = 0; portid < nb_ports; portid++) {
                /* skip ports that are not enabled */
                if ((enabled_port_mask & (1 << portid)) == 0) {
                        RTE_LOG(INFO, VHOST_PORT,
                                "Skipping disabled port %d\n", portid);
                        continue;
                }
                if (port_init(portid) != 0)
                        rte_exit(EXIT_FAILURE,
                                "Cannot initialize network ports\n");
        }

        /* Initialise all linked lists. */
        if (init_data_ll() == -1)
                rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");

        /* Initialize device stats */
        memset(&dev_statistics, 0, sizeof(dev_statistics));

        /* Enable stats if the user option is set. */
        if (enable_stats) {
                ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
                if (ret != 0)
                        rte_exit(EXIT_FAILURE,
                                "Cannot create print-stats thread\n");

                /* Set thread_name to aid in debugging. */
                snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
                ret = rte_thread_setname(tid, thread_name);
                if (ret != 0)
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "Cannot set print-stats name\n");
        }

        /* Launch all data cores. */
        RTE_LCORE_FOREACH_SLAVE(lcore_id)
                rte_eal_remote_launch(switch_worker, NULL, lcore_id);

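        /* Disable the mergeable RX buffers feature when it has not been requested. */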
        if (mergeable == 0)
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);

        /* Register vhost (CUSE or user) driver to handle vhost messages. */
        ret = rte_vhost_driver_register((char *)&dev_basename);
        if (ret != 0)
                rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");

        rte_vhost_driver_callback_register(&virtio_net_device_ops);

        /* Start CUSE session. */
        rte_vhost_driver_session_start();
        return 0;
}