examples/vhost: remove unused macro and struct
dpdk.git / examples/vhost/main.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +           \
                            (num_switching_cores * MAX_PKT_BURST) +             \
                            (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) +  \
                            ((num_switching_cores + 1) * MBUF_CACHE_SIZE))
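
/*
 * Illustrative sizing (example values only): with MAX_QUEUES == 128,
 * RTE_TEST_RX_DESC_DEFAULT == 1024, RTE_TEST_TX_DESC_DEFAULT == 512,
 * MAX_PKT_BURST == 32, MBUF_CACHE_SIZE == 128 and num_switching_cores == 2,
 * this evaluates to 128*1024 + 2*32 + 2*512 + 3*128 == 132544 mbufs.
 */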

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

#define MAX_PKT_BURST 32        /* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Do VLAN strip on host, enabled by default */
static uint32_t vlan_strip = 1;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

/* Specify timeout (in useconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * VLAN strip is necessary for 1G NICs such as the I350;
                 * it fixes a bug where IPv4 forwarding in the guest could
                 * not forward packets from one virtio device to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * should be overridden separately in code with
                 * appropriate values
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
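
/*
 * Each vhost device's device_fh indexes this table, so device N is placed on
 * VLAN 1000 + N and, via get_eth_conf() below, mapped to VMDQ pool N.
 */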

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define VLAN_HLEN       4

/* Per-device statistics struct */
struct device_statistics {
        uint64_t tx_total;
        rte_atomic64_t rx_total_atomic;
        uint64_t tx;
        rte_atomic64_t rx_atomic;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        struct rte_eth_vmdq_rx_conf *def_conf =
                &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back = def_conf->enable_loop_back;
        conf.rx_mode = def_conf->rx_mode;

        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}
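
/*
 * For example (illustrative values only): with num_devices == 2 the loop
 * above yields pool_map[0] = {vlan_id 1000, pools 0x1} and
 * pool_map[1] = {vlan_id 1001, pools 0x2}, i.e. one VLAN and one VMDQ pool
 * per device.
 */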

/*
 * Validate the device number against the maximum pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        struct rte_eth_rxconf *rxconf;
        struct rte_eth_txconf *txconf;
        int16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
        rte_eth_dev_info_get(port, &dev_info);

        if (dev_info.max_rx_queues > MAX_QUEUES) {
                rte_exit(EXIT_FAILURE,
                        "please define MAX_QUEUES no less than %u in %s\n",
                        dev_info.max_rx_queues, __FILE__);
        }

        rxconf = &dev_info.default_rxconf;
        txconf = &dev_info.default_txconf;
        rxconf->rx_drop_en = 1;

        /* Enable vlan offload */
        txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

        /* configure the number of supported virtio devices based on VMDQ limits */
        num_devices = dev_info.max_vmdq_pools;

        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
        tx_rings = (uint16_t)rte_lcore_count();

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;
        /* NIC queues are divided into pf queues and vmdq queues. */
        num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
        queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
        num_vmdq_queues = num_devices * queues_per_pool;
        num_queues = num_pf_queues + num_vmdq_queues;
        vmdq_queue_base = dev_info.vmdq_queue_base;
        vmdq_pool_base  = dev_info.vmdq_pool_base;
        printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
                num_pf_queues, num_devices, queues_per_pool);

        if (port >= rte_eth_dev_count())
                return -1;

        if (enable_tx_csum == 0)
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

        if (enable_tso == 0) {
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
        }

        rx_rings = (uint16_t)dev_info.max_rx_queues;
        /* Configure ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0)
                return retval;

        /* Setup the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                rxconf,
                                                mbuf_pool);
                if (retval < 0)
                        return retval;
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                txconf);
                if (retval < 0)
                        return retval;
        }

        /* Start the device. */
        retval  = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
                return retval;
        }

        if (promiscuous)
                rte_eth_promiscuous_enable(port);

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        (unsigned)port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
        /* Reject names too long for the buffer (including the NUL). */
        if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
                return -1;

        snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

        return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (pm == 0)
                return -1;

        return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --dev-basename <name>\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on RX are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on RX. This only takes effect if retries on RX are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --dev-basename: The basename to be used for the character device.\n"
        "               --tx-csum [0|1]: disable/enable TX checksum offload.\n"
        "               --tso [0|1]: disable/enable TCP segmentation offload.\n",
               prgname);
}
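
/*
 * Illustrative invocation (values are examples only, not a prescribed
 * configuration):
 *
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --stats 2
 *
 * This would run the switch on four lcores, use port 0, enable software
 * VM2VM switching and print statistics every 2 seconds.
 */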

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"vlan-strip", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"dev-basename", required_argument, NULL, 0},
                {"tx-csum", required_argument, NULL, 0},
                {"tso", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:P",
                        long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        ret = parse_portmask(optarg);
                        if (ret == -1) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        enabled_port_mask = ret;
                        break;

                case 'P':
                        promiscuous = 1;
                        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
                                ETH_VMDQ_ACCEPT_BROADCAST |
                                ETH_VMDQ_ACCEPT_MULTICAST;
                        rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Enable/disable TX checksum offload. */
                        if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tx_csum = ret;
                        }

                        /* Enable/disable TSO offload. */
                        if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tso = ret;
                        }

                        /* Specify the retries delay time (in useconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the retries number on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        mergeable = !!ret;
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                        }
                                }
                        }

                        /* Enable/disable RX VLAN strip on host. */
                        if (!strncmp(long_option[option_index].name,
                                "vlan-strip", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for VLAN strip [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vlan_strip = !!ret;
                                        vmdq_conf_default.rxmode.hw_vlan_strip =
                                                vlan_strip;
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set character device basename. */
                        if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_basename(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        break;

                        /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = (uint8_t)i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        return 0;
}

/*
 * Update the global num_ports and the ports array according to the number
 * of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
        return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
}
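
/*
 * Note: this loads 8 bytes from each 6-byte ether_addr and uses MAC_ADDR_CMP
 * to keep only the low 48 bits, i.e. the MAC itself on a little-endian CPU.
 * It assumes the two bytes following each address are safely readable, which
 * holds for the packet headers and device structures used here.
 */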

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct virtio_net_data_ll *dev_ll;
        struct virtio_net *dev = vdev->dev;
        int i, ret;

        /* Learn MAC address of guest device from packet */
        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
                        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
                        return -1;
                }
                dev_ll = dev_ll->next;
        }

        for (i = 0; i < ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* vlan_tag currently uses the device_id. */
        vdev->vlan_tag = vlan_tags[dev->device_fh];

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
                dev->device_fh,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
                                (uint32_t)dev->device_fh + vmdq_pool_base);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
                                        dev->device_fh);

        /* Enable stripping of the vlan tag as we handle routing. */
        if (vlan_strip)
                rte_eth_dev_set_vlan_strip_on_queue(ports[0],
                        (uint16_t)vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* clear MAC and VLAN settings */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < 6; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers */
                rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}
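
/*
 * The drain loop above keeps polling the VMDQ RX queue until it reads zero
 * packets, so no mbufs are left queued (and leaked) for a device that will
 * no longer be serviced.
 */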

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct virtio_net_data_ll *dev_ll;
        struct ether_hdr *pkt_hdr;
        uint64_t ret = 0;
        struct virtio_net *dev = vdev->dev;
        struct virtio_net *tdev; /* destination virtio device */

        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        /* get the used devices list */
        dev_ll = ll_root_used;

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
                                          &dev_ll->vdev->mac_address)) {

                        /* Drop the packet if the TX packet is destined for the TX device. */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
                                        "Source and destination MAC addresses are the same. "
                                        "Dropping packet.\n",
                                        dev->device_fh);
                                return 0;
                        }
                        tdev = dev_ll->vdev->dev;

                        RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
                                "MAC address is local\n", tdev->device_fh);

                        if (unlikely(dev_ll->vdev->remove)) {
                                /* drop the packet if the device is marked for removal */
                                RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
                                        "Device is marked for removal\n", tdev->device_fh);
                        } else {
                                /* send the packet to the local virtio device */
                                ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
                                if (enable_stats) {
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_total_atomic,
                                        1);
                                        rte_atomic64_add(
                                        &dev_statistics[tdev->device_fh].rx_atomic,
                                        ret);
                                        dev_statistics[dev->device_fh].tx_total++;
                                        dev_statistics[dev->device_fh].tx += ret;
                                }
                        }

                        return 0;
                }
                dev_ll = dev_ll->next;
        }

        return -1;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * return its vlan tag and the length offset to apply.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
        uint32_t *offset, uint16_t *vlan_tag)
{
        struct virtio_net_data_ll *dev_ll = ll_root_used;
        struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        while (dev_ll != NULL) {
                if ((dev_ll->vdev->ready == DEVICE_RX)
                        && ether_addr_cmp(&(pkt_hdr->d_addr),
                                &dev_ll->vdev->mac_address)) {
                        /*
                         * Drop the packet if the TX packet is
                         * destined for the TX device.
                         */
                        if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                                RTE_LOG(DEBUG, VHOST_DATA,
                                "(%"PRIu64") TX: Source and destination"
                                " MAC addresses are the same. Dropping "
                                "packet.\n",
                                dev_ll->vdev->dev->device_fh);
                                return -1;
                        }

                        /*
                         * HW VLAN strip will have reduced the packet length
                         * by the size of the VLAN tag, so the length must be
                         * restored by adding the tag size back.
                         */
                        *offset = VLAN_HLEN;
                        *vlan_tag = (uint16_t)
                                vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

                        RTE_LOG(DEBUG, VHOST_DATA,
                        "(%"PRIu64") TX: pkt to local VM device id:"
                        "(%"PRIu64") vlan tag: %d.\n",
                        dev->device_fh, dev_ll->vdev->dev->device_fh,
                        (int)*vlan_tag);

                        break;
                }
                dev_ll = dev_ll->next;
        }
        return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
        if (ol_flags & PKT_TX_IPV4)
                return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
        else /* assume ethertype == ETHER_TYPE_IPv6 */
                return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
        void *l3_hdr;
        struct ipv4_hdr *ipv4_hdr = NULL;
        struct tcp_hdr *tcp_hdr = NULL;
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        l3_hdr = (char *)eth_hdr + m->l2_len;

        if (m->ol_flags & PKT_TX_IPV4) {
                ipv4_hdr = l3_hdr;
                ipv4_hdr->hdr_checksum = 0;
                m->ol_flags |= PKT_TX_IP_CKSUM;
        }

        tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
        tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}
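
/*
 * Note: virtio_tx_offload() assumes m->l2_len and m->l3_len were populated
 * (from the virtio-net header) and that the packet is TCP. Seeding the TCP
 * checksum field with the pseudo-header sum is the conventional contract for
 * NIC TX checksum/TSO offload.
 */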

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
        struct mbuf_table *tx_q;
        struct rte_mbuf **m_table;
        unsigned len, ret, offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct virtio_net *dev = vdev->dev;
        struct ether_hdr *nh;

        /* check if destination is local VM */
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
                rte_pktmbuf_free(m);
                return;
        }

        if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
                        rte_pktmbuf_free(m);
                        return;
                }
        }

        RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
                "MAC address is external\n", dev->device_fh);

        /* Add packet to the port tx queue */
        tx_q = &lcore_tx_queue[lcore_id];
        len = tx_q->len;

        nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
        if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
                /* Guest has inserted the vlan tag. */
                struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
                uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
                if ((vm2vm_mode == VM2VM_HARDWARE) &&
                        (vh->vlan_tci != vlan_tag_be))
                        vh->vlan_tci = vlan_tag_be;
        } else {
                m->ol_flags |= PKT_TX_VLAN_PKT;

                /*
                 * Find the right seg to adjust the data len when offset is
                 * bigger than tail room size.
                 */
                if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                        if (likely(offset <= rte_pktmbuf_tailroom(m)))
                                m->data_len += offset;
                        else {
                                struct rte_mbuf *seg = m;

                                while ((seg->next != NULL) &&
                                        (offset > rte_pktmbuf_tailroom(seg)))
                                        seg = seg->next;

                                seg->data_len += offset;
                        }
                        m->pkt_len += offset;
                }

                m->vlan_tci = vlan_tag;
        }

        if (m->ol_flags & PKT_TX_TCP_SEG)
                virtio_tx_offload(m);

        tx_q->m_table[len] = m;
        len++;
        if (enable_stats) {
                dev_statistics[dev->device_fh].tx_total++;
                dev_statistics[dev->device_fh].tx++;
        }

        if (unlikely(len == MAX_PKT_BURST)) {
                m_table = (struct rte_mbuf **)tx_q->m_table;
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
                /* Free any buffers not handled by TX and update the port stats. */
                if (unlikely(ret < len)) {
                        do {
                                rte_pktmbuf_free(m_table[ret]);
                        } while (++ret < len);
                }

                len = 0;
        }

        tx_q->len = len;
        return;
}
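
/*
 * TX packets are buffered per lcore and only pushed to the NIC once
 * MAX_PKT_BURST of them have accumulated; anything left over is flushed by
 * the periodic drain at the top of switch_worker() below.
 */
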
/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
        struct virtio_net *dev = NULL;
        struct vhost_dev *vdev = NULL;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
        struct virtio_net_data_ll *dev_ll;
        struct mbuf_table *tx_q;
        volatile struct lcore_ll_info *lcore_ll;
        const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
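        /*
         * drain_tsc converts BURST_TX_DRAIN_US into TSC ticks: ticks per
         * microsecond (tsc_hz / US_PER_S, rounded up) times the drain
         * interval; e.g. at 2.4 GHz this is 2400 * 100 = 240000 ticks.
         */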
        uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
        unsigned ret, i;
        const uint16_t lcore_id = rte_lcore_id();
        const uint16_t num_cores = (uint16_t)rte_lcore_count();
        uint16_t rx_count = 0;
        uint16_t tx_count;
        uint32_t retry = 0;

        RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
        lcore_ll = lcore_info[lcore_id].lcore_ll;
        prev_tsc = 0;

        tx_q = &lcore_tx_queue[lcore_id];
        for (i = 0; i < num_cores; i++) {
                if (lcore_ids[i] == lcore_id) {
                        tx_q->txq_id = i;
                        break;
                }
        }

        while (1) {
                cur_tsc = rte_rdtsc();
                /*
                 * TX burst queue drain
                 */
                diff_tsc = cur_tsc - prev_tsc;
                if (unlikely(diff_tsc > drain_tsc)) {

                        if (tx_q->len) {
                                RTE_LOG(DEBUG, VHOST_DATA,
                                        "TX queue drained after timeout with burst size %u\n",
                                        tx_q->len);

                                /* Tx any packets in the queue */
                                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                                                       (struct rte_mbuf **)tx_q->m_table,
                                                       (uint16_t)tx_q->len);
                                if (unlikely(ret < tx_q->len)) {
                                        do {
                                                rte_pktmbuf_free(tx_q->m_table[ret]);
                                        } while (++ret < tx_q->len);
                                }

                                tx_q->len = 0;
                        }

                        prev_tsc = cur_tsc;

                }

                rte_prefetch0(lcore_ll->ll_root_used);
                /*
                 * Inform the configuration core that we have exited the linked list and that no devices are
                 * in use if requested.
                 */
                if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
                        lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

                /*
                 * Process devices
                 */
                dev_ll = lcore_ll->ll_root_used;

                while (dev_ll != NULL) {
                        /* get virtio device ID */
                        vdev = dev_ll->vdev;
                        dev = vdev->dev;

                        if (unlikely(vdev->remove)) {
                                dev_ll = dev_ll->next;
                                unlink_vmdq(vdev);
                                vdev->ready = DEVICE_SAFE_REMOVE;
                                continue;
                        }
                        if (likely(vdev->ready == DEVICE_RX)) {
                                /* Handle guest RX */
                                rx_count = rte_eth_rx_burst(ports[0],
                                        vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                                if (rx_count) {
                                        /*
                                         * If retry is enabled and the queue is full, wait
                                         * and retry to avoid packet loss. MAX_PKT_BURST
                                         * must be less than the virtio queue size here.
                                         */
                                        if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
                                                for (retry = 0; retry < burst_rx_retry_num; retry++) {
                                                        rte_delay_us(burst_rx_delay_time);
                                                        if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
                                                                break;
                                                }
                                        }
                                        ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
                                        if (enable_stats) {
                                                rte_atomic64_add(
                                                &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
                                                rx_count);
                                                rte_atomic64_add(
                                                &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
                                        }
                                        while (likely(rx_count)) {
                                                rx_count--;
                                                rte_pktmbuf_free(pkts_burst[rx_count]);
                                        }

                                }
                        }

                        if (likely(!vdev->remove)) {
                                /* Handle guest TX */
                                tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
                                /* If this is the first received packet we need to learn the MAC and setup VMDQ */
                                if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
                                        if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
                                                while (tx_count)
                                                        rte_pktmbuf_free(pkts_burst[--tx_count]);
                                        }
                                }
                                for (i = 0; i < tx_count; ++i) {
                                        virtio_tx_route(vdev, pkts_burst[i],
                                                vlan_tags[(uint16_t)dev->device_fh]);
                                }
                        }

                        /* move to the next device in the list */
                        dev_ll = dev_ll->next;
                }
        }

        return 0;
}

/*
 * Add an entry to a used linked list. A free entry must first be found
 * in the free linked list using get_data_ll_free_entry();
 */
static void
add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
        struct virtio_net_data_ll *ll_dev)
{
        struct virtio_net_data_ll *ll = *ll_root_addr;

        /* Set next as NULL and use a compiler barrier to avoid reordering. */
        ll_dev->next = NULL;
        rte_compiler_barrier();

        /* If ll == NULL then this is the first device. */
        if (ll) {
                /* Increment to the tail of the linked list. */
                while (ll->next != NULL)
                        ll = ll->next;

                ll->next = ll_dev;
        } else {
                *ll_root_addr = ll_dev;
        }
}
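
/*
 * The compiler barrier above orders the ll_dev->next = NULL store before the
 * store that links the entry in, so a data core walking the list concurrently
 * never sees a newly linked tail entry with a stale next pointer.
 */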

/*
 * Remove an entry from a used linked list. The entry must then be added to
 * the free linked list using put_data_ll_free_entry().
 */
static void
rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
        struct virtio_net_data_ll *ll_dev,
        struct virtio_net_data_ll *ll_dev_last)
{
        struct virtio_net_data_ll *ll = *ll_root_addr;

        if (unlikely((ll == NULL) || (ll_dev == NULL)))
                return;

        if (ll_dev == ll)
                *ll_root_addr = ll_dev->next;
        else
                if (likely(ll_dev_last != NULL))
                        ll_dev_last->next = ll_dev->next;
                else
                        RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
}

/*
 * Find and return an entry from the free linked list.
 */
static struct virtio_net_data_ll *
get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
{
        struct virtio_net_data_ll *ll_free = *ll_root_addr;
        struct virtio_net_data_ll *ll_dev;

        if (ll_free == NULL)
                return NULL;

        ll_dev = ll_free;
        *ll_root_addr = ll_free->next;

        return ll_dev;
}

/*
 * Place an entry back on to the free linked list.
 */
static void
put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
        struct virtio_net_data_ll *ll_dev)
{
        struct virtio_net_data_ll *ll_free = *ll_root_addr;

        if (ll_dev == NULL)
                return;

        ll_dev->next = ll_free;
        *ll_root_addr = ll_dev;
}

/*
 * Creates a linked list of a given size.
 */
static struct virtio_net_data_ll *
alloc_data_ll(uint32_t size)
{
        struct virtio_net_data_ll *ll_new;
        uint32_t i;

        /* Malloc and then chain the linked list. */
        ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
        if (ll_new == NULL) {
                RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
                return NULL;
        }

        for (i = 0; i < size - 1; i++) {
                ll_new[i].vdev = NULL;
                ll_new[i].next = &ll_new[i+1];
        }
        ll_new[i].next = NULL;

        return ll_new;
}
1305
1306 /*
1307  * Create the main linked list along with each individual cores linked list. A used and a free list
1308  * are created to manage entries.
1309  */
1310 static int
1311 init_data_ll (void)
1312 {
1313         int lcore;
1314
1315         RTE_LCORE_FOREACH_SLAVE(lcore) {
1316                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
1317                 if (lcore_info[lcore].lcore_ll == NULL) {
1318                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
1319                         return -1;
1320                 }
1321
1322                 lcore_info[lcore].lcore_ll->device_num = 0;
1323                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1324                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
1325                 if (num_devices % num_switching_cores)
1326                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
1327                 else
1328                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
1329         }
1330
1331         /* Allocate devices up to a maximum of MAX_DEVICES. */
1332         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
1333
1334         return 0;
1335 }
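/*
 * Sizing example (illustrative figures): with num_devices = 6 and
 * num_switching_cores = 4, 6 % 4 != 0, so each lcore free list gets
 * 6 / 4 + 1 = 2 entries. The "+ 1" turns the division into a ceiling,
 * which is enough for the balanced placement done in new_device() below.
 */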
1336
1337 /*
1338  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
1339  * occurs through the use of the lcore dev_removal_flag. The device is passed as volatile to prevent
1340  * re-ordering of the dev->remove = 1 store, which could otherwise turn the rte_pause() wait into an infinite loop.
1341  */
1342 static void
1343 destroy_device(volatile struct virtio_net *dev)
1344 {
1345         struct virtio_net_data_ll *ll_lcore_dev_cur;
1346         struct virtio_net_data_ll *ll_main_dev_cur;
1347         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
1348         struct virtio_net_data_ll *ll_main_dev_last = NULL;
1349         struct vhost_dev *vdev;
1350         int lcore;
1351
1352         dev->flags &= ~VIRTIO_DEV_RUNNING;
1353
1354         vdev = (struct vhost_dev *)dev->priv;
1355         /* Set the remove flag. */
1356         vdev->remove = 1;
1357         while (vdev->ready != DEVICE_SAFE_REMOVE) {
1358                 rte_pause();
1359         }
1360
1361         /* Search for entry to be removed from lcore ll */
1362         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
1363         while (ll_lcore_dev_cur != NULL) {
1364                 if (ll_lcore_dev_cur->vdev == vdev) {
1365                         break;
1366                 } else {
1367                         ll_lcore_dev_last = ll_lcore_dev_cur;
1368                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
1369                 }
1370         }
1371
1372         if (ll_lcore_dev_cur == NULL) {
1373                 RTE_LOG(ERR, VHOST_CONFIG,
1374                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
1375                         dev->device_fh);
1376                 return;
1377         }
1378
1379         /* Search for entry to be removed from main ll */
1380         ll_main_dev_cur = ll_root_used;
1381         ll_main_dev_last = NULL;
1382         while (ll_main_dev_cur != NULL) {
1383                 if (ll_main_dev_cur->vdev == vdev) {
1384                         break;
1385                 } else {
1386                         ll_main_dev_last = ll_main_dev_cur;
1387                         ll_main_dev_cur = ll_main_dev_cur->next;
1388                 }
1389         }
1390
1391         /* Remove entries from the lcore and main ll. */
1392         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
1393         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
1394
1395         /* Set the dev_removal_flag on each lcore. */
1396         RTE_LCORE_FOREACH_SLAVE(lcore) {
1397                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
1398         }
1399
1400         /*
1401          * Once each core has set its dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
1402          * it can no longer access the device removed from the linked lists and that the device
1403          * is no longer in use.
1404          */
1405         RTE_LCORE_FOREACH_SLAVE(lcore) {
1406                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
1407                         rte_pause();
1408                 }
1409         }
1410
1411         /* Add the entries back to the lcore and main free ll.*/
1412         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
1413         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
1414
1415         /* Decrement number of device on the lcore. */
1416         lcore_info[vdev->coreid].lcore_ll->device_num--;
1417
1418         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
1419
1420         rte_free(vdev);
1421
1422 }
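/*
 * Illustrative sketch (not part of the original source): the data-core side
 * of the removal handshake above, assuming a polling loop along the lines of
 * switch_worker(). Only the flag exchange is shown.
 */
#if 0   /* handshake sketch only, kept out of the build */
static void
example_worker_iteration(struct lcore_ll_info *lcore_ll)
{
        /*
         * Once the worker observes REQUEST_DEV_REMOVAL it must stop touching
         * any entry unlinked from ll_root_used, then publish the ack that
         * destroy_device() is spinning on.
         */
        if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
                lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

        /* ... normal per-device RX/TX processing follows ... */
}
#endif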
1423
1424 /*
1425  * A new device is added to a data core. First the device is added to the main linked list
1426  * and then allocated to a specific data core.
1427  */
1428 static int
1429 new_device(struct virtio_net *dev)
1430 {
1431         struct virtio_net_data_ll *ll_dev;
1432         int lcore, core_add = 0;
1433         uint32_t device_num_min = num_devices;
1434         struct vhost_dev *vdev;
1435
1436         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1437         if (vdev == NULL) {
1438                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
1439                         dev->device_fh);
1440                 return -1;
1441         }
1442         vdev->dev = dev;
1443         dev->priv = vdev;
1444
1445         /* Add device to main ll */
1446         ll_dev = get_data_ll_free_entry(&ll_root_free);
1447         if (ll_dev == NULL) {
1448                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
1449                         "of %d devices has been reached\n",
1450                         dev->device_fh, num_devices);
1451                 rte_free(vdev);
1452                 return -1;
1453         }
1454         ll_dev->vdev = vdev;
1455         add_data_ll_entry(&ll_root_used, ll_dev);
1456         vdev->vmdq_rx_q
1457                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
1458
1459         /* Reset the ready flag. */
1460         vdev->ready = DEVICE_MAC_LEARNING;
1461         vdev->remove = 0;
1462
1463         /* Find a suitable lcore to add the device. */
1464         RTE_LCORE_FOREACH_SLAVE(lcore) {
1465                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
1466                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
1467                         core_add = lcore;
1468                 }
1469         }
1470         /* Add device to lcore ll */
1471         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
1472         if (ll_dev == NULL) {
1473                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
1474                 vdev->ready = DEVICE_SAFE_REMOVE;
1475                 destroy_device(dev);
1476                 rte_free(vdev);
1477                 return -1;
1478         }
1479         ll_dev->vdev = vdev;
1480         vdev->coreid = core_add;
1481
1482         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
1483
1484         /* Initialize device stats */
1485         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
1486
1487         /* Disable notifications. */
1488         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
1489         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
1490         lcore_info[vdev->coreid].lcore_ll->device_num++;
1491         dev->flags |= VIRTIO_DEV_RUNNING;
1492
1493         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
1494
1495         return 0;
1496 }
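/*
 * Placement example (illustrative counts): with three slave lcores whose
 * current device_num values are {2, 0, 1}, the search above leaves core_add
 * on the core with count 0, so new devices always land on the least-loaded
 * data core first.
 */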
1497
1498 /*
1499  * These callbacks allow devices to be added to the data core when configuration
1500  * has fully completed.
1501  */
1502 static const struct virtio_net_device_ops virtio_net_device_ops =
1503 {
1504         .new_device =  new_device,
1505         .destroy_device = destroy_device,
1506 };
1507
1508 /*
1509  * This is a thread that will wake up after a set period to print stats if the user has
1510  * enabled them.
1511  */
1512 static void
1513 print_stats(void)
1514 {
1515         struct virtio_net_data_ll *dev_ll;
1516         uint64_t tx_dropped, rx_dropped;
1517         uint64_t tx, tx_total, rx, rx_total;
1518         uint32_t device_fh;
1519         const char clr[] = { 27, '[', '2', 'J', '\0' };
1520         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
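        /*
         * 27 is the ASCII ESC character: "\033[2J" clears the screen and
         * "\033[1;1H" moves the cursor to row 1, column 1 (standard
         * VT100/ANSI escape sequences).
         */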
1521
1522         while (1) {
1523                 sleep(enable_stats);
1524
1525                 /* Clear screen and move to top left */
1526                 printf("%s%s", clr, top_left);
1527
1528                 printf("\nDevice statistics ====================================");
1529
1530                 dev_ll = ll_root_used;
1531                 while (dev_ll != NULL) {
1532                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
1533                         tx_total = dev_statistics[device_fh].tx_total;
1534                         tx = dev_statistics[device_fh].tx;
1535                         tx_dropped = tx_total - tx;
1536                         rx_total = rte_atomic64_read(
1537                                 &dev_statistics[device_fh].rx_total_atomic);
1538                         rx = rte_atomic64_read(
1539                                 &dev_statistics[device_fh].rx_atomic);
1540                         rx_dropped = rx_total - rx;
1541
1542                         printf("\nStatistics for device %"PRIu32" ------------------------------"
1543                                         "\nTX total:            %"PRIu64""
1544                                         "\nTX dropped:          %"PRIu64""
1545                                         "\nTX successful:               %"PRIu64""
1546                                         "\nRX total:            %"PRIu64""
1547                                         "\nRX dropped:          %"PRIu64""
1548                                         "\nRX successful:               %"PRIu64"",
1549                                         device_fh,
1550                                         tx_total,
1551                                         tx_dropped,
1552                                         tx,
1553                                         rx_total,
1554                                         rx_dropped,
1555                                         rx);
1556
1557                         dev_ll = dev_ll->next;
1558                 }
1559                 printf("\n======================================================\n");
1560         }
1561 }
1562
1563 /* When we receive an INT signal, unregister the vhost driver */
1564 static void
1565 sigint_handler(__rte_unused int signum)
1566 {
1567         /* Unregister vhost driver. */
1568         int ret = rte_vhost_driver_unregister((char *)&dev_basename);
1569         if (ret != 0)
1570                 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
1571         exit(0);
1572 }
1573
1574 /*
1575  * Main function: performs initialisation and calls the per-lcore functions. The CUSE
1576  * device is also registered here to handle the IOCTLs.
1577  */
1578 int
1579 main(int argc, char *argv[])
1580 {
1581         unsigned lcore_id, core_id = 0;
1582         unsigned nb_ports, valid_num_ports;
1583         int ret;
1584         uint8_t portid;
1585         static pthread_t tid;
1586         char thread_name[RTE_MAX_THREAD_NAME_LEN];
1587
1588         signal(SIGINT, sigint_handler);
1589
1590         /* init EAL */
1591         ret = rte_eal_init(argc, argv);
1592         if (ret < 0)
1593                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1594         argc -= ret;
1595         argv += ret;
1596
1597         /* parse app arguments */
1598         ret = us_vhost_parse_args(argc, argv);
1599         if (ret < 0)
1600                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1601
1602         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
1603                 if (rte_lcore_is_enabled(lcore_id))
1604                         lcore_ids[core_id++] = lcore_id;
1605
1606         if (rte_lcore_count() > RTE_MAX_LCORE)
1607                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1608
1609         /* Set the number of switching cores available. */
1610         num_switching_cores = rte_lcore_count() - 1;
1611
1612         /* Get the number of physical ports. */
1613         nb_ports = rte_eth_dev_count();
1614         if (nb_ports > RTE_MAX_ETHPORTS)
1615                 nb_ports = RTE_MAX_ETHPORTS;
1616
1617         /*
1618          * Update the global variable num_ports and the global array ports[],
1619          * and get the count of valid ports according to the system port number.
1620          */
1621         valid_num_ports = check_ports_num(nb_ports);
1622
1623         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1624                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1625                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1626                 return -1;
1627         }
1628
1629         /* Create the mbuf pool. */
1630         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
1631                 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
1632                 0, MBUF_DATA_SIZE, rte_socket_id());
1633         if (mbuf_pool == NULL)
1634                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1635
1636         if (vm2vm_mode == VM2VM_HARDWARE) {
1637                 /* Enable VT loop back to let L2 switch to do it. */
1638                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1639                 RTE_LOG(DEBUG, VHOST_CONFIG,
1640                         "Enable loop back for L2 switch in vmdq.\n");
1641         }
1642
1643         /* initialize all ports */
1644         for (portid = 0; portid < nb_ports; portid++) {
1645                 /* skip ports that are not enabled */
1646                 if ((enabled_port_mask & (1 << portid)) == 0) {
1647                         RTE_LOG(INFO, VHOST_PORT,
1648                                 "Skipping disabled port %d\n", portid);
1649                         continue;
1650                 }
1651                 if (port_init(portid) != 0)
1652                         rte_exit(EXIT_FAILURE,
1653                                 "Cannot initialize network ports\n");
1654         }
1655
1656         /* Initialise all linked lists. */
1657         if (init_data_ll() == -1)
1658                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
1659
1660         /* Initialize device stats */
1661         memset(&dev_statistics, 0, sizeof(dev_statistics));
1662
1663         /* Enable stats if the user option is set. */
1664         if (enable_stats) {
1665                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1666                 if (ret != 0)
1667                         rte_exit(EXIT_FAILURE,
1668                                 "Cannot create print-stats thread\n");
1669
1670                 /* Set thread_name for aid in debugging.  */
1671                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1672                 ret = rte_thread_setname(tid, thread_name);
1673                 if (ret != 0)
1674                         RTE_LOG(ERR, VHOST_CONFIG,
1675                                 "Cannot set print-stats name\n");
1676         }
1677
1678         /* Launch all data cores. */
1679         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1680                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1681
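        /*
         * When mergeable RX buffers are disabled on the command line, clear
         * the corresponding virtio feature bit so that it is not negotiated
         * with the guest.
         */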
1682         if (mergeable == 0)
1683                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
1684
1685         /* Register the vhost (CUSE or user) driver to handle vhost messages. */
1686         ret = rte_vhost_driver_register((char *)&dev_basename);
1687         if (ret != 0)
1688                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
1689
1690         rte_vhost_driver_callback_register(&virtio_net_device_ops);
1691
1692         /* Start the vhost session (CUSE or vhost-user). */
1693         rte_vhost_driver_session_start();
1694         return 0;
1695
1696 }