examples/vhost: handle broadcast packet
[dpdk.git] / examples / vhost / main.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* The maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +   \
                            (num_switching_cores * MAX_PKT_BURST) +     \
                            (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
                            ((num_switching_cores + 1) * MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

#define MAX_PKT_BURST 32        /* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Do VLAN strip on host, enabled by default */
static uint32_t vlan_strip = 1;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

/* Specify the timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * This is necessary for 1G NICs such as the I350; it fixes
                 * a bug where IPv4 forwarding in the guest could not
                 * forward packets from one virtio device to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * Should be overridden separately in code with
                 * appropriate values.
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* Ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
        TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define VLAN_HLEN       4

/* Per-device statistics struct */
struct device_statistics {
        uint64_t tx_total;
        rte_atomic64_t rx_total_atomic;
        uint64_t tx;
        rte_atomic64_t rx_atomic;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        struct rte_eth_vmdq_rx_conf *def_conf =
                &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back = def_conf->enable_loop_back;
        conf.rx_mode = def_conf->rx_mode;

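        /* Map one VLAN tag to each VMDQ pool so each device gets its own VLAN. */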
        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        struct rte_eth_rxconf *rxconf;
        struct rte_eth_txconf *txconf;
        int16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /*
         * The max pool number from dev_info will be used to validate the
         * pool number specified on the command line.
         */
        rte_eth_dev_info_get(port, &dev_info);

        if (dev_info.max_rx_queues > MAX_QUEUES) {
                rte_exit(EXIT_FAILURE,
                        "please define MAX_QUEUES no less than %u in %s\n",
                        dev_info.max_rx_queues, __FILE__);
        }

        rxconf = &dev_info.default_rxconf;
        txconf = &dev_info.default_txconf;
        rxconf->rx_drop_en = 1;

        /* Enable VLAN offload */
        txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

        /* Configure the number of supported virtio devices based on VMDQ limits */
        num_devices = dev_info.max_vmdq_pools;

        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
        tx_rings = (uint16_t)rte_lcore_count();

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;
        /* NIC queues are divided into PF queues and VMDQ queues. */
        num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
        queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
        num_vmdq_queues = num_devices * queues_per_pool;
        num_queues = num_pf_queues + num_vmdq_queues;
        vmdq_queue_base = dev_info.vmdq_queue_base;
        vmdq_pool_base  = dev_info.vmdq_pool_base;
        printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
                num_pf_queues, num_devices, queues_per_pool);

        if (port >= rte_eth_dev_count())
                return -1;

        if (enable_tx_csum == 0)
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

        if (enable_tso == 0) {
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
        }

        rx_rings = (uint16_t)dev_info.max_rx_queues;
        /* Configure the Ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0)
                return retval;

        /* Set up the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                rxconf,
                                                mbuf_pool);
                if (retval < 0)
                        return retval;
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                txconf);
                if (retval < 0)
                        return retval;
        }

        /* Start the device. */
        retval = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
                return retval;
        }

        if (promiscuous)
                rte_eth_promiscuous_enable(port);

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        (unsigned)port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}

/*
 * Set the character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
        /* The basename, including its NUL terminator, must fit in dev_basename. */
        if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
                return -1;
        else
                snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

        return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* Parse the hexadecimal string. */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (pm == 0)
                return -1;

        return pm;
}

/*
 * Parse numeric options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* Parse the unsigned integer string. */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage.
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --dev-basename <name>\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on RX are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on RX. Takes effect only if retries on RX are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --dev-basename: The basename to be used for the character device\n"
        "               --tx-csum [0|1]: disable/enable TX checksum offload\n"
        "               --tso [0|1]: disable/enable TCP segmentation offload\n",
               prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"vlan-strip", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"dev-basename", required_argument, NULL, 0},
                {"tx-csum", required_argument, NULL, 0},
                {"tso", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:P",
                        long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        ret = parse_portmask(optarg);
                        if (ret == -1) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        enabled_port_mask = ret;
                        break;

                case 'P':
                        promiscuous = 1;
                        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
                                ETH_VMDQ_ACCEPT_BROADCAST |
                                ETH_VMDQ_ACCEPT_MULTICAST;
                        rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Enable/disable TX checksum offload. */
                        if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tx_csum = ret;
                        }

                        /* Enable/disable TSO offload. */
                        if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tso = ret;
                        }

                        /* Specify the delay (in microseconds) between retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        mergeable = !!ret;
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                        }
                                }
                        }

                        /* Enable/disable RX VLAN strip on host. */
                        if (!strncmp(long_option[option_index].name,
                                "vlan-strip", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for VLAN strip [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vlan_strip = !!ret;
                                        vmdq_conf_default.rxmode.hw_vlan_strip =
                                                vlan_strip;
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set character device basename. */
                        if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_basename(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        break;

                /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = (uint8_t)i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        return 0;
}

/*
 * Update the global variable NUM_PORTS and the array PORTS according to the
 * number of system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

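/*
 * Look up a vhost device by MAC address. Only devices that have completed
 * MAC learning (ready == DEVICE_RX) are considered.
 */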
static inline struct vhost_dev *__attribute__((always_inline))
find_vhost_dev(struct ether_addr *mac)
{
        struct vhost_dev *vdev;

        TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
                if (vdev->ready == DEVICE_RX &&
                    is_same_ether_addr(mac, &vdev->mac_address))
                        return vdev;
        }

        return NULL;
}

/*
 * This function learns the MAC address of the device and registers it,
 * along with a VLAN tag, with the VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct virtio_net *dev = vdev->dev;
        int i, ret;

        /* Learn the MAC address of the guest device from the packet. */
        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        if (find_vhost_dev(&pkt_hdr->s_addr)) {
                RTE_LOG(ERR, VHOST_DATA,
                        "Device (%" PRIu64 ") is using a registered MAC!\n",
                        dev->device_fh);
                return -1;
        }

        for (i = 0; i < ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* vlan_tag currently uses the device_id. */
        vdev->vlan_tag = vlan_tags[dev->device_fh];

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
                dev->device_fh,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
                                (uint32_t)dev->device_fh + vmdq_pool_base);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
                                        dev->device_fh);

        /* Enable stripping of the VLAN tag as we handle routing. */
        if (vlan_strip)
                rte_eth_dev_set_vlan_strip_on_queue(ports[0],
                        (uint16_t)vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}

/*
 * Removes the MAC address and VLAN tag from VMDQ. Ensures that nothing is
 * adding buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* Clear MAC and VLAN settings. */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < 6; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers. */
                rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}

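/*
 * Enqueue a single mbuf on the destination device's virtio RX queue and,
 * if stats are enabled, account it against both source and destination.
 */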
static inline void __attribute__((always_inline))
virtio_xmit(struct virtio_net *dst_dev, struct virtio_net *src_dev,
            struct rte_mbuf *m)
{
        uint16_t ret;

        ret = rte_vhost_enqueue_burst(dst_dev, VIRTIO_RXQ, &m, 1);
        if (enable_stats) {
                rte_atomic64_inc(&dev_statistics[dst_dev->device_fh].rx_total_atomic);
                rte_atomic64_add(&dev_statistics[dst_dev->device_fh].rx_atomic, ret);
                dev_statistics[src_dev->device_fh].tx_total++;
                dev_statistics[src_dev->device_fh].tx += ret;
        }
}

/*
 * Check if the packet destination MAC address is for a local device. If so,
 * put the packet on that device's RX queue. If not, return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct vhost_dev *dst_vdev;
        uint64_t fh;

        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
        if (!dst_vdev)
                return -1;

        fh = dst_vdev->dev->device_fh;
        if (fh == vdev->dev->device_fh) {
                RTE_LOG(DEBUG, VHOST_DATA,
                        "(%" PRIu64 ") TX: src and dst MAC is same. "
                        "Dropping packet.\n", fh);
                return 0;
        }

        RTE_LOG(DEBUG, VHOST_DATA,
                "(%" PRIu64 ") TX: MAC address is local\n", fh);

        if (unlikely(dst_vdev->remove)) {
                RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
                        "Device is marked for removal\n", fh);
                return 0;
        }

        virtio_xmit(dst_vdev->dev, vdev->dev, m);
        return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM and, if it
 * does, get the VLAN tag and length offset to use for it.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
        uint32_t *offset, uint16_t *vlan_tag)
{
        struct vhost_dev *dst_vdev;
        struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
        if (!dst_vdev)
                return 0;

        if (dst_vdev->dev->device_fh == dev->device_fh) {
                RTE_LOG(DEBUG, VHOST_DATA,
                        "(%" PRIu64 ") TX: src and dst MAC is same. "
                        " Dropping packet.\n", dst_vdev->dev->device_fh);
                return -1;
        }

        /*
         * HW VLAN strip will reduce the packet length by the length of the
         * VLAN tag, so the packet length must be restored by adding it back.
         */
        *offset  = VLAN_HLEN;
        *vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];

        RTE_LOG(DEBUG, VHOST_DATA,
                "(%" PRIu64 ") TX: pkt to local VM device id: (%" PRIu64 ") "
                "vlan tag: %u.\n",
                dev->device_fh, dst_vdev->dev->device_fh, *vlan_tag);

        return 0;
}

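/*
 * Compute the pseudo-header checksum that the NIC expects as the seed for
 * TX L4 checksum offload, for either an IPv4 or an IPv6 header.
 */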
static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
        if (ol_flags & PKT_TX_IPV4)
                return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
        else /* assume ethertype == ETHER_TYPE_IPv6 */
                return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

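/*
 * Prepare a TSO packet for the NIC: request IPv4 header checksum offload
 * where applicable and seed the TCP checksum with the pseudo-header sum.
 */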
static void virtio_tx_offload(struct rte_mbuf *m)
{
        void *l3_hdr;
        struct ipv4_hdr *ipv4_hdr = NULL;
        struct tcp_hdr *tcp_hdr = NULL;
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        l3_hdr = (char *)eth_hdr + m->l2_len;

        if (m->ol_flags & PKT_TX_IPV4) {
                ipv4_hdr = l3_hdr;
                ipv4_hdr->hdr_checksum = 0;
                m->ol_flags |= PKT_TX_IP_CKSUM;
        }

        tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
        tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

/*
 * This function routes the TX packet to the correct interface. This may be
 * a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
        struct mbuf_table *tx_q;
        struct rte_mbuf **m_table;
        unsigned len, ret, offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct virtio_net *dev = vdev->dev;
        struct ether_hdr *nh;

        nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
        if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
                struct vhost_dev *vdev2;

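                /*
                 * Broadcast frame: replicate it to every vhost device, then
                 * fall through so it is also sent out on the physical port.
                 */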
                TAILQ_FOREACH(vdev2, &vhost_dev_list, next) {
                        virtio_xmit(vdev2->dev, vdev->dev, m);
                }
                goto queue2nic;
        }

        /* Check if the destination is a local VM. */
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
                rte_pktmbuf_free(m);
                return;
        }

        if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
                        rte_pktmbuf_free(m);
                        return;
                }
        }

        RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
                "MAC address is external\n", dev->device_fh);

queue2nic:

        /* Add the packet to the port TX queue. */
        tx_q = &lcore_tx_queue[lcore_id];
        len = tx_q->len;

        nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
        if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
                /* The guest has inserted the VLAN tag. */
                struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
                uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
                if ((vm2vm_mode == VM2VM_HARDWARE) &&
                        (vh->vlan_tci != vlan_tag_be))
                        vh->vlan_tci = vlan_tag_be;
        } else {
                m->ol_flags |= PKT_TX_VLAN_PKT;

                /*
                 * Find the right seg to adjust the data len when offset is
                 * bigger than tail room size.
                 */
                if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                        if (likely(offset <= rte_pktmbuf_tailroom(m)))
                                m->data_len += offset;
                        else {
                                struct rte_mbuf *seg = m;

                                while ((seg->next != NULL) &&
                                        (offset > rte_pktmbuf_tailroom(seg)))
                                        seg = seg->next;

                                seg->data_len += offset;
                        }
                        m->pkt_len += offset;
                }

                m->vlan_tci = vlan_tag;
        }

        if (m->ol_flags & PKT_TX_TCP_SEG)
                virtio_tx_offload(m);

        tx_q->m_table[len] = m;
        len++;
        if (enable_stats) {
                dev_statistics[dev->device_fh].tx_total++;
                dev_statistics[dev->device_fh].tx++;
        }

        if (unlikely(len == MAX_PKT_BURST)) {
                m_table = (struct rte_mbuf **)tx_q->m_table;
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
                /* Free any buffers not handled by TX and update the port stats. */
                if (unlikely(ret < len)) {
                        do {
                                rte_pktmbuf_free(m_table[ret]);
                        } while (++ret < len);
                }

                len = 0;
        }

        tx_q->len = len;
        return;
}

/*
 * This function is called by each data core. It handles all RX/TX registered
 * with the core. For TX, the specific lcore linked list is used. For RX, MAC
 * addresses are compared with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
        struct virtio_net *dev = NULL;
        struct vhost_dev *vdev = NULL;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
        struct mbuf_table *tx_q;
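        /* Number of TSC cycles in the BURST_TX_DRAIN_US drain interval. */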
        const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
        uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
        unsigned ret, i;
        const uint16_t lcore_id = rte_lcore_id();
        const uint16_t num_cores = (uint16_t)rte_lcore_count();
        uint16_t rx_count = 0;
        uint16_t tx_count;
        uint32_t retry = 0;

        RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
        prev_tsc = 0;

        tx_q = &lcore_tx_queue[lcore_id];
        for (i = 0; i < num_cores; i++) {
                if (lcore_ids[i] == lcore_id) {
                        tx_q->txq_id = i;
                        break;
                }
        }

        while (1) {
                cur_tsc = rte_rdtsc();
                /*
                 * TX burst queue drain
                 */
                diff_tsc = cur_tsc - prev_tsc;
                if (unlikely(diff_tsc > drain_tsc)) {

                        if (tx_q->len) {
                                RTE_LOG(DEBUG, VHOST_DATA,
                                        "TX queue drained after timeout with burst size %u\n",
                                        tx_q->len);

                                /* TX any packets in the queue. */
                                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                                                       (struct rte_mbuf **)tx_q->m_table,
                                                       (uint16_t)tx_q->len);
                                if (unlikely(ret < tx_q->len)) {
                                        do {
                                                rte_pktmbuf_free(tx_q->m_table[ret]);
                                        } while (++ret < tx_q->len);
                                }

                                tx_q->len = 0;
                        }

                        prev_tsc = cur_tsc;

                }

                /*
                 * Inform the configuration core that we have exited the
                 * linked list and that no devices are in use if requested.
                 */
                if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
                        lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

                /*
                 * Process devices
                 */
                TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, next) {
                        uint64_t fh;

                        dev = vdev->dev;
                        fh  = dev->device_fh;

                        if (unlikely(vdev->remove)) {
                                unlink_vmdq(vdev);
                                vdev->ready = DEVICE_SAFE_REMOVE;
                                continue;
                        }

                        if (likely(vdev->ready == DEVICE_RX)) {
                                /* Handle guest RX. */
                                rx_count = rte_eth_rx_burst(ports[0],
                                        vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                                if (rx_count) {
                                        /*
                                         * If retry is enabled and the queue is full, wait
                                         * and retry to avoid packet loss. MAX_PKT_BURST
                                         * must be less than the virtio queue size here.
                                         */
                                        if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
                                                for (retry = 0; retry < burst_rx_retry_num; retry++) {
                                                        rte_delay_us(burst_rx_delay_time);
                                                        if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
                                                                break;
                                                }
                                        }
                                        ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
                                        if (enable_stats) {
                                                rte_atomic64_add(
                                                        &dev_statistics[fh].rx_total_atomic,
                                                        rx_count);
                                                rte_atomic64_add(
                                                        &dev_statistics[fh].rx_atomic,
                                                        ret_count);
                                        }
                                        while (likely(rx_count)) {
                                                rx_count--;
                                                rte_pktmbuf_free(pkts_burst[rx_count]);
                                        }

                                }
                        }

                        if (likely(!vdev->remove)) {
                                /* Handle guest TX. */
                                tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
                                /* If this is the first received packet we need to learn the MAC and set up VMDQ. */
                                if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
                                        if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
                                                while (tx_count)
                                                        rte_pktmbuf_free(pkts_burst[--tx_count]);
                                        }
                                }
                                for (i = 0; i < tx_count; ++i) {
                                        virtio_tx_route(vdev, pkts_burst[i],
                                                vlan_tags[(uint16_t)dev->device_fh]);
                                }
                        }
                }
        }

        return 0;
}

/*
 * Remove a device from the specific data core linked list and from the main
 * linked list. Synchronization occurs through the use of the lcore
 * dev_removal_flag. The device is made volatile here to avoid re-ordering of
 * dev->remove=1, which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
        struct vhost_dev *vdev;
        int lcore;

        dev->flags &= ~VIRTIO_DEV_RUNNING;

        vdev = (struct vhost_dev *)dev->priv;
        /* Set the remove flag. */
        vdev->remove = 1;
        while (vdev->ready != DEVICE_SAFE_REMOVE) {
                rte_pause();
        }

        TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, next);
        TAILQ_REMOVE(&vhost_dev_list, vdev, next);

        /* Set the dev_removal_flag on each lcore. */
        RTE_LCORE_FOREACH_SLAVE(lcore)
                lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

        /*
         * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
         * we can be sure that they can no longer access the device removed
         * from the linked lists and that the devices are no longer in use.
         */
        RTE_LCORE_FOREACH_SLAVE(lcore) {
                while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
                        rte_pause();
        }

        lcore_info[vdev->coreid].device_num--;

        RTE_LOG(INFO, VHOST_DATA,
                "(%" PRIu64 ") Device has been removed from data core\n",
                dev->device_fh);

        rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list, and then it is allocated to a specific data core.
 */
static int
new_device(struct virtio_net *dev)
{
        int lcore, core_add = 0;
        uint32_t device_num_min = num_devices;
        struct vhost_dev *vdev;

        vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
        if (vdev == NULL) {
                RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
                        dev->device_fh);
                return -1;
        }
        vdev->dev = dev;
        dev->priv = vdev;

        TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, next);
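        /* Each device gets its own VMDQ pool; use the pool's first queue for RX. */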
        vdev->vmdq_rx_q
                = dev->device_fh * queues_per_pool + vmdq_queue_base;

        /* Reset the ready flag. */
        vdev->ready = DEVICE_MAC_LEARNING;
        vdev->remove = 0;

        /* Find a suitable lcore to add the device to. */
        RTE_LCORE_FOREACH_SLAVE(lcore) {
                if (lcore_info[lcore].device_num < device_num_min) {
                        device_num_min = lcore_info[lcore].device_num;
                        core_add = lcore;
                }
        }
        vdev->coreid = core_add;

        TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, next);
        lcore_info[vdev->coreid].device_num++;

        /* Initialize device stats. */
        memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));

        /* Disable notifications. */
        rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
        rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
        dev->flags |= VIRTIO_DEV_RUNNING;

        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);

        return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
        .new_device =  new_device,
        .destroy_device = destroy_device,
};

/*
 * This thread wakes up periodically to print statistics if the user has
 * enabled them.
 */
static void
print_stats(void)
{
        struct vhost_dev *vdev;
        uint64_t tx_dropped, rx_dropped;
        uint64_t tx, tx_total, rx, rx_total;
        uint32_t device_fh;
        const char clr[] = { 27, '[', '2', 'J', '\0' };
        const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

        while (1) {
                sleep(enable_stats);

                /* Clear screen and move to top left */
                printf("%s%s", clr, top_left);

                printf("\nDevice statistics ====================================");

                TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
                        device_fh = vdev->dev->device_fh;
                        tx_total = dev_statistics[device_fh].tx_total;
                        tx = dev_statistics[device_fh].tx;
                        tx_dropped = tx_total - tx;
                        rx_total = rte_atomic64_read(
                                &dev_statistics[device_fh].rx_total_atomic);
                        rx = rte_atomic64_read(
                                &dev_statistics[device_fh].rx_atomic);
                        rx_dropped = rx_total - rx;

                        printf("\nStatistics for device %"PRIu32" ------------------------------"
                                        "\nTX total:            %"PRIu64""
                                        "\nTX dropped:          %"PRIu64""
                                        "\nTX successful:       %"PRIu64""
                                        "\nRX total:            %"PRIu64""
                                        "\nRX dropped:          %"PRIu64""
                                        "\nRX successful:       %"PRIu64"",
                                        device_fh,
                                        tx_total,
                                        tx_dropped,
                                        tx,
                                        rx_total,
                                        rx_dropped,
                                        rx);
                }
                printf("\n======================================================\n");
        }
}

/* When we receive an INT signal, unregister the vhost driver. */
static void
sigint_handler(__rte_unused int signum)
{
        /* Unregister vhost driver. */
        int ret = rte_vhost_driver_unregister((char *)&dev_basename);
        if (ret != 0)
                rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
        exit(0);
}

/*
 * Main function, does initialisation and calls the per-lcore functions. The
 * CUSE device is also registered here to handle the IOCTLs.
 */
int
main(int argc, char *argv[])
{
        unsigned lcore_id, core_id = 0;
        unsigned nb_ports, valid_num_ports;
        int ret;
        uint8_t portid;
        static pthread_t tid;
        char thread_name[RTE_MAX_THREAD_NAME_LEN];

        signal(SIGINT, sigint_handler);

        /* init EAL */
        ret = rte_eal_init(argc, argv);
        if (ret < 0)
                rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
        argc -= ret;
        argv += ret;

        /* parse app arguments */
        ret = us_vhost_parse_args(argc, argv);
        if (ret < 0)
                rte_exit(EXIT_FAILURE, "Invalid argument\n");

        for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
                TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

                if (rte_lcore_is_enabled(lcore_id))
                        lcore_ids[core_id++] = lcore_id;
        }

        if (rte_lcore_count() > RTE_MAX_LCORE)
                rte_exit(EXIT_FAILURE, "Not enough cores\n");

        /* Set the number of switching cores available. */
        num_switching_cores = rte_lcore_count() - 1;

        /* Get the number of physical ports. */
        nb_ports = rte_eth_dev_count();
        if (nb_ports > RTE_MAX_ETHPORTS)
                nb_ports = RTE_MAX_ETHPORTS;

        /*
         * Update the global var NUM_PORTS and the global array PORTS, and
         * get the value of VALID_NUM_PORTS according to the number of
         * system ports.
         */
        valid_num_ports = check_ports_num(nb_ports);

        if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        /* Create the mbuf pool. */
        mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
                NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
                0, MBUF_DATA_SIZE, rte_socket_id());
        if (mbuf_pool == NULL)
                rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

        if (vm2vm_mode == VM2VM_HARDWARE) {
                /* Enable VT loopback to let the L2 switch do the switching. */
                vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
                RTE_LOG(DEBUG, VHOST_CONFIG,
                        "Enable loop back for L2 switch in vmdq.\n");
        }

        /* Initialize all ports. */
        for (portid = 0; portid < nb_ports; portid++) {
                /* Skip ports that are not enabled. */
                if ((enabled_port_mask & (1 << portid)) == 0) {
                        RTE_LOG(INFO, VHOST_PORT,
                                "Skipping disabled port %d\n", portid);
                        continue;
                }
                if (port_init(portid) != 0)
                        rte_exit(EXIT_FAILURE,
                                "Cannot initialize network ports\n");
        }

        /* Initialize device stats */
        memset(&dev_statistics, 0, sizeof(dev_statistics));

        /* Enable stats if the user option is set. */
        if (enable_stats) {
                ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
                if (ret != 0)
                        rte_exit(EXIT_FAILURE,
                                "Cannot create print-stats thread\n");

                /* Set thread_name to aid in debugging. */
                snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
                ret = rte_thread_setname(tid, thread_name);
                if (ret != 0)
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "Cannot set print-stats name\n");
        }

        /* Launch all data cores. */
        RTE_LCORE_FOREACH_SLAVE(lcore_id)
                rte_eal_remote_launch(switch_worker, NULL, lcore_id);

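        /* Without --mergeable, do not offer mergeable RX buffers to the guest. */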
        if (mergeable == 0)
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);

        /* Register vhost (CUSE or user) driver to handle vhost messages. */
        ret = rte_vhost_driver_register((char *)&dev_basename);
        if (ret != 0)
                rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");

        rte_vhost_driver_callback_register(&virtio_net_device_ops);

        /* Start CUSE session. */
        rte_vhost_driver_session_start();
        return 0;
}