examples/vhost: use tailq to link vhost devices
examples/vhost/main.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port:
 * RX descriptors for every queue, plus one burst and one TX ring's
 * worth of mbufs per switching core, plus mbuf cache headroom per core.
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +	\
			    (num_switching_cores * MAX_PKT_BURST) +	\
			    (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) +\
			    ((num_switching_cores + 1) * MBUF_CACHE_SIZE))

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

#define MAX_PKT_BURST 32	/* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING	0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Do vlan strip on host, enabled by default */
static uint32_t vlan_strip = 1;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * This is necessary for 1G NICs such as the I350:
		 * it fixes a bug where IPv4 forwarding in the guest
		 * could not forward packets from one virtio device
		 * to another.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

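/*
 * Main tailq of vhost devices: entries are linked here in new_device()
 * and unlinked in destroy_device(). Each device is additionally linked
 * into the per-lcore vdev_list of the data core it is assigned to.
 */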
static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define VLAN_HLEN       4

/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

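	/* Give each device its own pool: pool i accepts VLAN tag 1000 + i. */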
	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Validate the device number according to the max pool number obtained
 * from dev_info. If the device number is invalid, print an error message
 * and return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/*
	 * The max pool number from dev_info is used to validate the pool
	 * number specified on the command line.
	 */
	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
	tx_rings = (uint16_t)rte_lcore_count();

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base  = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	if (enable_tx_csum == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

	if (enable_tso == 0) {
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
	}

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* reject names that would be truncated (>= leaves room for NUL) */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;
	else
		snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on RX are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. This only takes effect if retries on RX are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segmentation offload.\n",
	       prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"vlan-strip", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			/*
			 * parse_portmask() returns -1 on error; check the
			 * signed result before storing it in the unsigned
			 * mask, so errors are not mistaken for a valid mask.
			 */
			ret = parse_portmask(optarg);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enabled_port_mask = ret;
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tso = ret;
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable RX VLAN strip on host. */
			if (!strncmp(long_option[option_index].name,
				"vlan-strip", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for VLAN strip [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vlan_strip = !!ret;
					vmdq_conf_default.rxmode.hw_vlan_strip =
						vlan_strip;
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			break;

			/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var num_ports and array ports according to the number
 * of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
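	/*
	 * XOR the two addresses as 64-bit words, then mask off the two
	 * bytes that lie beyond the 6-byte Ethernet address.
	 */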
	return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
}

static inline struct vhost_dev *__attribute__((always_inline))
find_vhost_dev(struct ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
		if (vdev->ready == DEVICE_RX &&
		    ether_addr_cmp(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers it,
 * along with a VLAN tag, with the VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net *dev = vdev->dev;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"Device (%" PRIu64 ") is using a registered MAC!\n",
			dev->device_fh);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)dev->device_fh + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
					dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	if (vlan_strip)
		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
			(uint16_t)vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;
	struct vhost_dev *dst_vdev;
	uint64_t fh;

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	fh = dst_vdev->dev->device_fh;
	if (fh == vdev->dev->device_fh) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%" PRIu64 ") TX: src and dst MAC are the same. "
			"Dropping packet.\n", fh);
		return 0;
	}

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%" PRIu64 ") TX: MAC address is local\n", fh);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
			"Device is marked for removal\n", fh);
		return 0;
	}

	/* send the packet to the local virtio device */
	ret = rte_vhost_enqueue_burst(dst_vdev->dev, VIRTIO_RXQ, &m, 1);
	if (enable_stats) {
		rte_atomic64_inc(&dev_statistics[fh].rx_total_atomic);
		rte_atomic64_add(&dev_statistics[fh].rx_atomic, ret);
		dev_statistics[vdev->dev->device_fh].tx_total++;
		dev_statistics[vdev->dev->device_fh].tx += ret;
	}

	return 0;
}

/*
 * Check if the destination MAC of a packet is a local VM, and if so get
 * its VLAN tag and the length offset to restore.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (dst_vdev->dev->device_fh == dev->device_fh) {
		RTE_LOG(DEBUG, VHOST_DATA,
			"(%" PRIu64 ") TX: src and dst MAC are the same. "
			"Dropping packet.\n", dst_vdev->dev->device_fh);
		return -1;
	}

	/*
	 * HW vlan strip will reduce the packet length by the length of
	 * the vlan tag, so we need to restore the packet length by
	 * adding it back.
	 */
	*offset  = VLAN_HLEN;
	*vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];

	RTE_LOG(DEBUG, VHOST_DATA,
		"(%" PRIu64 ") TX: pkt to local VM device id: (%" PRIu64 ") "
		"vlan tag: %u.\n",
		dev->device_fh, dst_vdev->dev->device_fh, *vlan_tag);

	return 0;
}

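/*
 * Compute the pseudo-header checksum that hardware TX checksum/TSO
 * offload expects to find seeded in the TCP checksum field; the NIC
 * computes the rest of the checksum itself.
 */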
static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == ETHER_TYPE_IPv6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void
virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct ipv4_hdr *ipv4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;
	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

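	/* Seed the TCP checksum field with the pseudo-header sum for the NIC. */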
	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net *dev = vdev->dev;
	struct ether_hdr *nh;

	/* check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
		"MAC address is external\n", dev->device_fh);

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[len] = m;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

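	/*
	 * Flush once a full burst has accumulated; partially filled queues
	 * are drained periodically by switch_worker().
	 */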
	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/* Free any buffers not handled by TX. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}

/*
 * This function is called by each data core. It handles all RX/TX
 * registered with the core. For TX the specific lcore linked list is
 * used. For RX, MAC addresses are compared with all devices in the
 * main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct virtio_net *dev = NULL;
	struct vhost_dev *vdev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct mbuf_table *tx_q;
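	/* Number of TSC cycles in BURST_TX_DRAIN_US microseconds (rounded up). */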
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint16_t tx_count;
	uint32_t retry = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	prev_tsc = 0;

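	/* Each core owns one TX queue, indexed by its position in lcore_ids. */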
	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				RTE_LOG(DEBUG, VHOST_DATA,
					"TX queue drained after timeout with burst size %u\n",
					tx_q->len);

				/* Tx any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
						       (struct rte_mbuf **)tx_q->m_table,
						       (uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;

		}

		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, next) {
			uint64_t fh;

			dev = vdev->dev;
			fh  = dev->device_fh;

			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					/*
					 * If retry is enabled and the queue is full then
					 * we wait and retry to avoid packet loss. Note
					 * that MAX_PKT_BURST must be less than the virtio
					 * queue size.
					 */
					if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
						for (retry = 0; retry < burst_rx_retry_num; retry++) {
							rte_delay_us(burst_rx_delay_time);
							if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
								break;
						}
					}
					ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[fh].rx_atomic,
							ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}

				}
			}

			if (likely(!vdev->remove)) {
				/* Handle guest TX */
				tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
				/* If this is the first received packet we need to learn the MAC and setup VMDQ */
				if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
					if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
						while (tx_count)
							rte_pktmbuf_free(pkts_burst[--tx_count]);
					}
				}
				for (i = 0; i < tx_count; ++i) {
					virtio_tx_route(vdev, pkts_burst[i],
						vlan_tags[(uint16_t)dev->device_fh]);
				}
			}
		}
	}

	return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
	struct vhost_dev *vdev;
	int lcore;

	dev->flags &= ~VIRTIO_DEV_RUNNING;

	vdev = (struct vhost_dev *)dev->priv;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, next);
	TAILQ_REMOVE(&vhost_dev_list, vdev, next);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%" PRIu64 ") Device has been removed from data core\n",
		dev->device_fh);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(struct virtio_net *dev)
{
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
			dev->device_fh);
		return -1;
	}
	vdev->dev = dev;
	dev->priv = vdev;

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, next);
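	/*
	 * Each device owns one VMDQ pool; its RX traffic arrives on the
	 * first queue of that pool, offset from vmdq_queue_base.
	 */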
	vdev->vmdq_rx_q
		= dev->device_fh * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device: pick the least loaded one. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, next);
	lcore_info[vdev->coreid].device_num++;

	/* Initialize device stats */
	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
	dev->flags |= VIRTIO_DEV_RUNNING;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
};

/*
 * This thread wakes up periodically to print statistics if the user has
 * enabled them.
 */
static void
print_stats(void)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	uint32_t device_fh;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s", clr, top_left);

		printf("\nDevice statistics ====================================");

		TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
			device_fh = vdev->dev->device_fh;
			tx_total = dev_statistics[device_fh].tx_total;
			tx = dev_statistics[device_fh].tx;
			tx_dropped = tx_total - tx;
			rx_total = rte_atomic64_read(
				&dev_statistics[device_fh].rx_total_atomic);
			rx = rte_atomic64_read(
				&dev_statistics[device_fh].rx_atomic);
			rx_dropped = rx_total - rx;

			printf("\nStatistics for device %"PRIu32" ------------------------------"
					"\nTX total:		%"PRIu64""
					"\nTX dropped:		%"PRIu64""
					"\nTX successful:		%"PRIu64""
					"\nRX total:		%"PRIu64""
					"\nRX dropped:		%"PRIu64""
					"\nRX successful:		%"PRIu64"",
					device_fh,
					tx_total,
					tx_dropped,
					tx,
					rx_total,
					rx_dropped,
					rx);
		}
		printf("\n======================================================\n");
	}
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	int ret = rte_vhost_driver_unregister((char *)&dev_basename);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
	exit(0);
}

/*
 * Main function, does initialisation and calls the per-lcore functions. The CUSE
 * device is also registered here to handle the IOCTLs.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret;
	uint8_t portid;
	static pthread_t tid;
	char thread_name[RTE_MAX_THREAD_NAME_LEN];

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Set the number of switching cores available. */
	num_switching_cores = rte_lcore_count() - 1;

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();
	if (nb_ports > RTE_MAX_ETHPORTS)
		nb_ports = RTE_MAX_ETHPORTS;

	/*
	 * Update the global vars num_ports and ports according to the
	 * number of system ports, and get the number of valid ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/* Create the mbuf pool. */
	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
		NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
		0, MBUF_DATA_SIZE, rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let the L2 switch do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	for (portid = 0; portid < nb_ports; portid++) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Initialize device stats */
	memset(&dev_statistics, 0, sizeof(dev_statistics));

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");

		/* Set thread_name for aid in debugging. */
		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
		ret = rte_thread_setname(tid, thread_name);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Cannot set print-stats name\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (mergeable == 0)
		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);

	/* Register vhost (cuse or user) driver to handle vhost messages. */
	ret = rte_vhost_driver_register((char *)&dev_basename);
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");

	rte_vhost_driver_callback_register(&virtio_net_device_ops);

	/* Start CUSE session. */
	rte_vhost_driver_session_start();
	return 0;
}