examples/vhost: check argument length
[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "ioat.h"
29 #include "main.h"
30
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37
38 #define MBUF_CACHE_SIZE 128
39 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
40
41 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
42
43 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
45
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX                       1
51 #define DEVICE_SAFE_REMOVE      2
52
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56
57 #define INVALID_PORT_ID 0xFF
58
59 /* Maximum long option length for option parsing. */
60 #define MAX_LONG_OPT_SZ 64
61
62 /* mask of enabled ports */
63 static uint32_t enabled_port_mask = 0;
64
65 /* Promiscuous mode */
66 static uint32_t promiscuous;
67
68 /* Number of devices/queues to support */
69 static uint32_t num_queues = 0;
70 static uint32_t num_devices;
71
72 static struct rte_mempool *mbuf_pool;
73 static int mergeable;
74
75 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
76 typedef enum {
77         VM2VM_DISABLED = 0,
78         VM2VM_SOFTWARE = 1,
79         VM2VM_HARDWARE = 2,
80         VM2VM_LAST
81 } vm2vm_type;
82 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
83
84 /* Enable stats. */
85 static uint32_t enable_stats = 0;
86 /* Enable retries on RX. */
87 static uint32_t enable_retry = 1;
88
89 /* Disable TX checksum offload */
90 static uint32_t enable_tx_csum;
91
92 /* Disable TSO offload */
93 static uint32_t enable_tso;
94
95 static int client_mode;
96
97 static int builtin_net_driver;
98
99 static int async_vhost_driver;
100
101 static char dma_type[MAX_LONG_OPT_SZ];
102
103 /* Specify timeout (in microseconds) between retries on RX. */
104 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
105 /* Specify the number of retries on RX. */
106 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
107
108 /* Socket file paths. Can be set by user */
109 static char *socket_files;
110 static int nb_sockets;
111
112 /* Empty VMDQ configuration structure. Filled in programmatically. */
113 static struct rte_eth_conf vmdq_conf_default = {
114         .rxmode = {
115                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
116                 .split_hdr_size = 0,
117                 /*
118                  * VLAN strip is necessary for 1G NICs such as the I350;
119                  * without it, IPv4 forwarding in the guest cannot forward
120                  * packets from one virtio dev to another virtio dev.
121                  */
122                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
123         },
124
125         .txmode = {
126                 .mq_mode = ETH_MQ_TX_NONE,
127                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
128                              DEV_TX_OFFLOAD_TCP_CKSUM |
129                              DEV_TX_OFFLOAD_VLAN_INSERT |
130                              DEV_TX_OFFLOAD_MULTI_SEGS |
131                              DEV_TX_OFFLOAD_TCP_TSO),
132         },
133         .rx_adv_conf = {
134                 /*
135                  * should be overridden separately in code with
136                  * appropriate values
137                  */
138                 .vmdq_rx_conf = {
139                         .nb_queue_pools = ETH_8_POOLS,
140                         .enable_default_pool = 0,
141                         .default_pool = 0,
142                         .nb_pool_maps = 0,
143                         .pool_map = {{0, 0},},
144                 },
145         },
146 };
147
148
149 static unsigned lcore_ids[RTE_MAX_LCORE];
150 static uint16_t ports[RTE_MAX_ETHPORTS];
151 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
152 static uint16_t num_pf_queues, num_vmdq_queues;
153 static uint16_t vmdq_pool_base, vmdq_queue_base;
154 static uint16_t queues_per_pool;
155
156 const uint16_t vlan_tags[] = {
157         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
158         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
159         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
160         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
161         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
162         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
163         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
164         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
165 };
166
167 /* ethernet addresses of ports */
168 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
169
170 static struct vhost_dev_tailq_list vhost_dev_list =
171         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
172
173 static struct lcore_info lcore_info[RTE_MAX_LCORE];
174
175 /* Used for queueing bursts of TX packets. */
176 struct mbuf_table {
177         unsigned len;
178         unsigned txq_id;
179         struct rte_mbuf *m_table[MAX_PKT_BURST];
180 };
181
182 /* TX queue for each data core. */
183 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
184
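/*
 * TX drain period expressed in TSC cycles: the cycles-per-microsecond
 * figure (rounded up) multiplied by BURST_TX_DRAIN_US.
 */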
185 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
186                                  / US_PER_S * BURST_TX_DRAIN_US)
187 #define VLAN_HLEN       4
188
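/*
 * Dispatch the --dmas argument to the DMA backend selected with
 * --dma-type; "ioat" is the only backend wired up here.
 */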
189 static inline int
190 open_dma(const char *value)
191 {
192         if (strncmp(dma_type, "ioat", 4) == 0)
193                 return open_ioat(value);
194
195         return -1;
196 }
197
198 /*
199  * Builds up the correct configuration for VMDQ VLAN pool map
200  * according to the pool & queue limits.
201  */
202 static inline int
203 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
204 {
205         struct rte_eth_vmdq_rx_conf conf;
206         struct rte_eth_vmdq_rx_conf *def_conf =
207                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
208         unsigned i;
209
210         memset(&conf, 0, sizeof(conf));
211         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
212         conf.nb_pool_maps = num_devices;
213         conf.enable_loop_back = def_conf->enable_loop_back;
214         conf.rx_mode = def_conf->rx_mode;
215
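        /* Give each VMDQ pool its own VLAN tag; pool i is selected by bit i of the pool mask. */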
216         for (i = 0; i < conf.nb_pool_maps; i++) {
217                 conf.pool_map[i].vlan_id = vlan_tags[i];
218                 conf.pool_map[i].pools = (1UL << i);
219         }
220
221         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
222         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
223                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
224         return 0;
225 }
226
227 /*
228  * Initialises a given port using global settings and with the rx buffers
229  * coming from the mbuf_pool passed as parameter
230  */
231 static inline int
232 port_init(uint16_t port)
233 {
234         struct rte_eth_dev_info dev_info;
235         struct rte_eth_conf port_conf;
236         struct rte_eth_rxconf *rxconf;
237         struct rte_eth_txconf *txconf;
238         int16_t rx_rings, tx_rings;
239         uint16_t rx_ring_size, tx_ring_size;
240         int retval;
241         uint16_t q;
242
243         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
244         retval = rte_eth_dev_info_get(port, &dev_info);
245         if (retval != 0) {
246                 RTE_LOG(ERR, VHOST_PORT,
247                         "Error during getting device (port %u) info: %s\n",
248                         port, strerror(-retval));
249
250                 return retval;
251         }
252
253         rxconf = &dev_info.default_rxconf;
254         txconf = &dev_info.default_txconf;
255         rxconf->rx_drop_en = 1;
256
257         /* Configure the number of supported virtio devices based on VMDQ limits. */
258         num_devices = dev_info.max_vmdq_pools;
259
260         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
261         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
262
263         tx_rings = (uint16_t)rte_lcore_count();
264
265         /* Get port configuration. */
266         retval = get_eth_conf(&port_conf, num_devices);
267         if (retval < 0)
268                 return retval;
269         /* NIC queues are divided into pf queues and vmdq queues.  */
270         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
271         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
272         num_vmdq_queues = num_devices * queues_per_pool;
273         num_queues = num_pf_queues + num_vmdq_queues;
274         vmdq_queue_base = dev_info.vmdq_queue_base;
275         vmdq_pool_base  = dev_info.vmdq_pool_base;
276         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
277                 num_pf_queues, num_devices, queues_per_pool);
278
279         if (!rte_eth_dev_is_valid_port(port))
280                 return -1;
281
282         rx_rings = (uint16_t)dev_info.max_rx_queues;
283         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
284                 port_conf.txmode.offloads |=
285                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
286         /* Configure ethernet device. */
287         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
288         if (retval != 0) {
289                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
290                         port, strerror(-retval));
291                 return retval;
292         }
293
294         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
295                 &tx_ring_size);
296         if (retval != 0) {
297                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
298                         "for port %u: %s.\n", port, strerror(-retval));
299                 return retval;
300         }
301         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
302                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
303                         "for Rx queues on port %u.\n", port);
304                 return -1;
305         }
306
307         /* Setup the queues. */
308         rxconf->offloads = port_conf.rxmode.offloads;
309         for (q = 0; q < rx_rings; q++) {
310                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
311                                                 rte_eth_dev_socket_id(port),
312                                                 rxconf,
313                                                 mbuf_pool);
314                 if (retval < 0) {
315                         RTE_LOG(ERR, VHOST_PORT,
316                                 "Failed to setup rx queue %u of port %u: %s.\n",
317                                 q, port, strerror(-retval));
318                         return retval;
319                 }
320         }
321         txconf->offloads = port_conf.txmode.offloads;
322         for (q = 0; q < tx_rings; q++) {
323                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
324                                                 rte_eth_dev_socket_id(port),
325                                                 txconf);
326                 if (retval < 0) {
327                         RTE_LOG(ERR, VHOST_PORT,
328                                 "Failed to setup tx queue %u of port %u: %s.\n",
329                                 q, port, strerror(-retval));
330                         return retval;
331                 }
332         }
333
334         /* Start the device. */
335         retval = rte_eth_dev_start(port);
336         if (retval < 0) {
337                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
338                         port, strerror(-retval));
339                 return retval;
340         }
341
342         if (promiscuous) {
343                 retval = rte_eth_promiscuous_enable(port);
344                 if (retval != 0) {
345                         RTE_LOG(ERR, VHOST_PORT,
346                                 "Failed to enable promiscuous mode on port %u: %s\n",
347                                 port, rte_strerror(-retval));
348                         return retval;
349                 }
350         }
351
352         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
353         if (retval < 0) {
354                 RTE_LOG(ERR, VHOST_PORT,
355                         "Failed to get MAC address on port %u: %s\n",
356                         port, rte_strerror(-retval));
357                 return retval;
358         }
359
360         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
361         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
362                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
363                         port,
364                         vmdq_ports_eth_addr[port].addr_bytes[0],
365                         vmdq_ports_eth_addr[port].addr_bytes[1],
366                         vmdq_ports_eth_addr[port].addr_bytes[2],
367                         vmdq_ports_eth_addr[port].addr_bytes[3],
368                         vmdq_ports_eth_addr[port].addr_bytes[4],
369                         vmdq_ports_eth_addr[port].addr_bytes[5]);
370
371         return 0;
372 }
373
374 /*
375  * Append a socket file path to the socket_files array (nb_sockets entries of PATH_MAX bytes each).
376  */
377 static int
378 us_vhost_parse_socket_path(const char *q_arg)
379 {
380         char *old;
381
382         /* reject paths too long to fit in a PATH_MAX buffer */
383         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
384                 return -1;
385
386         old = socket_files;
387         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
388         if (socket_files == NULL) {
389                 free(old);
390                 return -1;
391         }
392
393         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
394         nb_sockets++;
395
396         return 0;
397 }
398
399 /*
400  * Parse the portmask provided at run time. Returns 0 on a bad mask, which the caller treats as invalid.
401  */
402 static int
403 parse_portmask(const char *portmask)
404 {
405         char *end = NULL;
406         unsigned long pm;
407
408         errno = 0;
409
410         /* parse hexadecimal string */
411         pm = strtoul(portmask, &end, 16);
412         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
413                 return 0;
414
415         return pm;
416
417 }
418
419 /*
420  * Parse num options at run time.
421  */
422 static int
423 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
424 {
425         char *end = NULL;
426         unsigned long num;
427
428         errno = 0;
429
430         /* parse unsigned int string */
431         num = strtoul(q_arg, &end, 10);
432         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
433                 return -1;
434
435         if (num > max_valid_value)
436                 return -1;
437
438         return num;
439
440 }
441
442 /*
443  * Display usage
444  */
445 static void
446 us_vhost_usage(const char *prgname)
447 {
448         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
449         "               --vm2vm [0|1|2]\n"
450         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
451         "               --socket-file <path>\n"
452         "               --nb-devices ND\n"
453         "               -p PORTMASK: Set mask for ports to be used by application\n"
454         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
455         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if the destination queue is full\n"
456         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if rx retries are enabled\n"
457         "               --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if rx retries are enabled\n"
458         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
459         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
460         "               --socket-file: The path of the socket file.\n"
461         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
462         "               --tso [0|1] disable/enable TCP segment offload.\n"
463         "               --client register a vhost-user socket as client mode.\n"
464         "               --dma-type: DMA type for the vhost async driver; only \"ioat\" is supported for now.\n"
465         "               --dmas: register a DMA channel for a specific vhost device.\n",
466                prgname);
467 }
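/*
 * Example invocation (a sketch; paths are hypothetical and dpdk-vhost is
 * the binary name produced by the example build):
 *
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 \
 *       --socket-file /tmp/vhost.sock --stats 1 --mergeable 1
 */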
468
469 /*
470  * Parse the arguments given in the command line of the application.
471  */
472 static int
473 us_vhost_parse_args(int argc, char **argv)
474 {
475         int opt, ret;
476         int option_index;
477         unsigned i;
478         const char *prgname = argv[0];
479         static struct option long_option[] = {
480                 {"vm2vm", required_argument, NULL, 0},
481                 {"rx-retry", required_argument, NULL, 0},
482                 {"rx-retry-delay", required_argument, NULL, 0},
483                 {"rx-retry-num", required_argument, NULL, 0},
484                 {"mergeable", required_argument, NULL, 0},
485                 {"stats", required_argument, NULL, 0},
486                 {"socket-file", required_argument, NULL, 0},
487                 {"tx-csum", required_argument, NULL, 0},
488                 {"tso", required_argument, NULL, 0},
489                 {"client", no_argument, &client_mode, 1},
490                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
491                 {"dma-type", required_argument, NULL, 0},
492                 {"dmas", required_argument, NULL, 0},
493                 {NULL, 0, 0, 0},
494         };
495
496         /* Parse command line */
497         while ((opt = getopt_long(argc, argv, "p:P",
498                         long_option, &option_index)) != EOF) {
499                 switch (opt) {
500                 /* Portmask */
501                 case 'p':
502                         enabled_port_mask = parse_portmask(optarg);
503                         if (enabled_port_mask == 0) {
504                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
505                                 us_vhost_usage(prgname);
506                                 return -1;
507                         }
508                         break;
509
510                 case 'P':
511                         promiscuous = 1;
512                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
513                                 ETH_VMDQ_ACCEPT_BROADCAST |
514                                 ETH_VMDQ_ACCEPT_MULTICAST;
515
516                         break;
517
518                 case 0:
519                         /* Enable/disable vm2vm comms. */
520                         if (!strncmp(long_option[option_index].name, "vm2vm",
521                                 MAX_LONG_OPT_SZ)) {
522                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
523                                 if (ret == -1) {
524                                         RTE_LOG(INFO, VHOST_CONFIG,
525                                                 "Invalid argument for "
526                                                 "vm2vm [0|1|2]\n");
527                                         us_vhost_usage(prgname);
528                                         return -1;
529                                 } else {
530                                         vm2vm_mode = (vm2vm_type)ret;
531                                 }
532                         }
533
534                         /* Enable/disable retries on RX. */
535                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
536                                 ret = parse_num_opt(optarg, 1);
537                                 if (ret == -1) {
538                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
539                                         us_vhost_usage(prgname);
540                                         return -1;
541                                 } else {
542                                         enable_retry = ret;
543                                 }
544                         }
545
546                         /* Enable/disable TX checksum offload. */
547                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
548                                 ret = parse_num_opt(optarg, 1);
549                                 if (ret == -1) {
550                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
551                                         us_vhost_usage(prgname);
552                                         return -1;
553                                 } else
554                                         enable_tx_csum = ret;
555                         }
556
557                         /* Enable/disable TSO offload. */
558                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
559                                 ret = parse_num_opt(optarg, 1);
560                                 if (ret == -1) {
561                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
562                                         us_vhost_usage(prgname);
563                                         return -1;
564                                 } else
565                                         enable_tso = ret;
566                         }
567
568                         /* Specify the delay time (in microseconds) between retries on RX. */
569                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
570                                 ret = parse_num_opt(optarg, INT32_MAX);
571                                 if (ret == -1) {
572                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
573                                         us_vhost_usage(prgname);
574                                         return -1;
575                                 } else {
576                                         burst_rx_delay_time = ret;
577                                 }
578                         }
579
580                         /* Specify the retries number on RX. */
581                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
582                                 ret = parse_num_opt(optarg, INT32_MAX);
583                                 if (ret == -1) {
584                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
585                                         us_vhost_usage(prgname);
586                                         return -1;
587                                 } else {
588                                         burst_rx_retry_num = ret;
589                                 }
590                         }
591
592                         /* Enable/disable RX mergeable buffers. */
593                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
594                                 ret = parse_num_opt(optarg, 1);
595                                 if (ret == -1) {
596                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
597                                         us_vhost_usage(prgname);
598                                         return -1;
599                                 } else {
600                                         mergeable = !!ret;
601                                         if (ret) {
602                                                 vmdq_conf_default.rxmode.offloads |=
603                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
604                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
605                                                         = JUMBO_FRAME_MAX_SIZE;
606                                         }
607                                 }
608                         }
609
610                         /* Enable/disable stats. */
611                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
612                                 ret = parse_num_opt(optarg, INT32_MAX);
613                                 if (ret == -1) {
614                                         RTE_LOG(INFO, VHOST_CONFIG,
615                                                 "Invalid argument for stats [0..N]\n");
616                                         us_vhost_usage(prgname);
617                                         return -1;
618                                 } else {
619                                         enable_stats = ret;
620                                 }
621                         }
622
623                         /* Set socket file path. */
624                         if (!strncmp(long_option[option_index].name,
625                                                 "socket-file", MAX_LONG_OPT_SZ)) {
626                                 if (us_vhost_parse_socket_path(optarg) == -1) {
627                                         RTE_LOG(INFO, VHOST_CONFIG,
628                                         "Invalid argument for socket name (Max %d characters)\n",
629                                         PATH_MAX);
630                                         us_vhost_usage(prgname);
631                                         return -1;
632                                 }
633                         }
634
635                         if (!strncmp(long_option[option_index].name,
636                                                 "dma-type", MAX_LONG_OPT_SZ)) {
637                                 if (strlen(optarg) >= MAX_LONG_OPT_SZ) {
638                                         RTE_LOG(INFO, VHOST_CONFIG,
639                                                 "Wrong DMA type\n");
640                                         us_vhost_usage(prgname);
641                                         return -1;
642                                 }
643                                 strcpy(dma_type, optarg); /* length checked above */
644                         }
645
646                         if (!strncmp(long_option[option_index].name,
647                                                 "dmas", MAX_LONG_OPT_SZ)) {
648                                 if (open_dma(optarg) == -1) {
649                                         RTE_LOG(INFO, VHOST_CONFIG,
650                                                 "Wrong DMA args\n");
651                                         us_vhost_usage(prgname);
652                                         return -1;
653                                 }
654                                 async_vhost_driver = 1;
655                         }
656
657                         break;
658
659                         /* Invalid option - print options. */
660                 default:
661                         us_vhost_usage(prgname);
662                         return -1;
663                 }
664         }
665
666         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
667                 if (enabled_port_mask & (1 << i))
668                         ports[num_ports++] = i;
669         }
670
671         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
672                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
673                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
674                 return -1;
675         }
676
677         return 0;
678 }
679
680 /*
681  * Update the global num_ports variable and the ports array according to
682  * the number of ports in the system, and return the number of valid ports.
683  */
684 static unsigned check_ports_num(unsigned nb_ports)
685 {
686         unsigned valid_num_ports = num_ports;
687         unsigned portid;
688
689         if (num_ports > nb_ports) {
690                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
691                         num_ports, nb_ports);
692                 num_ports = nb_ports;
693         }
694
695         for (portid = 0; portid < num_ports; portid++) {
696                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
697                         RTE_LOG(INFO, VHOST_PORT,
698                                 "\nSpecified port ID(%u) is not valid\n",
699                                 ports[portid]);
700                         ports[portid] = INVALID_PORT_ID;
701                         valid_num_ports--;
702                 }
703         }
704         return valid_num_ports;
705 }
706
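/* Look up a vhost device that is ready for RX by its MAC address. */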
707 static __rte_always_inline struct vhost_dev *
708 find_vhost_dev(struct rte_ether_addr *mac)
709 {
710         struct vhost_dev *vdev;
711
712         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
713                 if (vdev->ready == DEVICE_RX &&
714                     rte_is_same_ether_addr(mac, &vdev->mac_address))
715                         return vdev;
716         }
717
718         return NULL;
719 }
720
721 /*
722  * This function learns the MAC address of the device and registers it,
723  * along with a VLAN tag, with a VMDQ pool.
724  */
725 static int
726 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
727 {
728         struct rte_ether_hdr *pkt_hdr;
729         int i, ret;
730
731         /* Learn MAC address of guest device from packet */
732         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
733
734         if (find_vhost_dev(&pkt_hdr->s_addr)) {
735                 RTE_LOG(ERR, VHOST_DATA,
736                         "(%d) device is using a registered MAC!\n",
737                         vdev->vid);
738                 return -1;
739         }
740
741         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
742                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
743
744         /* vlan_tag currently uses the device_id. */
745         vdev->vlan_tag = vlan_tags[vdev->vid];
746
747         /* Print out VMDQ registration info. */
748         RTE_LOG(INFO, VHOST_DATA,
749                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
750                 vdev->vid,
751                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
752                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
753                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
754                 vdev->vlan_tag);
755
756         /* Register the MAC address. */
757         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
758                                 (uint32_t)vdev->vid + vmdq_pool_base);
759         if (ret)
760                 RTE_LOG(ERR, VHOST_DATA,
761                         "(%d) failed to add device MAC address to VMDQ\n",
762                         vdev->vid);
763
764         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
765
766         /* Set device as ready for RX. */
767         vdev->ready = DEVICE_RX;
768
769         return 0;
770 }
771
772 /*
773  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
774  * queue before disabling RX on the device.
775  */
776 static inline void
777 unlink_vmdq(struct vhost_dev *vdev)
778 {
779         unsigned i = 0;
780         unsigned rx_count;
781         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
782
783         if (vdev->ready == DEVICE_RX) {
784                 /* Clear MAC and VLAN settings. */
785                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
786                 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
787                         vdev->mac_address.addr_bytes[i] = 0;
788
789                 vdev->vlan_tag = 0;
790
791                 /* Clear out the receive buffers. */
792                 rx_count = rte_eth_rx_burst(ports[0],
793                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
794
795                 while (rx_count) {
796                         for (i = 0; i < rx_count; i++)
797                                 rte_pktmbuf_free(pkts_burst[i]);
798
799                         rx_count = rte_eth_rx_burst(ports[0],
800                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
801                 }
802
803                 vdev->ready = DEVICE_MAC_LEARNING;
804         }
805 }
806
807 static __rte_always_inline void
808 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
809             struct rte_mbuf *m)
810 {
811         uint16_t ret;
812         struct rte_mbuf *m_cpl[1];
813
814         if (builtin_net_driver) {
815                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
816         } else if (async_vhost_driver) {
817                 ret = rte_vhost_submit_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ,
818                                                 &m, 1);
819
820                 if (likely(ret))
821                         dst_vdev->nr_async_pkts++;
822
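                /*
                 * Drain all in-flight async copies before returning so
                 * the caller can safely free or reuse the mbuf.
                 */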
823                 while (likely(dst_vdev->nr_async_pkts)) {
824                         if (rte_vhost_poll_enqueue_completed(dst_vdev->vid,
825                                         VIRTIO_RXQ, m_cpl, 1))
826                                 dst_vdev->nr_async_pkts--;
827                 }
828         } else {
829                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
830         }
831
832         if (enable_stats) {
833                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
834                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
835                 src_vdev->stats.tx_total++;
836                 src_vdev->stats.tx += ret;
837         }
838 }
839
840 /*
841  * Check if the packet destination MAC address is for a local device. If so,
842  * put the packet on that device's RX queue. If not, return.
843  */
844 static __rte_always_inline int
845 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
846 {
847         struct rte_ether_hdr *pkt_hdr;
848         struct vhost_dev *dst_vdev;
849
850         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
851
852         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
853         if (!dst_vdev)
854                 return -1;
855
856         if (vdev->vid == dst_vdev->vid) {
857                 RTE_LOG_DP(DEBUG, VHOST_DATA,
858                         "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
859                         vdev->vid);
860                 return 0;
861         }
862
863         RTE_LOG_DP(DEBUG, VHOST_DATA,
864                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
865
866         if (unlikely(dst_vdev->remove)) {
867                 RTE_LOG_DP(DEBUG, VHOST_DATA,
868                         "(%d) device is marked for removal\n", dst_vdev->vid);
869                 return 0;
870         }
871
872         virtio_xmit(dst_vdev, vdev, m);
873         return 0;
874 }
875
876 /*
877  * Check if the destination MAC of a packet is one local VM,
878  * and get its vlan tag, and offset if it is.
879  */
880 static __rte_always_inline int
881 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
882         uint32_t *offset, uint16_t *vlan_tag)
883 {
884         struct vhost_dev *dst_vdev;
885         struct rte_ether_hdr *pkt_hdr =
886                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
887
888         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
889         if (!dst_vdev)
890                 return 0;
891
892         if (vdev->vid == dst_vdev->vid) {
893                 RTE_LOG_DP(DEBUG, VHOST_DATA,
894                         "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
895                         vdev->vid);
896                 return -1;
897         }
898
899         /*
900          * HW VLAN strip will have reduced the packet length by the
901          * length of the VLAN tag, so restore the packet length by
902          * adding it back.
903          */
904         *offset  = VLAN_HLEN;
905         *vlan_tag = vlan_tags[vdev->vid];
906
907         RTE_LOG_DP(DEBUG, VHOST_DATA,
908                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
909                 vdev->vid, dst_vdev->vid, *vlan_tag);
910
911         return 0;
912 }
913
914 static uint16_t
915 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
916 {
917         if (ol_flags & PKT_TX_IPV4)
918                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
919         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
920                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
921 }
922
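/*
 * Prepare a TSO mbuf for the NIC: clear the IPv4 header checksum (the
 * hardware recomputes it) and seed the TCP checksum with the
 * pseudo-header sum, as TSO requires.
 */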
923 static void virtio_tx_offload(struct rte_mbuf *m)
924 {
925         void *l3_hdr;
926         struct rte_ipv4_hdr *ipv4_hdr = NULL;
927         struct rte_tcp_hdr *tcp_hdr = NULL;
928         struct rte_ether_hdr *eth_hdr =
929                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
930
931         l3_hdr = (char *)eth_hdr + m->l2_len;
932
933         if (m->ol_flags & PKT_TX_IPV4) {
934                 ipv4_hdr = l3_hdr;
935                 ipv4_hdr->hdr_checksum = 0;
936                 m->ol_flags |= PKT_TX_IP_CKSUM;
937         }
938
939         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
940         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
941 }
942
943 static inline void
944 free_pkts(struct rte_mbuf **pkts, uint16_t n)
945 {
946         while (n--)
947                 rte_pktmbuf_free(pkts[n]);
948 }
949
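/*
 * Flush the lcore's TX table to the physical port, freeing any packets
 * the NIC did not accept.
 */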
950 static __rte_always_inline void
951 do_drain_mbuf_table(struct mbuf_table *tx_q)
952 {
953         uint16_t count;
954
955         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
956                                  tx_q->m_table, tx_q->len);
957         if (unlikely(count < tx_q->len))
958                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
959
960         tx_q->len = 0;
961 }
962
963 /*
964  * This function routes the TX packet to the correct interface. This
965  * may be a local device or the physical port.
966  */
967 static __rte_always_inline void
968 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
969 {
970         struct mbuf_table *tx_q;
971         unsigned offset = 0;
972         const uint16_t lcore_id = rte_lcore_id();
973         struct rte_ether_hdr *nh;
974
975
976         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
977         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
978                 struct vhost_dev *vdev2;
979
980                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
981                         if (vdev2 != vdev)
982                                 virtio_xmit(vdev2, vdev, m);
983                 }
984                 goto queue2nic;
985         }
986
987         /* Check if the destination is a local VM. */
988         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
989                 rte_pktmbuf_free(m);
990                 return;
991         }
992
993         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
994                 if (unlikely(find_local_dest(vdev, m, &offset,
995                                              &vlan_tag) != 0)) {
996                         rte_pktmbuf_free(m);
997                         return;
998                 }
999         }
1000
1001         RTE_LOG_DP(DEBUG, VHOST_DATA,
1002                 "(%d) TX: MAC address is external\n", vdev->vid);
1003
1004 queue2nic:
1005
1006         /* Add packet to the port TX queue. */
1007         tx_q = &lcore_tx_queue[lcore_id];
1008
1009         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1010         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1011                 /* Guest has inserted the vlan tag. */
1012                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1013                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1014                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1015                         (vh->vlan_tci != vlan_tag_be))
1016                         vh->vlan_tci = vlan_tag_be;
1017         } else {
1018                 m->ol_flags |= PKT_TX_VLAN_PKT;
1019
1020                 /*
1021                  * Find the right seg to adjust the data len when offset is
1022                  * bigger than tail room size.
1023                  */
1024                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1025                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1026                                 m->data_len += offset;
1027                         else {
1028                                 struct rte_mbuf *seg = m;
1029
1030                                 while ((seg->next != NULL) &&
1031                                         (offset > rte_pktmbuf_tailroom(seg)))
1032                                         seg = seg->next;
1033
1034                                 seg->data_len += offset;
1035                         }
1036                         m->pkt_len += offset;
1037                 }
1038
1039                 m->vlan_tci = vlan_tag;
1040         }
1041
1042         if (m->ol_flags & PKT_TX_TCP_SEG)
1043                 virtio_tx_offload(m);
1044
1045         tx_q->m_table[tx_q->len++] = m;
1046         if (enable_stats) {
1047                 vdev->stats.tx_total++;
1048                 vdev->stats.tx++;
1049         }
1050
1051         if (unlikely(tx_q->len == MAX_PKT_BURST))
1052                 do_drain_mbuf_table(tx_q);
1053 }
1054
1055
1056 static __rte_always_inline void
1057 drain_mbuf_table(struct mbuf_table *tx_q)
1058 {
1059         static uint64_t prev_tsc;
1060         uint64_t cur_tsc;
1061
1062         if (tx_q->len == 0)
1063                 return;
1064
1065         cur_tsc = rte_rdtsc();
1066         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1067                 prev_tsc = cur_tsc;
1068
1069                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1070                         "TX queue drained after timeout with burst size %u\n",
1071                         tx_q->len);
1072                 do_drain_mbuf_table(tx_q);
1073         }
1074 }
1075
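/*
 * Reap enqueue copies that the async channel has completed and free
 * the corresponding mbufs.
 */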
1076 static __rte_always_inline void
1077 complete_async_pkts(struct vhost_dev *vdev, uint16_t qid)
1078 {
1079         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1080         uint16_t complete_count;
1081
1082         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1083                                                 qid, p_cpl, MAX_PKT_BURST);
1084         vdev->nr_async_pkts -= complete_count;
1085         if (complete_count)
1086                 free_pkts(p_cpl, complete_count);
1087 }
1088
1089 static __rte_always_inline void
1090 drain_eth_rx(struct vhost_dev *vdev)
1091 {
1092         uint16_t rx_count, enqueue_count;
1093         struct rte_mbuf *pkts[MAX_PKT_BURST];
1094
1095         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1096                                     pkts, MAX_PKT_BURST);
1097
1098         while (likely(vdev->nr_async_pkts))
1099                 complete_async_pkts(vdev, VIRTIO_RXQ);
1100
1101         if (!rx_count)
1102                 return;
1103
1104         /*
1105          * When "enable_retry" is set, we wait and retry when there are
1106          * not enough free slots in the queue to hold @rx_count packets,
1107          * to reduce packet loss.
1108          */
1109         if (enable_retry &&
1110             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1111                         VIRTIO_RXQ))) {
1112                 uint32_t retry;
1113
1114                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1115                         rte_delay_us(burst_rx_delay_time);
1116                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1117                                         VIRTIO_RXQ))
1118                                 break;
1119                 }
1120         }
1121
1122         if (builtin_net_driver) {
1123                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1124                                                 pkts, rx_count);
1125         } else if (async_vhost_driver) {
1126                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1127                                         VIRTIO_RXQ, pkts, rx_count);
1128                 vdev->nr_async_pkts += enqueue_count;
1129         } else {
1130                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1131                                                 pkts, rx_count);
1132         }
1133
1134         if (enable_stats) {
1135                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1136                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1137         }
1138
1139         if (!async_vhost_driver)
1140                 free_pkts(pkts, rx_count);
1141 }
1142
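/* Dequeue packets from the guest virtio TX ring and route each one. */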
1143 static __rte_always_inline void
1144 drain_virtio_tx(struct vhost_dev *vdev)
1145 {
1146         struct rte_mbuf *pkts[MAX_PKT_BURST];
1147         uint16_t count;
1148         uint16_t i;
1149
1150         if (builtin_net_driver) {
1151                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1152                                         pkts, MAX_PKT_BURST);
1153         } else {
1154                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1155                                         mbuf_pool, pkts, MAX_PKT_BURST);
1156         }
1157
1158         /* setup VMDq for the first packet */
1159         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1160                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
1161                         free_pkts(pkts, count);
                        return; /* don't route mbufs that were just freed */
                }
1162         }
1163
1164         for (i = 0; i < count; ++i)
1165                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1166 }
1167
1168 /*
1169  * Main function of vhost-switch. It basically does:
1170  *
1171  * for each vhost device {
1172  *    - drain_eth_rx()
1173  *
1174  *      Which drains the host eth Rx queue linked to the vhost device,
1175  *      and delivers the packets to the guest virtio Rx ring associated with
1176  *      this vhost device.
1177  *
1178  *    - drain_virtio_tx()
1179  *
1180  *      Which drains the guest virtio Tx queue and delivers the packets
1181  *      to the target, which could be another vhost device, or the
1182  *      physical eth dev. The routing is done in function "virtio_tx_route".
1183  * }
1184  */
1185 static int
1186 switch_worker(void *arg __rte_unused)
1187 {
1188         unsigned i;
1189         unsigned lcore_id = rte_lcore_id();
1190         struct vhost_dev *vdev;
1191         struct mbuf_table *tx_q;
1192
1193         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1194
1195         tx_q = &lcore_tx_queue[lcore_id];
1196         for (i = 0; i < rte_lcore_count(); i++) {
1197                 if (lcore_ids[i] == lcore_id) {
1198                         tx_q->txq_id = i;
1199                         break;
1200                 }
1201         }
1202
1203         while (1) {
1204                 drain_mbuf_table(tx_q);
1205
1206                 /*
1207                  * If requested, inform the configuration core that we have
1208                  * exited the linked list and that no devices are in use.
1209                  */
1210                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1211                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1212
1213                 /*
1214                  * Process vhost devices
1215                  */
1216                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1217                               lcore_vdev_entry) {
1218                         if (unlikely(vdev->remove)) {
1219                                 unlink_vmdq(vdev);
1220                                 vdev->ready = DEVICE_SAFE_REMOVE;
1221                                 continue;
1222                         }
1223
1224                         if (likely(vdev->ready == DEVICE_RX))
1225                                 drain_eth_rx(vdev);
1226
1227                         if (likely(!vdev->remove))
1228                                 drain_virtio_tx(vdev);
1229                 }
1230         }
1231
1232         return 0;
1233 }
1234
1235 /*
1236  * Remove a device from the specific data core linked list and from the
1237  * main linked list. Synchronization occurs through the use of the
1238  * lcore dev_removal_flag. The device is made volatile to avoid re-ordering
1239  * of dev->remove=1, which could cause an infinite loop in the rte_pause loop.
1240  */
1241 static void
1242 destroy_device(int vid)
1243 {
1244         struct vhost_dev *vdev = NULL;
1245         int lcore;
1246
1247         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1248                 if (vdev->vid == vid)
1249                         break;
1250         }
1251         if (!vdev)
1252                 return;
1253         /* Set the remove flag. */
1254         vdev->remove = 1;
1255         while (vdev->ready != DEVICE_SAFE_REMOVE) {
1256                 rte_pause();
1257         }
1258
1259         if (builtin_net_driver)
1260                 vs_vhost_net_remove(vdev);
1261
1262         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1263                      lcore_vdev_entry);
1264         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1265
1266
1267         /* Set the dev_removal_flag on each lcore. */
1268         RTE_LCORE_FOREACH_WORKER(lcore)
1269                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1270
1271         /*
1272          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1273          * we can be sure that they can no longer access the device removed
1274          * from the linked lists and that the devices are no longer in use.
1275          */
1276         RTE_LCORE_FOREACH_WORKER(lcore) {
1277                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1278                         rte_pause();
1279         }
1280
1281         lcore_info[vdev->coreid].device_num--;
1282
1283         RTE_LOG(INFO, VHOST_DATA,
1284                 "(%d) device has been removed from data core\n",
1285                 vdev->vid);
1286
1287         if (async_vhost_driver)
1288                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1289
1290         rte_free(vdev);
1291 }
1292
1293 /*
1294  * A new device is added to a data core. First the device is added to the main linked list
1295  * and then allocated to a specific data core.
1296  */
1297 static int
1298 new_device(int vid)
1299 {
1300         int lcore, core_add = 0;
1301         uint32_t device_num_min = num_devices;
1302         struct vhost_dev *vdev;
1303
1304         struct rte_vhost_async_channel_ops channel_ops = {
1305                 .transfer_data = ioat_transfer_data_cb,
1306                 .check_completed_copies = ioat_check_completed_copies_cb
1307         };
1308         struct rte_vhost_async_features f;
1309
1310         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1311         if (vdev == NULL) {
1312                 RTE_LOG(INFO, VHOST_DATA,
1313                         "(%d) couldn't allocate memory for vhost dev\n",
1314                         vid);
1315                 return -1;
1316         }
1317         vdev->vid = vid;
1318
1319         if (builtin_net_driver)
1320                 vs_vhost_net_setup(vdev);
1321
1322         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1323         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1324
1325         /*reset ready flag*/
1326         vdev->ready = DEVICE_MAC_LEARNING;
1327         vdev->remove = 0;
1328
1329         /* Find a suitable lcore to add the device. */
1330         RTE_LCORE_FOREACH_WORKER(lcore) {
1331                 if (lcore_info[lcore].device_num < device_num_min) {
1332                         device_num_min = lcore_info[lcore].device_num;
1333                         core_add = lcore;
1334                 }
1335         }
1336         vdev->coreid = core_add;
1337
1338         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1339                           lcore_vdev_entry);
1340         lcore_info[vdev->coreid].device_num++;
1341
1342         /* Disable notifications. */
1343         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1344         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1345
1346         RTE_LOG(INFO, VHOST_DATA,
1347                 "(%d) device has been added to data core %d\n",
1348                 vid, vdev->coreid);
1349
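        /*
         * Register the Rx queue with the async copy (DMA) channel. f is
         * a union: its intval view carries the async_inorder and
         * async_threshold bit-fields set below.
         */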
1350         if (async_vhost_driver) {
1351                 f.async_inorder = 1;
1352                 f.async_threshold = 256;
1353                 return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1354                         f.intval, &channel_ops);
1355         }
1356
1357         return 0;
1358 }
1359
1360 /*
1361  * These callbacks allow devices to be added to the data core when
1362  * configuration has been fully completed.
1363  */
1364 static const struct vhost_device_ops virtio_net_device_ops =
1365 {
1366         .new_device =  new_device,
1367         .destroy_device = destroy_device,
1368 };
1369
1370 /*
1371  * This is a thread that wakes up periodically to print stats if the user has
1372  * enabled them.
1373  */
1374 static void *
1375 print_stats(__rte_unused void *arg)
1376 {
1377         struct vhost_dev *vdev;
1378         uint64_t tx_dropped, rx_dropped;
1379         uint64_t tx, tx_total, rx, rx_total;
1380         const char clr[] = { 27, '[', '2', 'J', '\0' };
1381         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1382
1383         while (1) {
1384                 sleep(enable_stats);
1385
1386                 /* Clear screen and move to top left */
1387                 printf("%s%s\n", clr, top_left);
1388                 printf("Device statistics =================================\n");
1389
1390                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1391                         tx_total   = vdev->stats.tx_total;
1392                         tx         = vdev->stats.tx;
1393                         tx_dropped = tx_total - tx;
1394
1395                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1396                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1397                         rx_dropped = rx_total - rx;
1398
1399                         printf("Statistics for device %d\n"
1400                                 "-----------------------\n"
1401                                 "TX total:              %" PRIu64 "\n"
1402                                 "TX dropped:            %" PRIu64 "\n"
1403                                 "TX successful:         %" PRIu64 "\n"
1404                                 "RX total:              %" PRIu64 "\n"
1405                                 "RX dropped:            %" PRIu64 "\n"
1406                                 "RX successful:         %" PRIu64 "\n",
1407                                 vdev->vid,
1408                                 tx_total, tx_dropped, tx,
1409                                 rx_total, rx_dropped, rx);
1410                 }
1411
1412                 printf("===================================================\n");
1413
1414                 fflush(stdout);
1415         }
1416
1417         return NULL;
1418 }
1419
1420 static void
1421 unregister_drivers(int socket_num)
1422 {
1423         int i, ret;
1424
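        /*
         * socket_files is a flat array of nb_sockets fixed-size slots of
         * PATH_MAX bytes each, hence the i * PATH_MAX stride below.
         */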
1425         for (i = 0; i < socket_num; i++) {
1426                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1427                 if (ret != 0)
1428                         RTE_LOG(ERR, VHOST_CONFIG,
1429                                 "Failed to unregister vhost driver for %s.\n",
1430                                 socket_files + i * PATH_MAX);
1431         }
1432 }
1433
1434 /* When we receive an INT signal, unregister the vhost driver. */
1435 static void
1436 sigint_handler(__rte_unused int signum)
1437 {
1438         /* Unregister vhost driver. */
1439         unregister_drivers(nb_sockets);
1440
1441         exit(0);
1442 }
1443
1444 /*
1445  * While creating an mbuf pool, one key thing is to figure out how
1446  * many mbuf entries are enough for our use. FYI, here are some
1447  * guidelines:
1448  *
1449  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage.
1450  *
1451  * - For each switch core (a CPU core that does the packet switching),
1452  *   we also need to reserve some mbufs for receiving the packets from
1453  *   the virtio Tx queue. How many are enough depends on the usage;
1454  *   it's normally a simple calculation like the following:
1455  *
1456  *       MAX_PKT_BURST * max packet size / mbuf size
1457  *
1458  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1459  *
1460  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1461  *   mbufs for receiving packets from the physical NIC device.
1462  *
1463  * - We also need to make sure, for each switch core, we have allocated
1464  *   enough mbufs to fill up the mbuf cache.
1465  */
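/*
 * A worked example under assumed defaults (these values are assumptions:
 * MAX_PKT_BURST = 32, MBUF_DATA_SIZE = 2176 bytes including a 128-byte
 * headroom, nr_rx_desc = 1024) with TSO enabled (mtu = 64 KB):
 *
 *     per core: (65536 + 2176) * 32 / (2176 - 128) = 1058 mbufs
 *     plus nr_rx_desc                               : 2082 mbufs
 *
 * which create_mbuf_pool() below then scales by the number of queues,
 * switch cores and ports.
 */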
1466 static void
1467 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1468         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1469 {
1470         uint32_t nr_mbufs;
1471         uint32_t nr_mbufs_per_core;
1472         uint32_t mtu = 1500;
1473
1474         if (mergeable)
1475                 mtu = 9000;
1476         if (enable_tso)
1477                 mtu = 64 * 1024;
1478
1479         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1480                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1481         nr_mbufs_per_core += nr_rx_desc;
1482         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1483
1484         nr_mbufs  = nr_queues * nr_rx_desc;
1485         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1486         nr_mbufs *= nr_port;
1487
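        /*
         * No per-mbuf private area (priv_size = 0); each mbuf carries
         * mbuf_size bytes of buffer room (headroom included), and the
         * pool is allocated on the NUMA socket of the calling lcore.
         */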
1488         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1489                                             nr_mbuf_cache, 0, mbuf_size,
1490                                             rte_socket_id());
1491         if (mbuf_pool == NULL)
1492                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1493 }
1494
1495 /*
1496  * Main function, does initialisation and calls the per-lcore functions.
1497  */
1498 int
1499 main(int argc, char *argv[])
1500 {
1501         unsigned lcore_id, core_id = 0;
1502         unsigned nb_ports, valid_num_ports;
1503         int ret, i;
1504         uint16_t portid;
1505         static pthread_t tid;
1506         uint64_t flags = 0;
1507
1508         signal(SIGINT, sigint_handler);
1509
1510         /* init EAL */
1511         ret = rte_eal_init(argc, argv);
1512         if (ret < 0)
1513                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1514         argc -= ret;
1515         argv += ret;
1516
1517         /* parse app arguments */
1518         ret = us_vhost_parse_args(argc, argv);
1519         if (ret < 0)
1520                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1521
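        /*
         * A hypothetical invocation, for illustration only (EAL options
         * come before "--", application options after):
         *
         *     ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0
         */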
1522         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1523                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1524
1525                 if (rte_lcore_is_enabled(lcore_id))
1526                         lcore_ids[core_id++] = lcore_id;
1527         }
1528
1529         if (rte_lcore_count() > RTE_MAX_LCORE)
1530                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1531
1532         /* Get the number of physical ports. */
1533         nb_ports = rte_eth_dev_count_avail();
1534
1535         /*
1536          * Update the global variable NUM_PORTS and the global array PORTS,
1537          * and get the value of VALID_NUM_PORTS according to the number of
1538          * ports in the system.
1539          */
1539         valid_num_ports = check_ports_num(nb_ports);
1540
1541         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1542                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1543                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1544                 return -1;
1545         }
1546
1547         /*
1548          * FIXME: here we are trying to allocate mbufs big enough for
1549          * @MAX_QUEUES, but the truth is we're never going to use that
1550          * many queues here. We probably should only do allocation for
1551          * those queues we are going to use.
1552          */
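        /*
         * rte_lcore_count() - 1 switch cores: every lcore except the main
         * one runs switch_worker() (launched via RTE_LCORE_FOREACH_WORKER
         * below).
         */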
1553         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1554                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1555
1556         if (vm2vm_mode == VM2VM_HARDWARE) {
1557                 /* Enable VT loop back to let L2 switch to do it. */
1558                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1559                 RTE_LOG(DEBUG, VHOST_CONFIG,
1560                         "Enable loop back for L2 switch in vmdq.\n");
1561         }
1562
1563         /* initialize all ports */
1564         RTE_ETH_FOREACH_DEV(portid) {
1565                 /* skip ports that are not enabled */
1566                 if ((enabled_port_mask & (1 << portid)) == 0) {
1567                         RTE_LOG(INFO, VHOST_PORT,
1568                                 "Skipping disabled port %d\n", portid);
1569                         continue;
1570                 }
1571                 if (port_init(portid) != 0)
1572                         rte_exit(EXIT_FAILURE,
1573                                 "Cannot initialize network ports\n");
1574         }
1575
1576         /* Enable stats if the user option is set. */
1577         if (enable_stats) {
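                /*
                 * rte_ctrl_thread_create() spawns print_stats() as an EAL
                 * control thread, keeping it off the data-path lcores.
                 */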
1578                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1579                                         print_stats, NULL);
1580                 if (ret < 0)
1581                         rte_exit(EXIT_FAILURE,
1582                                 "Cannot create print-stats thread\n");
1583         }
1584
1585         /* Launch all data cores. */
1586         RTE_LCORE_FOREACH_WORKER(lcore_id)
1587                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1588
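        /*
         * In client mode the vhost library connects to a socket created by
         * the frontend (e.g. QEMU acting as server) instead of creating one
         * itself, and by default reconnects automatically if the connection
         * drops.
         */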
1589         if (client_mode)
1590                 flags |= RTE_VHOST_USER_CLIENT;
1591
1592         /* Register vhost user driver to handle vhost messages. */
1593         for (i = 0; i < nb_sockets; i++) {
1594                 char *file = socket_files + i * PATH_MAX;
1595                 if (async_vhost_driver)
1596                         flags |= RTE_VHOST_USER_ASYNC_COPY;
1597
1598                 ret = rte_vhost_driver_register(file, flags);
1599                 if (ret != 0) {
1600                         unregister_drivers(i);
1601                         rte_exit(EXIT_FAILURE,
1602                                 "vhost driver register failure.\n");
1603                 }
1604
1605                 if (builtin_net_driver)
1606                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1607
1608                 if (mergeable == 0) {
1609                         rte_vhost_driver_disable_features(file,
1610                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1611                 }
1612
1613                 if (enable_tx_csum == 0) {
1614                         rte_vhost_driver_disable_features(file,
1615                                 1ULL << VIRTIO_NET_F_CSUM);
1616                 }
1617
1618                 if (enable_tso == 0) {
1619                         rte_vhost_driver_disable_features(file,
1620                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1621                         rte_vhost_driver_disable_features(file,
1622                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1623                         rte_vhost_driver_disable_features(file,
1624                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1625                         rte_vhost_driver_disable_features(file,
1626                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1627                 }
1628
1629                 if (promiscuous) {
1630                         rte_vhost_driver_enable_features(file,
1631                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1632                 }
1633
1634                 ret = rte_vhost_driver_callback_register(file,
1635                         &virtio_net_device_ops);
1636                 if (ret != 0) {
1637                         rte_exit(EXIT_FAILURE,
1638                                 "failed to register vhost driver callbacks.\n");
1639                 }
1640
1641                 if (rte_vhost_driver_start(file) < 0) {
1642                         rte_exit(EXIT_FAILURE,
1643                                 "failed to start vhost driver.\n");
1644                 }
1645         }
1646
1647         RTE_LCORE_FOREACH_WORKER(lcore_id)
1648                 rte_eal_wait_lcore(lcore_id);
1649
1650         return 0;
1651 }