examples/vhost: add async vhost args parsing
[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "ioat.h"
29 #include "main.h"
30
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37
38 #define MBUF_CACHE_SIZE 128
39 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
40
41 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
42
43 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
45
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX                       1
51 #define DEVICE_SAFE_REMOVE      2
52
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56
57 #define INVALID_PORT_ID 0xFF
58
59 /* Maximum long option length for option parsing. */
60 #define MAX_LONG_OPT_SZ 64
61
62 /* mask of enabled ports */
63 static uint32_t enabled_port_mask = 0;
64
65 /* Promiscuous mode */
66 static uint32_t promiscuous;
67
68 /* number of devices/queues to support */
69 static uint32_t num_queues = 0;
70 static uint32_t num_devices;
71
72 static struct rte_mempool *mbuf_pool;
73 static int mergeable;
74
75 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
76 typedef enum {
77         VM2VM_DISABLED = 0,
78         VM2VM_SOFTWARE = 1,
79         VM2VM_HARDWARE = 2,
80         VM2VM_LAST
81 } vm2vm_type;
82 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
83
84 /* Enable stats. */
85 static uint32_t enable_stats = 0;
86 /* Enable retries on RX. */
87 static uint32_t enable_retry = 1;
88
89 /* Disable TX checksum offload */
90 static uint32_t enable_tx_csum;
91
92 /* Disable TSO offload */
93 static uint32_t enable_tso;
94
95 static int client_mode;
96
97 static int builtin_net_driver;
98
99 static int async_vhost_driver;
100
101 static char dma_type[MAX_LONG_OPT_SZ];
102
103 /* Specify timeout (in microseconds) between retries on RX. */
104 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
105 /* Specify the number of retries on RX. */
106 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
107
108 /* Socket file paths. Can be set by user */
109 static char *socket_files;
110 static int nb_sockets;
111
112 /* empty vmdq configuration structure. Filled in programmatically */
113 static struct rte_eth_conf vmdq_conf_default = {
114         .rxmode = {
115                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
116                 .split_hdr_size = 0,
117                 /*
118                  * VLAN strip is necessary for 1G NICs such as the I350;
119                  * it fixes a bug where IPv4 forwarding in the guest cannot
120                  * forward packets from one virtio dev to another virtio dev.
121                  */
122                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
123         },
124
125         .txmode = {
126                 .mq_mode = ETH_MQ_TX_NONE,
127                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
128                              DEV_TX_OFFLOAD_TCP_CKSUM |
129                              DEV_TX_OFFLOAD_VLAN_INSERT |
130                              DEV_TX_OFFLOAD_MULTI_SEGS |
131                              DEV_TX_OFFLOAD_TCP_TSO),
132         },
133         .rx_adv_conf = {
134                 /*
135                  * should be overridden separately in code with
136                  * appropriate values
137                  */
138                 .vmdq_rx_conf = {
139                         .nb_queue_pools = ETH_8_POOLS,
140                         .enable_default_pool = 0,
141                         .default_pool = 0,
142                         .nb_pool_maps = 0,
143                         .pool_map = {{0, 0},},
144                 },
145         },
146 };
147
148
149 static unsigned lcore_ids[RTE_MAX_LCORE];
150 static uint16_t ports[RTE_MAX_ETHPORTS];
151 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
152 static uint16_t num_pf_queues, num_vmdq_queues;
153 static uint16_t vmdq_pool_base, vmdq_queue_base;
154 static uint16_t queues_per_pool;
155
156 const uint16_t vlan_tags[] = {
157         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
158         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
159         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
160         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
161         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
162         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
163         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
164         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
165 };
166
167 /* ethernet addresses of ports */
168 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
169
170 static struct vhost_dev_tailq_list vhost_dev_list =
171         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
172
173 static struct lcore_info lcore_info[RTE_MAX_LCORE];
174
175 /* Used for queueing bursts of TX packets. */
176 struct mbuf_table {
177         unsigned len;
178         unsigned txq_id;
179         struct rte_mbuf *m_table[MAX_PKT_BURST];
180 };
181
182 /* TX queue for each data core. */
183 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
184
185 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
186                                  / US_PER_S * BURST_TX_DRAIN_US)
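/*
 * Illustration only (not part of the build logic): MBUF_TABLE_DRAIN_TSC
 * converts BURST_TX_DRAIN_US into TSC cycles. For example, assuming a 2 GHz
 * TSC, (2000000000 + 1000000 - 1) / 1000000 = 2000 cycles per microsecond,
 * so the drain interval is 2000 * 100 = 200000 cycles, i.e. ~100 us.
 */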
187 #define VLAN_HLEN       4
188
189 static inline int
190 open_dma(const char *value)
191 {
192         if (strncmp(dma_type, "ioat", 4) == 0)
193                 return open_ioat(value);
194
195         return -1;
196 }
197
198 /*
199  * Builds up the correct configuration for VMDQ VLAN pool map
200  * according to the pool & queue limits.
201  */
202 static inline int
203 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
204 {
205         struct rte_eth_vmdq_rx_conf conf;
206         struct rte_eth_vmdq_rx_conf *def_conf =
207                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
208         unsigned i;
209
210         memset(&conf, 0, sizeof(conf));
211         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
212         conf.nb_pool_maps = num_devices;
213         conf.enable_loop_back = def_conf->enable_loop_back;
214         conf.rx_mode = def_conf->rx_mode;
215
216         for (i = 0; i < conf.nb_pool_maps; i++) {
217                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
218                 conf.pool_map[i].pools = (1UL << i);
219         }
220
221         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
222         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
223                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
224         return 0;
225 }
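/*
 * For illustration: with num_devices == 2, the map built above is
 *   pool_map[0] = { vlan_id = 1000, pools = 0x1 }   (VMDQ pool 0)
 *   pool_map[1] = { vlan_id = 1001, pools = 0x2 }   (VMDQ pool 1)
 * i.e. each virtio device gets one VMDQ pool keyed by its VLAN tag.
 */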
226
227 /*
228  * Initialises a given port using global settings and with the rx buffers
229  * coming from the global mbuf_pool.
230  */
231 static inline int
232 port_init(uint16_t port)
233 {
234         struct rte_eth_dev_info dev_info;
235         struct rte_eth_conf port_conf;
236         struct rte_eth_rxconf *rxconf;
237         struct rte_eth_txconf *txconf;
238         int16_t rx_rings, tx_rings;
239         uint16_t rx_ring_size, tx_ring_size;
240         int retval;
241         uint16_t q;
242
243         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
244         retval = rte_eth_dev_info_get(port, &dev_info);
245         if (retval != 0) {
246                 RTE_LOG(ERR, VHOST_PORT,
247                         "Error during getting device (port %u) info: %s\n",
248                         port, strerror(-retval));
249
250                 return retval;
251         }
252
253         rxconf = &dev_info.default_rxconf;
254         txconf = &dev_info.default_txconf;
255         rxconf->rx_drop_en = 1;
256
257         /* Configure the number of supported virtio devices based on VMDQ limits */
258         num_devices = dev_info.max_vmdq_pools;
259
260         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
261         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
262
263         tx_rings = (uint16_t)rte_lcore_count();
264
265         /* Get port configuration. */
266         retval = get_eth_conf(&port_conf, num_devices);
267         if (retval < 0)
268                 return retval;
269         /* NIC queues are divided into pf queues and vmdq queues.  */
270         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
271         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
272         num_vmdq_queues = num_devices * queues_per_pool;
273         num_queues = num_pf_queues + num_vmdq_queues;
274         vmdq_queue_base = dev_info.vmdq_queue_base;
275         vmdq_pool_base  = dev_info.vmdq_pool_base;
276         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
277                 num_pf_queues, num_devices, queues_per_pool);
278
279         if (!rte_eth_dev_is_valid_port(port))
280                 return -1;
281
282         rx_rings = (uint16_t)dev_info.max_rx_queues;
283         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
284                 port_conf.txmode.offloads |=
285                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
286         /* Configure ethernet device. */
287         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
288         if (retval != 0) {
289                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
290                         port, strerror(-retval));
291                 return retval;
292         }
293
294         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
295                 &tx_ring_size);
296         if (retval != 0) {
297                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
298                         "for port %u: %s.\n", port, strerror(-retval));
299                 return retval;
300         }
301         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
302                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
303                         "for Rx queues on port %u.\n", port);
304                 return -1;
305         }
306
307         /* Setup the queues. */
308         rxconf->offloads = port_conf.rxmode.offloads;
309         for (q = 0; q < rx_rings; q ++) {
310                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
311                                                 rte_eth_dev_socket_id(port),
312                                                 rxconf,
313                                                 mbuf_pool);
314                 if (retval < 0) {
315                         RTE_LOG(ERR, VHOST_PORT,
316                                 "Failed to setup rx queue %u of port %u: %s.\n",
317                                 q, port, strerror(-retval));
318                         return retval;
319                 }
320         }
321         txconf->offloads = port_conf.txmode.offloads;
322         for (q = 0; q < tx_rings; q ++) {
323                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
324                                                 rte_eth_dev_socket_id(port),
325                                                 txconf);
326                 if (retval < 0) {
327                         RTE_LOG(ERR, VHOST_PORT,
328                                 "Failed to setup tx queue %u of port %u: %s.\n",
329                                 q, port, strerror(-retval));
330                         return retval;
331                 }
332         }
333
334         /* Start the device. */
335         retval  = rte_eth_dev_start(port);
336         if (retval < 0) {
337                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
338                         port, strerror(-retval));
339                 return retval;
340         }
341
342         if (promiscuous) {
343                 retval = rte_eth_promiscuous_enable(port);
344                 if (retval != 0) {
345                         RTE_LOG(ERR, VHOST_PORT,
346                                 "Failed to enable promiscuous mode on port %u: %s\n",
347                                 port, rte_strerror(-retval));
348                         return retval;
349                 }
350         }
351
352         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
353         if (retval < 0) {
354                 RTE_LOG(ERR, VHOST_PORT,
355                         "Failed to get MAC address on port %u: %s\n",
356                         port, rte_strerror(-retval));
357                 return retval;
358         }
359
360         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
361         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
362                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
363                         port,
364                         vmdq_ports_eth_addr[port].addr_bytes[0],
365                         vmdq_ports_eth_addr[port].addr_bytes[1],
366                         vmdq_ports_eth_addr[port].addr_bytes[2],
367                         vmdq_ports_eth_addr[port].addr_bytes[3],
368                         vmdq_ports_eth_addr[port].addr_bytes[4],
369                         vmdq_ports_eth_addr[port].addr_bytes[5]);
370
371         return 0;
372 }
373
374 /*
375  * Set socket file path.
376  */
377 static int
378 us_vhost_parse_socket_path(const char *q_arg)
379 {
380         char *old;
381
382         /* parse number string */
383         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
384                 return -1;
385
386         old = socket_files;
387         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
388         if (socket_files == NULL) {
389                 free(old);
390                 return -1;
391         }
392
393         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
394         nb_sockets++;
395
396         return 0;
397 }
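/*
 * For illustration: socket_files is a flat buffer of fixed-size slots, one
 * PATH_MAX-byte slot per --socket-file option. After two options it holds
 * path0 at offset 0 and path1 at offset PATH_MAX, with nb_sockets == 2;
 * consumers later index it as socket_files + i * PATH_MAX.
 */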
398
399 /*
400  * Parse the portmask provided at run time.
401  */
402 static int
403 parse_portmask(const char *portmask)
404 {
405         char *end = NULL;
406         unsigned long pm;
407
408         errno = 0;
409
410         /* parse hexadecimal string */
411         pm = strtoul(portmask, &end, 16);
412         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
413                 return 0;
414
415         return pm;
416
417 }
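/*
 * For illustration: the portmask is parsed as hexadecimal, so "-p 0x1" (or
 * "-p 1") enables port 0 only, and "-p 3" would enable ports 0 and 1. Note
 * that this sample accepts at most MAX_SUP_PORTS (1) enabled port.
 */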
418
419 /*
420  * Parse num options at run time.
421  */
422 static int
423 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
424 {
425         char *end = NULL;
426         unsigned long num;
427
428         errno = 0;
429
430         /* parse unsigned int string */
431         num = strtoul(q_arg, &end, 10);
432         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
433                 return -1;
434
435         if (num > max_valid_value)
436                 return -1;
437
438         return num;
439
440 }
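/*
 * For illustration: parse_num_opt("2", VM2VM_LAST - 1) returns 2 (hardware
 * vm2vm mode), while parse_num_opt("3", VM2VM_LAST - 1) returns -1 because
 * the value exceeds the maximum allowed.
 */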
441
442 /*
443  * Display usage
444  */
445 static void
446 us_vhost_usage(const char *prgname)
447 {
448         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
449         "               --vm2vm [0|1|2]\n"
450         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
451         "               --socket-file <path>\n"
452         "               --nb-devices ND\n"
453         "               -p PORTMASK: Set mask for ports to be used by application\n"
454         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
455         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
456         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
457         "               --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
458         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
459         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
460         "               --socket-file: The path of the socket file.\n"
461         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
462         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
463         "               --client register a vhost-user socket in client mode.\n"
464         "               --dma-type register the DMA type for the vhost async driver. Only \"ioat\" is supported for now.\n"
465         "               --dmas register DMA channels for specific vhost devices.\n",
466                prgname);
467 }
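/*
 * For illustration, a hypothetical invocation enabling the async (DMA) data
 * path might look like (binary name and PCI address are placeholders):
 *
 *   ./dpdk-vhost [EAL options] -- -p 0x1 --mergeable 1 --vm2vm 1 \
 *       --socket-file /tmp/sock0 --client \
 *       --dma-type ioat --dmas [txd0@00:04.0]
 *
 * The exact --dmas channel syntax is whatever open_ioat() in ioat.c accepts;
 * see that parser for the authoritative format.
 */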
468
469 /*
470  * Parse the arguments given in the command line of the application.
471  */
472 static int
473 us_vhost_parse_args(int argc, char **argv)
474 {
475         int opt, ret;
476         int option_index;
477         unsigned i;
478         const char *prgname = argv[0];
479         static struct option long_option[] = {
480                 {"vm2vm", required_argument, NULL, 0},
481                 {"rx-retry", required_argument, NULL, 0},
482                 {"rx-retry-delay", required_argument, NULL, 0},
483                 {"rx-retry-num", required_argument, NULL, 0},
484                 {"mergeable", required_argument, NULL, 0},
485                 {"stats", required_argument, NULL, 0},
486                 {"socket-file", required_argument, NULL, 0},
487                 {"tx-csum", required_argument, NULL, 0},
488                 {"tso", required_argument, NULL, 0},
489                 {"client", no_argument, &client_mode, 1},
490                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
491                 {"dma-type", required_argument, NULL, 0},
492                 {"dmas", required_argument, NULL, 0},
493                 {NULL, 0, 0, 0},
494         };
495
496         /* Parse command line */
497         while ((opt = getopt_long(argc, argv, "p:P",
498                         long_option, &option_index)) != EOF) {
499                 switch (opt) {
500                 /* Portmask */
501                 case 'p':
502                         enabled_port_mask = parse_portmask(optarg);
503                         if (enabled_port_mask == 0) {
504                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
505                                 us_vhost_usage(prgname);
506                                 return -1;
507                         }
508                         break;
509
510                 case 'P':
511                         promiscuous = 1;
512                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
513                                 ETH_VMDQ_ACCEPT_BROADCAST |
514                                 ETH_VMDQ_ACCEPT_MULTICAST;
515
516                         break;
517
518                 case 0:
519                         /* Enable/disable vm2vm comms. */
520                         if (!strncmp(long_option[option_index].name, "vm2vm",
521                                 MAX_LONG_OPT_SZ)) {
522                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
523                                 if (ret == -1) {
524                                         RTE_LOG(INFO, VHOST_CONFIG,
525                                                 "Invalid argument for "
526                                                 "vm2vm [0|1|2]\n");
527                                         us_vhost_usage(prgname);
528                                         return -1;
529                                 } else {
530                                         vm2vm_mode = (vm2vm_type)ret;
531                                 }
532                         }
533
534                         /* Enable/disable retries on RX. */
535                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
536                                 ret = parse_num_opt(optarg, 1);
537                                 if (ret == -1) {
538                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
539                                         us_vhost_usage(prgname);
540                                         return -1;
541                                 } else {
542                                         enable_retry = ret;
543                                 }
544                         }
545
546                         /* Enable/disable TX checksum offload. */
547                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
548                                 ret = parse_num_opt(optarg, 1);
549                                 if (ret == -1) {
550                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
551                                         us_vhost_usage(prgname);
552                                         return -1;
553                                 } else
554                                         enable_tx_csum = ret;
555                         }
556
557                         /* Enable/disable TSO offload. */
558                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
559                                 ret = parse_num_opt(optarg, 1);
560                                 if (ret == -1) {
561                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
562                                         us_vhost_usage(prgname);
563                                         return -1;
564                                 } else
565                                         enable_tso = ret;
566                         }
567
568                         /* Specify the retry delay time (in microseconds) on RX. */
569                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
570                                 ret = parse_num_opt(optarg, INT32_MAX);
571                                 if (ret == -1) {
572                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
573                                         us_vhost_usage(prgname);
574                                         return -1;
575                                 } else {
576                                         burst_rx_delay_time = ret;
577                                 }
578                         }
579
580                         /* Specify the number of retries on RX. */
581                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
582                                 ret = parse_num_opt(optarg, INT32_MAX);
583                                 if (ret == -1) {
584                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
585                                         us_vhost_usage(prgname);
586                                         return -1;
587                                 } else {
588                                         burst_rx_retry_num = ret;
589                                 }
590                         }
591
592                         /* Enable/disable RX mergeable buffers. */
593                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
594                                 ret = parse_num_opt(optarg, 1);
595                                 if (ret == -1) {
596                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
597                                         us_vhost_usage(prgname);
598                                         return -1;
599                                 } else {
600                                         mergeable = !!ret;
601                                         if (ret) {
602                                                 vmdq_conf_default.rxmode.offloads |=
603                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
604                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
605                                                         = JUMBO_FRAME_MAX_SIZE;
606                                         }
607                                 }
608                         }
609
610                         /* Enable/disable stats. */
611                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
612                                 ret = parse_num_opt(optarg, INT32_MAX);
613                                 if (ret == -1) {
614                                         RTE_LOG(INFO, VHOST_CONFIG,
615                                                 "Invalid argument for stats [0..N]\n");
616                                         us_vhost_usage(prgname);
617                                         return -1;
618                                 } else {
619                                         enable_stats = ret;
620                                 }
621                         }
622
623                         /* Set socket file path. */
624                         if (!strncmp(long_option[option_index].name,
625                                                 "socket-file", MAX_LONG_OPT_SZ)) {
626                                 if (us_vhost_parse_socket_path(optarg) == -1) {
627                                         RTE_LOG(INFO, VHOST_CONFIG,
628                                         "Invalid argument for socket name (Max %d characters)\n",
629                                         PATH_MAX);
630                                         us_vhost_usage(prgname);
631                                         return -1;
632                                 }
633                         }
634
635                         if (!strncmp(long_option[option_index].name,
636                                                 "dma-type", MAX_LONG_OPT_SZ)) {
637                                 strlcpy(dma_type, optarg, MAX_LONG_OPT_SZ);
638                         }
639
640                         if (!strncmp(long_option[option_index].name,
641                                                 "dmas", MAX_LONG_OPT_SZ)) {
642                                 if (open_dma(optarg) == -1) {
643                                         RTE_LOG(INFO, VHOST_CONFIG,
644                                                 "Wrong DMA args\n");
645                                         us_vhost_usage(prgname);
646                                         return -1;
647                                 }
648                                 async_vhost_driver = 1;
649                         }
650
651                         break;
652
653                         /* Invalid option - print options. */
654                 default:
655                         us_vhost_usage(prgname);
656                         return -1;
657                 }
658         }
659
660         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
661                 if (enabled_port_mask & (1 << i))
662                         ports[num_ports++] = i;
663         }
664
665         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
666                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
667                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
668                 return -1;
669         }
670
671         return 0;
672 }
673
674 /*
675  * Update the global variable num_ports and the ports array according to the
676  * number of system ports, and return the number of valid ports.
677  */
678 static unsigned check_ports_num(unsigned nb_ports)
679 {
680         unsigned valid_num_ports = num_ports;
681         unsigned portid;
682
683         if (num_ports > nb_ports) {
684                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
685                         num_ports, nb_ports);
686                 num_ports = nb_ports;
687         }
688
689         for (portid = 0; portid < num_ports; portid ++) {
690                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
691                         RTE_LOG(INFO, VHOST_PORT,
692                                 "\nSpecified port ID(%u) is not valid\n",
693                                 ports[portid]);
694                         ports[portid] = INVALID_PORT_ID;
695                         valid_num_ports--;
696                 }
697         }
698         return valid_num_ports;
699 }
700
701 static __rte_always_inline struct vhost_dev *
702 find_vhost_dev(struct rte_ether_addr *mac)
703 {
704         struct vhost_dev *vdev;
705
706         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
707                 if (vdev->ready == DEVICE_RX &&
708                     rte_is_same_ether_addr(mac, &vdev->mac_address))
709                         return vdev;
710         }
711
712         return NULL;
713 }
714
715 /*
716  * This function learns the MAC address of the device and registers it, along
717  * with a VLAN tag, with a VMDQ pool.
718  */
719 static int
720 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
721 {
722         struct rte_ether_hdr *pkt_hdr;
723         int i, ret;
724
725         /* Learn MAC address of guest device from packet */
726         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
727
728         if (find_vhost_dev(&pkt_hdr->s_addr)) {
729                 RTE_LOG(ERR, VHOST_DATA,
730                         "(%d) device is using a registered MAC!\n",
731                         vdev->vid);
732                 return -1;
733         }
734
735         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
736                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
737
738         /* vlan_tag currently uses the device_id. */
739         vdev->vlan_tag = vlan_tags[vdev->vid];
740
741         /* Print out VMDQ registration info. */
742         RTE_LOG(INFO, VHOST_DATA,
743                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
744                 vdev->vid,
745                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
746                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
747                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
748                 vdev->vlan_tag);
749
750         /* Register the MAC address. */
751         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
752                                 (uint32_t)vdev->vid + vmdq_pool_base);
753         if (ret)
754                 RTE_LOG(ERR, VHOST_DATA,
755                         "(%d) failed to add device MAC address to VMDQ\n",
756                         vdev->vid);
757
758         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
759
760         /* Set device as ready for RX. */
761         vdev->ready = DEVICE_RX;
762
763         return 0;
764 }
765
766 /*
767  * Removes the MAC address and VLAN tag from the VMDQ. Ensures that nothing is adding
768  * buffers to the RX queue before disabling RX on the device.
769  */
770 static inline void
771 unlink_vmdq(struct vhost_dev *vdev)
772 {
773         unsigned i = 0;
774         unsigned rx_count;
775         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
776
777         if (vdev->ready == DEVICE_RX) {
778                 /* Clear MAC and VLAN settings */
779                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
780                 for (i = 0; i < 6; i++)
781                         vdev->mac_address.addr_bytes[i] = 0;
782
783                 vdev->vlan_tag = 0;
784
785                 /* Clear out the receive buffers */
786                 rx_count = rte_eth_rx_burst(ports[0],
787                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
788
789                 while (rx_count) {
790                         for (i = 0; i < rx_count; i++)
791                                 rte_pktmbuf_free(pkts_burst[i]);
792
793                         rx_count = rte_eth_rx_burst(ports[0],
794                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
795                 }
796
797                 vdev->ready = DEVICE_MAC_LEARNING;
798         }
799 }
800
801 static __rte_always_inline void
802 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
803             struct rte_mbuf *m)
804 {
805         uint16_t ret;
806
807         if (builtin_net_driver) {
808                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
809         } else {
810                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
811         }
812
813         if (enable_stats) {
814                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
815                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
816                 src_vdev->stats.tx_total++;
817                 src_vdev->stats.tx += ret;
818         }
819 }
820
821 /*
822  * Check if the packet destination MAC address is for a local device. If so then put
823  * the packet on that device's RX queue. If not then return.
824  */
825 static __rte_always_inline int
826 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
827 {
828         struct rte_ether_hdr *pkt_hdr;
829         struct vhost_dev *dst_vdev;
830
831         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
832
833         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
834         if (!dst_vdev)
835                 return -1;
836
837         if (vdev->vid == dst_vdev->vid) {
838                 RTE_LOG_DP(DEBUG, VHOST_DATA,
839                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
840                         vdev->vid);
841                 return 0;
842         }
843
844         RTE_LOG_DP(DEBUG, VHOST_DATA,
845                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
846
847         if (unlikely(dst_vdev->remove)) {
848                 RTE_LOG_DP(DEBUG, VHOST_DATA,
849                         "(%d) device is marked for removal\n", dst_vdev->vid);
850                 return 0;
851         }
852
853         virtio_xmit(dst_vdev, vdev, m);
854         return 0;
855 }
856
857 /*
858  * Check if the destination MAC of a packet belongs to a local VM,
859  * and if so, get its VLAN tag and offset.
860  */
861 static __rte_always_inline int
862 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
863         uint32_t *offset, uint16_t *vlan_tag)
864 {
865         struct vhost_dev *dst_vdev;
866         struct rte_ether_hdr *pkt_hdr =
867                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
868
869         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
870         if (!dst_vdev)
871                 return 0;
872
873         if (vdev->vid == dst_vdev->vid) {
874                 RTE_LOG_DP(DEBUG, VHOST_DATA,
875                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
876                         vdev->vid);
877                 return -1;
878         }
879
880         /*
881          * HW VLAN strip will reduce the packet length
882          * by the length of the VLAN tag, so we need to
883          * restore the packet length by adding it back.
884          */
885         *offset  = VLAN_HLEN;
886         *vlan_tag = vlan_tags[vdev->vid];
887
888         RTE_LOG_DP(DEBUG, VHOST_DATA,
889                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
890                 vdev->vid, dst_vdev->vid, *vlan_tag);
891
892         return 0;
893 }
894
895 static uint16_t
896 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
897 {
898         if (ol_flags & PKT_TX_IPV4)
899                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
900         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
901                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
902 }
903
904 static void virtio_tx_offload(struct rte_mbuf *m)
905 {
906         void *l3_hdr;
907         struct rte_ipv4_hdr *ipv4_hdr = NULL;
908         struct rte_tcp_hdr *tcp_hdr = NULL;
909         struct rte_ether_hdr *eth_hdr =
910                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
911
912         l3_hdr = (char *)eth_hdr + m->l2_len;
913
914         if (m->ol_flags & PKT_TX_IPV4) {
915                 ipv4_hdr = l3_hdr;
916                 ipv4_hdr->hdr_checksum = 0;
917                 m->ol_flags |= PKT_TX_IP_CKSUM;
918         }
919
920         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
921         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
922 }
923
924 static inline void
925 free_pkts(struct rte_mbuf **pkts, uint16_t n)
926 {
927         while (n--)
928                 rte_pktmbuf_free(pkts[n]);
929 }
930
931 static __rte_always_inline void
932 do_drain_mbuf_table(struct mbuf_table *tx_q)
933 {
934         uint16_t count;
935
936         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
937                                  tx_q->m_table, tx_q->len);
938         if (unlikely(count < tx_q->len))
939                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
940
941         tx_q->len = 0;
942 }
943
944 /*
945  * This function routes the TX packet to the correct interface. This
946  * may be a local device or the physical port.
947  */
948 static __rte_always_inline void
949 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
950 {
951         struct mbuf_table *tx_q;
952         unsigned offset = 0;
953         const uint16_t lcore_id = rte_lcore_id();
954         struct rte_ether_hdr *nh;
955
956
957         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
958         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
959                 struct vhost_dev *vdev2;
960
961                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
962                         if (vdev2 != vdev)
963                                 virtio_xmit(vdev2, vdev, m);
964                 }
965                 goto queue2nic;
966         }
967
968         /* Check if destination is a local VM */
969         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
970                 rte_pktmbuf_free(m);
971                 return;
972         }
973
974         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
975                 if (unlikely(find_local_dest(vdev, m, &offset,
976                                              &vlan_tag) != 0)) {
977                         rte_pktmbuf_free(m);
978                         return;
979                 }
980         }
981
982         RTE_LOG_DP(DEBUG, VHOST_DATA,
983                 "(%d) TX: MAC address is external\n", vdev->vid);
984
985 queue2nic:
986
987         /* Add packet to the port tx queue */
988         tx_q = &lcore_tx_queue[lcore_id];
989
990         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
991         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
992                 /* Guest has inserted the vlan tag. */
993                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
994                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
995                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
996                         (vh->vlan_tci != vlan_tag_be))
997                         vh->vlan_tci = vlan_tag_be;
998         } else {
999                 m->ol_flags |= PKT_TX_VLAN_PKT;
1000
1001                 /*
1002                  * Find the right seg to adjust the data len when offset is
1003                  * bigger than tail room size.
1004                  */
1005                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1006                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1007                                 m->data_len += offset;
1008                         else {
1009                                 struct rte_mbuf *seg = m;
1010
1011                                 while ((seg->next != NULL) &&
1012                                         (offset > rte_pktmbuf_tailroom(seg)))
1013                                         seg = seg->next;
1014
1015                                 seg->data_len += offset;
1016                         }
1017                         m->pkt_len += offset;
1018                 }
1019
1020                 m->vlan_tci = vlan_tag;
1021         }
1022
1023         if (m->ol_flags & PKT_TX_TCP_SEG)
1024                 virtio_tx_offload(m);
1025
1026         tx_q->m_table[tx_q->len++] = m;
1027         if (enable_stats) {
1028                 vdev->stats.tx_total++;
1029                 vdev->stats.tx++;
1030         }
1031
1032         if (unlikely(tx_q->len == MAX_PKT_BURST))
1033                 do_drain_mbuf_table(tx_q);
1034 }
1035
1036
1037 static __rte_always_inline void
1038 drain_mbuf_table(struct mbuf_table *tx_q)
1039 {
1040         static uint64_t prev_tsc;
1041         uint64_t cur_tsc;
1042
1043         if (tx_q->len == 0)
1044                 return;
1045
1046         cur_tsc = rte_rdtsc();
1047         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1048                 prev_tsc = cur_tsc;
1049
1050                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1051                         "TX queue drained after timeout with burst size %u\n",
1052                         tx_q->len);
1053                 do_drain_mbuf_table(tx_q);
1054         }
1055 }
1056
1057 static __rte_always_inline void
1058 drain_eth_rx(struct vhost_dev *vdev)
1059 {
1060         uint16_t rx_count, enqueue_count;
1061         struct rte_mbuf *pkts[MAX_PKT_BURST];
1062
1063         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1064                                     pkts, MAX_PKT_BURST);
1065         if (!rx_count)
1066                 return;
1067
1068         /*
1069          * When "enable_retry" is set, wait and retry when there
1070          * are not enough free slots in the queue to hold @rx_count
1071          * packets, to reduce packet loss.
1072          */
1073         if (enable_retry &&
1074             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1075                         VIRTIO_RXQ))) {
1076                 uint32_t retry;
1077
1078                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1079                         rte_delay_us(burst_rx_delay_time);
1080                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1081                                         VIRTIO_RXQ))
1082                                 break;
1083                 }
1084         }
1085
1086         if (builtin_net_driver) {
1087                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1088                                                 pkts, rx_count);
1089         } else {
1090                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1091                                                 pkts, rx_count);
1092         }
1093         if (enable_stats) {
1094                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1095                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1096         }
1097
1098         free_pkts(pkts, rx_count);
1099 }
1100
1101 static __rte_always_inline void
1102 drain_virtio_tx(struct vhost_dev *vdev)
1103 {
1104         struct rte_mbuf *pkts[MAX_PKT_BURST];
1105         uint16_t count;
1106         uint16_t i;
1107
1108         if (builtin_net_driver) {
1109                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1110                                         pkts, MAX_PKT_BURST);
1111         } else {
1112                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1113                                         mbuf_pool, pkts, MAX_PKT_BURST);
1114         }
1115
1116         /* setup VMDq for the first packet */
1117         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1118                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1119                         free_pkts(pkts, count);
1120         }
1121
1122         for (i = 0; i < count; ++i)
1123                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1124 }
1125
1126 /*
1127  * Main function of vhost-switch. It basically does:
1128  *
1129  * for each vhost device {
1130  *    - drain_eth_rx()
1131  *
1132  *      Which drains the host eth Rx queue linked to the vhost device,
1133  *      and delivers all of them to the guest virtio Rx ring associated with
1134  *      this vhost device.
1135  *
1136  *    - drain_virtio_tx()
1137  *
1138  *      Which drains the guest virtio Tx queue and delivers all of them
1139  *      to the target, which could be another vhost device, or the
1140  *      physical eth dev. The route is done in function "virtio_tx_route".
1141  * }
1142  */
1143 static int
1144 switch_worker(void *arg __rte_unused)
1145 {
1146         unsigned i;
1147         unsigned lcore_id = rte_lcore_id();
1148         struct vhost_dev *vdev;
1149         struct mbuf_table *tx_q;
1150
1151         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1152
1153         tx_q = &lcore_tx_queue[lcore_id];
1154         for (i = 0; i < rte_lcore_count(); i++) {
1155                 if (lcore_ids[i] == lcore_id) {
1156                         tx_q->txq_id = i;
1157                         break;
1158                 }
1159         }
1160
1161         while(1) {
1162                 drain_mbuf_table(tx_q);
1163
1164                 /*
1165                  * Inform the configuration core that we have exited the
1166                  * linked list and that no devices are in use if requested.
1167                  */
1168                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1169                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1170
1171                 /*
1172                  * Process vhost devices
1173                  */
1174                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1175                               lcore_vdev_entry) {
1176                         if (unlikely(vdev->remove)) {
1177                                 unlink_vmdq(vdev);
1178                                 vdev->ready = DEVICE_SAFE_REMOVE;
1179                                 continue;
1180                         }
1181
1182                         if (likely(vdev->ready == DEVICE_RX))
1183                                 drain_eth_rx(vdev);
1184
1185                         if (likely(!vdev->remove))
1186                                 drain_virtio_tx(vdev);
1187                 }
1188         }
1189
1190         return 0;
1191 }
1192
1193 /*
1194  * Remove a device from the specific data core linked list and from the
1195  * main linked list. Synchonization  occurs through the use of the
1196  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1197  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1198  */
1199 static void
1200 destroy_device(int vid)
1201 {
1202         struct vhost_dev *vdev = NULL;
1203         int lcore;
1204
1205         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1206                 if (vdev->vid == vid)
1207                         break;
1208         }
1209         if (!vdev)
1210                 return;
1211         /* Set the remove flag. */
1212         vdev->remove = 1;
1213         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1214                 rte_pause();
1215         }
1216
1217         if (builtin_net_driver)
1218                 vs_vhost_net_remove(vdev);
1219
1220         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1221                      lcore_vdev_entry);
1222         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1223
1224
1225         /* Set the dev_removal_flag on each lcore. */
1226         RTE_LCORE_FOREACH_WORKER(lcore)
1227                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1228
1229         /*
1230          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1231          * we can be sure that they can no longer access the device removed
1232          * from the linked lists and that the devices are no longer in use.
1233          */
1234         RTE_LCORE_FOREACH_WORKER(lcore) {
1235                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1236                         rte_pause();
1237         }
1238
1239         lcore_info[vdev->coreid].device_num--;
1240
1241         RTE_LOG(INFO, VHOST_DATA,
1242                 "(%d) device has been removed from data core\n",
1243                 vdev->vid);
1244
1245         rte_free(vdev);
1246 }
1247
1248 /*
1249  * A new device is added to a data core. First the device is added to the main linked list
1250  * and then allocated to a specific data core.
1251  */
1252 static int
1253 new_device(int vid)
1254 {
1255         int lcore, core_add = 0;
1256         uint32_t device_num_min = num_devices;
1257         struct vhost_dev *vdev;
1258
1259         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1260         if (vdev == NULL) {
1261                 RTE_LOG(INFO, VHOST_DATA,
1262                         "(%d) couldn't allocate memory for vhost dev\n",
1263                         vid);
1264                 return -1;
1265         }
1266         vdev->vid = vid;
1267
1268         if (builtin_net_driver)
1269                 vs_vhost_net_setup(vdev);
1270
1271         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1272         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1273
1274         /* Reset ready flag */
1275         vdev->ready = DEVICE_MAC_LEARNING;
1276         vdev->remove = 0;
1277
1278         /* Find a suitable lcore to add the device. */
1279         RTE_LCORE_FOREACH_WORKER(lcore) {
1280                 if (lcore_info[lcore].device_num < device_num_min) {
1281                         device_num_min = lcore_info[lcore].device_num;
1282                         core_add = lcore;
1283                 }
1284         }
1285         vdev->coreid = core_add;
1286
1287         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1288                           lcore_vdev_entry);
1289         lcore_info[vdev->coreid].device_num++;
1290
1291         /* Disable notifications. */
1292         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1293         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1294
1295         RTE_LOG(INFO, VHOST_DATA,
1296                 "(%d) device has been added to data core %d\n",
1297                 vid, vdev->coreid);
1298
1299         return 0;
1300 }
1301
1302 /*
1303  * These callbacks allow devices to be added to the data core when configuration
1304  * has been fully completed.
1305  */
1306 static const struct vhost_device_ops virtio_net_device_ops =
1307 {
1308         .new_device =  new_device,
1309         .destroy_device = destroy_device,
1310 };
1311
1312 /*
1313  * This thread wakes up periodically to print stats if the user has
1314  * enabled them.
1315  */
1316 static void *
1317 print_stats(__rte_unused void *arg)
1318 {
1319         struct vhost_dev *vdev;
1320         uint64_t tx_dropped, rx_dropped;
1321         uint64_t tx, tx_total, rx, rx_total;
1322         const char clr[] = { 27, '[', '2', 'J', '\0' };
1323         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1324
1325         while(1) {
1326                 sleep(enable_stats);
1327
1328                 /* Clear screen and move to top left */
1329                 printf("%s%s\n", clr, top_left);
1330                 printf("Device statistics =================================\n");
1331
1332                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1333                         tx_total   = vdev->stats.tx_total;
1334                         tx         = vdev->stats.tx;
1335                         tx_dropped = tx_total - tx;
1336
1337                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1338                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1339                         rx_dropped = rx_total - rx;
1340
1341                         printf("Statistics for device %d\n"
1342                                 "-----------------------\n"
1343                                 "TX total:              %" PRIu64 "\n"
1344                                 "TX dropped:            %" PRIu64 "\n"
1345                                 "TX successful:         %" PRIu64 "\n"
1346                                 "RX total:              %" PRIu64 "\n"
1347                                 "RX dropped:            %" PRIu64 "\n"
1348                                 "RX successful:         %" PRIu64 "\n",
1349                                 vdev->vid,
1350                                 tx_total, tx_dropped, tx,
1351                                 rx_total, rx_dropped, rx);
1352                 }
1353
1354                 printf("===================================================\n");
1355
1356                 fflush(stdout);
1357         }
1358
1359         return NULL;
1360 }
1361
1362 static void
1363 unregister_drivers(int socket_num)
1364 {
1365         int i, ret;
1366
1367         for (i = 0; i < socket_num; i++) {
1368                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1369                 if (ret != 0)
1370                         RTE_LOG(ERR, VHOST_CONFIG,
1371                                 "Failed to unregister vhost driver for %s.\n",
1372                                 socket_files + i * PATH_MAX);
1373         }
1374 }
1375
1376 /* When we receive an INT signal, unregister the vhost driver */
1377 static void
1378 sigint_handler(__rte_unused int signum)
1379 {
1380         /* Unregister vhost driver. */
1381         unregister_drivers(nb_sockets);
1382
1383         exit(0);
1384 }
1385
1386 /*
1387  * While creating an mbuf pool, one key thing is to figure out how
1388  * many mbuf entries are enough for our use. Here are some
1389  * guidelines:
1390  *
1391  * - Each rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1392  *
1393  * - For each switch core (a CPU core that does the packet switching), we
1394  *   also need to reserve some mbufs for receiving packets from the virtio
1395  *   Tx queue. How many are enough depends on the usage. It's normally
1396  *   a simple calculation like the following:
1397  *
1398  *       MAX_PKT_BURST * max packet size / mbuf size
1399  *
1400  *   So we definitely need to allocate more mbufs when TSO is enabled.
1401  *
1402  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1403  *   mbufs for receiving packets from the physical NIC device.
1404  *
1405  * - We also need to make sure, for each switch core, we have allocated
1406  *   enough mbufs to fill up the mbuf cache.
1407  */
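/*
 * A rough worked example of the sizing done in create_mbuf_pool() below,
 * assuming MAX_PKT_BURST is 32 and MBUF_DATA_SIZE is 2176 bytes (2048 data
 * room + 128 bytes headroom): with the default MTU of 1500,
 *   nr_mbufs_per_core = (1500 + 2176) * 32 / (2176 - 128) ~= 57 mbufs,
 * plus nr_rx_desc (1024) reserved for the physical NIC Rx queue, giving
 * roughly 1081 mbufs per switch core before the per-queue reservations are
 * added. Larger MTUs (mergeable buffers, TSO) scale the first term up.
 */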
1408 static void
1409 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1410         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1411 {
1412         uint32_t nr_mbufs;
1413         uint32_t nr_mbufs_per_core;
1414         uint32_t mtu = 1500;
1415
1416         if (mergeable)
1417                 mtu = 9000;
1418         if (enable_tso)
1419                 mtu = 64 * 1024;
1420
1421         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1422                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1423         nr_mbufs_per_core += nr_rx_desc;
1424         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1425
1426         nr_mbufs  = nr_queues * nr_rx_desc;
1427         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1428         nr_mbufs *= nr_port;
1429
1430         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1431                                             nr_mbuf_cache, 0, mbuf_size,
1432                                             rte_socket_id());
1433         if (mbuf_pool == NULL)
1434                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1435 }
1436
1437 /*
1438  * Main function, does initialisation and calls the per-lcore functions.
1439  */
1440 int
1441 main(int argc, char *argv[])
1442 {
1443         unsigned lcore_id, core_id = 0;
1444         unsigned nb_ports, valid_num_ports;
1445         int ret, i;
1446         uint16_t portid;
1447         static pthread_t tid;
1448         uint64_t flags = 0;
1449
1450         signal(SIGINT, sigint_handler);
1451
1452         /* init EAL */
1453         ret = rte_eal_init(argc, argv);
1454         if (ret < 0)
1455                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1456         argc -= ret;
1457         argv += ret;
1458
1459         /* parse app arguments */
1460         ret = us_vhost_parse_args(argc, argv);
1461         if (ret < 0)
1462                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1463
1464         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1465                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1466
1467                 if (rte_lcore_is_enabled(lcore_id))
1468                         lcore_ids[core_id++] = lcore_id;
1469         }
1470
1471         if (rte_lcore_count() > RTE_MAX_LCORE)
1472                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1473
1474         /* Get the number of physical ports. */
1475         nb_ports = rte_eth_dev_count_avail();
1476
1477         /*
1478          * Update the global variable num_ports and the global ports array,
1479          * and get the number of valid ports according to the system port count.
1480          */
1481         valid_num_ports = check_ports_num(nb_ports);
1482
1483         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1484                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1485                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1486                 return -1;
1487         }
1488
1489         /*
1490          * FIXME: here we are trying to allocate mbufs big enough for
1491          * @MAX_QUEUES, but the truth is we're never going to use that
1492          * many queues here. We probably should only do allocation for
1493          * those queues we are going to use.
1494          */
1495         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1496                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1497
1498         if (vm2vm_mode == VM2VM_HARDWARE) {
1499                 /* Enable VT loop back to let L2 switch to do it. */
1500                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1501                 RTE_LOG(DEBUG, VHOST_CONFIG,
1502                         "Enable loop back for L2 switch in vmdq.\n");
1503         }
1504
1505         /* initialize all ports */
1506         RTE_ETH_FOREACH_DEV(portid) {
1507                 /* skip ports that are not enabled */
1508                 if ((enabled_port_mask & (1 << portid)) == 0) {
1509                         RTE_LOG(INFO, VHOST_PORT,
1510                                 "Skipping disabled port %d\n", portid);
1511                         continue;
1512                 }
1513                 if (port_init(portid) != 0)
1514                         rte_exit(EXIT_FAILURE,
1515                                 "Cannot initialize network ports\n");
1516         }
1517
1518         /* Enable stats if the user option is set. */
1519         if (enable_stats) {
1520                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1521                                         print_stats, NULL);
1522                 if (ret < 0)
1523                         rte_exit(EXIT_FAILURE,
1524                                 "Cannot create print-stats thread\n");
1525         }
1526
1527         /* Launch all data cores. */
1528         RTE_LCORE_FOREACH_WORKER(lcore_id)
1529                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1530
1531         if (client_mode)
1532                 flags |= RTE_VHOST_USER_CLIENT;
1533
1534         /* Register vhost user driver to handle vhost messages. */
1535         for (i = 0; i < nb_sockets; i++) {
1536                 char *file = socket_files + i * PATH_MAX;
1537                 ret = rte_vhost_driver_register(file, flags);
1538                 if (ret != 0) {
1539                         unregister_drivers(i);
1540                         rte_exit(EXIT_FAILURE,
1541                                 "vhost driver register failure.\n");
1542                 }
1543
1544                 if (builtin_net_driver)
1545                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1546
1547                 if (mergeable == 0) {
1548                         rte_vhost_driver_disable_features(file,
1549                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1550                 }
1551
1552                 if (enable_tx_csum == 0) {
1553                         rte_vhost_driver_disable_features(file,
1554                                 1ULL << VIRTIO_NET_F_CSUM);
1555                 }
1556
1557                 if (enable_tso == 0) {
1558                         rte_vhost_driver_disable_features(file,
1559                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1560                         rte_vhost_driver_disable_features(file,
1561                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1562                         rte_vhost_driver_disable_features(file,
1563                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1564                         rte_vhost_driver_disable_features(file,
1565                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1566                 }
1567
1568                 if (promiscuous) {
1569                         rte_vhost_driver_enable_features(file,
1570                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1571                 }
1572
1573                 ret = rte_vhost_driver_callback_register(file,
1574                         &virtio_net_device_ops);
1575                 if (ret != 0) {
1576                         rte_exit(EXIT_FAILURE,
1577                                 "failed to register vhost driver callbacks.\n");
1578                 }
1579
1580                 if (rte_vhost_driver_start(file) < 0) {
1581                         rte_exit(EXIT_FAILURE,
1582                                 "failed to start vhost driver.\n");
1583                 }
1584         }
1585
1586         RTE_LCORE_FOREACH_WORKER(lcore_id)
1587                 rte_eal_wait_lcore(lcore_id);
1588
1589         return 0;
1590
1591 }