vhost: enhance async enqueue for small packets
examples/vhost/main.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "ioat.h"
#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

static int async_vhost_driver;

static char dma_type[MAX_LONG_OPT_SZ];

/* Specify the timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* Empty VMDq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                /*
                 * VLAN strip is necessary for 1G NICs such as the I350;
                 * it fixes a bug where IPv4 forwarding in the guest could
                 * not forward packets from one virtio device to another.
                 */
                .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
                .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
                             DEV_TX_OFFLOAD_TCP_CKSUM |
                             DEV_TX_OFFLOAD_VLAN_INSERT |
                             DEV_TX_OFFLOAD_MULTI_SEGS |
                             DEV_TX_OFFLOAD_TCP_TSO),
        },
        .rx_adv_conf = {
                /*
                 * should be overridden separately in code with
                 * appropriate values
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
        TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

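/*
 * Convert the 100 us TX drain period into TSC cycles; the per-microsecond
 * rate is rounded up, so e.g. a 2.0 GHz TSC gives roughly 200,000 cycles.
 */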
#define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
                                 / US_PER_S * BURST_TX_DRAIN_US)
#define VLAN_HLEN       4

static inline int
open_dma(const char *value)
{
        if (strncmp(dma_type, "ioat", 4) == 0)
                return open_ioat(value);

        return -1;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        struct rte_eth_vmdq_rx_conf *def_conf =
                &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back = def_conf->enable_loop_back;
        conf.rx_mode = def_conf->rx_mode;

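        /* Map VLAN tag i to pool i only: "pools" is a bitmask with one bit per VMDq pool. */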
        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        struct rte_eth_rxconf *rxconf;
        struct rte_eth_txconf *txconf;
        int16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /*
         * The max pool number from dev_info is used to validate the pool
         * number specified on the command line.
         */
        retval = rte_eth_dev_info_get(port, &dev_info);
        if (retval != 0) {
                RTE_LOG(ERR, VHOST_PORT,
                        "Error during getting device (port %u) info: %s\n",
                        port, strerror(-retval));

                return retval;
        }

        rxconf = &dev_info.default_rxconf;
        txconf = &dev_info.default_txconf;
        rxconf->rx_drop_en = 1;

        /* Configure the number of supported virtio devices based on VMDq limits */
        num_devices = dev_info.max_vmdq_pools;

        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

        tx_rings = (uint16_t)rte_lcore_count();

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;
        /* NIC queues are divided into pf queues and vmdq queues. */
        num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
        queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
        num_vmdq_queues = num_devices * queues_per_pool;
        num_queues = num_pf_queues + num_vmdq_queues;
        vmdq_queue_base = dev_info.vmdq_queue_base;
        vmdq_pool_base  = dev_info.vmdq_pool_base;
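        /*
         * Note: device (pool) i later receives on NIC queue
         * vmdq_queue_base + i * queues_per_pool; see new_device().
         */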
276         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
277                 num_pf_queues, num_devices, queues_per_pool);
278
279         if (!rte_eth_dev_is_valid_port(port))
280                 return -1;
281
282         rx_rings = (uint16_t)dev_info.max_rx_queues;
283         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
284                 port_conf.txmode.offloads |=
285                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
286         /* Configure ethernet device. */
287         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
288         if (retval != 0) {
289                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
290                         port, strerror(-retval));
291                 return retval;
292         }
293
294         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
295                 &tx_ring_size);
296         if (retval != 0) {
297                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
298                         "for port %u: %s.\n", port, strerror(-retval));
299                 return retval;
300         }
301         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
302                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
303                         "for Rx queues on port %u.\n", port);
304                 return -1;
305         }
306
307         /* Setup the queues. */
308         rxconf->offloads = port_conf.rxmode.offloads;
309         for (q = 0; q < rx_rings; q ++) {
310                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
311                                                 rte_eth_dev_socket_id(port),
312                                                 rxconf,
313                                                 mbuf_pool);
314                 if (retval < 0) {
315                         RTE_LOG(ERR, VHOST_PORT,
316                                 "Failed to setup rx queue %u of port %u: %s.\n",
317                                 q, port, strerror(-retval));
318                         return retval;
319                 }
320         }
321         txconf->offloads = port_conf.txmode.offloads;
322         for (q = 0; q < tx_rings; q ++) {
323                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
324                                                 rte_eth_dev_socket_id(port),
325                                                 txconf);
326                 if (retval < 0) {
327                         RTE_LOG(ERR, VHOST_PORT,
328                                 "Failed to setup tx queue %u of port %u: %s.\n",
329                                 q, port, strerror(-retval));
330                         return retval;
331                 }
332         }
333
334         /* Start the device. */
335         retval  = rte_eth_dev_start(port);
336         if (retval < 0) {
337                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
338                         port, strerror(-retval));
339                 return retval;
340         }
341
342         if (promiscuous) {
343                 retval = rte_eth_promiscuous_enable(port);
344                 if (retval != 0) {
345                         RTE_LOG(ERR, VHOST_PORT,
346                                 "Failed to enable promiscuous mode on port %u: %s\n",
347                                 port, rte_strerror(-retval));
348                         return retval;
349                 }
350         }
351
352         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
353         if (retval < 0) {
354                 RTE_LOG(ERR, VHOST_PORT,
355                         "Failed to get MAC address on port %u: %s\n",
356                         port, rte_strerror(-retval));
357                 return retval;
358         }
359
360         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
361         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
362                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
363                         port,
364                         vmdq_ports_eth_addr[port].addr_bytes[0],
365                         vmdq_ports_eth_addr[port].addr_bytes[1],
366                         vmdq_ports_eth_addr[port].addr_bytes[2],
367                         vmdq_ports_eth_addr[port].addr_bytes[3],
368                         vmdq_ports_eth_addr[port].addr_bytes[4],
369                         vmdq_ports_eth_addr[port].addr_bytes[5]);
370
371         return 0;
372 }
373
374 /*
375  * Set socket file path.
376  */
377 static int
378 us_vhost_parse_socket_path(const char *q_arg)
379 {
380         char *old;
381
        /* Reject paths that do not fit: no terminating NUL found within PATH_MAX */
        if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
                return -1;

        old = socket_files;
        socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
        if (socket_files == NULL) {
                free(old);
                return -1;
        }

        strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
        nb_sockets++;

        return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return 0;

        return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --socket-file <path>\n"
        "               --nb-devices ND\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
455         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
456         "               --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
457         "               --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
458         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
459         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
460         "               --socket-file: The path of the socket file.\n"
461         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
462         "               --tso [0|1] disable/enable TCP segment offload.\n"
463         "               --client register a vhost-user socket as client mode.\n"
464         "               --dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
465         "               --dmas register dma channel for specific vhost device.\n",
466                prgname);
467 }

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"socket-file", required_argument, NULL, 0},
                {"tx-csum", required_argument, NULL, 0},
                {"tso", required_argument, NULL, 0},
                {"client", no_argument, &client_mode, 1},
                {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
                {"dma-type", required_argument, NULL, 0},
                {"dmas", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:P",
                        long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        enabled_port_mask = parse_portmask(optarg);
                        if (enabled_port_mask == 0) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        break;

                case 'P':
                        promiscuous = 1;
                        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
                                ETH_VMDQ_ACCEPT_BROADCAST |
                                ETH_VMDQ_ACCEPT_MULTICAST;

                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Enable/disable TX checksum offload. */
                        if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tx_csum = ret;
                        }

                        /* Enable/disable TSO offload. */
                        if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tso = ret;
                        }

                        /* Specify the retry delay time (in microseconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        mergeable = !!ret;
                                        if (ret) {
                                                vmdq_conf_default.rxmode.offloads |=
                                                        DEV_RX_OFFLOAD_JUMBO_FRAME;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                        }
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for stats [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set socket file path. */
                        if (!strncmp(long_option[option_index].name,
                                                "socket-file", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_socket_path(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for socket name (Max %d characters)\n",
                                        PATH_MAX);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        if (!strncmp(long_option[option_index].name,
                                                "dma-type", MAX_LONG_OPT_SZ)) {
                                if (strlen(optarg) >= MAX_LONG_OPT_SZ) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Wrong DMA type\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                                strcpy(dma_type, optarg);
                        }

                        if (!strncmp(long_option[option_index].name,
                                                "dmas", MAX_LONG_OPT_SZ)) {
                                if (open_dma(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Wrong DMA args\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                                async_vhost_driver = 1;
                        }

                        break;

                /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        return 0;
}

/*
 * Update the global variable num_ports and the ports array according to the
 * number of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (!rte_eth_dev_is_valid_port(ports[portid])) {
                        RTE_LOG(INFO, VHOST_PORT,
                                "\nSpecified port ID(%u) is not valid\n",
                                ports[portid]);
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
        struct vhost_dev *vdev;

        TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
                if (vdev->ready == DEVICE_RX &&
                    rte_is_same_ether_addr(mac, &vdev->mac_address))
                        return vdev;
        }

        return NULL;
}

/*
 * This function learns the MAC address of the device and registers it,
 * along with a VLAN tag, with VMDq.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct rte_ether_hdr *pkt_hdr;
        int i, ret;

        /* Learn MAC address of guest device from packet */
        pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

        if (find_vhost_dev(&pkt_hdr->s_addr)) {
                RTE_LOG(ERR, VHOST_DATA,
                        "(%d) device is using a registered MAC!\n",
                        vdev->vid);
                return -1;
        }

        for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* vlan_tag currently uses the device_id. */
        vdev->vlan_tag = vlan_tags[vdev->vid];

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA,
                "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
                vdev->vid,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
                                (uint32_t)vdev->vid + vmdq_pool_base);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA,
                        "(%d) failed to add device MAC address to VMDQ\n",
                        vdev->vid);

        rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* Clear MAC and VLAN settings */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < 6; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers */
                rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}

static __rte_always_inline void
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
            struct rte_mbuf *m)
{
        uint16_t ret;
        struct rte_mbuf *m_cpl[1], *comp_pkt;
        uint32_t nr_comp = 0;

        if (builtin_net_driver) {
                ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
        } else if (async_vhost_driver) {
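                /*
                 * With async enqueue, a small packet may be copied by the
                 * CPU and completed immediately (returned via comp_pkt /
                 * nr_comp); larger ones are offloaded to the DMA engine
                 * and are reaped by the poll loop below.
                 */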
                ret = rte_vhost_submit_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ,
                                                &m, 1, &comp_pkt, &nr_comp);
                if (nr_comp == 1)
                        goto done;

                if (likely(ret))
                        dst_vdev->nr_async_pkts++;

                while (likely(dst_vdev->nr_async_pkts)) {
                        if (rte_vhost_poll_enqueue_completed(dst_vdev->vid,
                                        VIRTIO_RXQ, m_cpl, 1))
                                dst_vdev->nr_async_pkts--;
                }
        } else {
                ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
        }

done:
        if (enable_stats) {
                rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
                rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
                src_vdev->stats.tx_total++;
                src_vdev->stats.tx += ret;
        }
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct rte_ether_hdr *pkt_hdr;
        struct vhost_dev *dst_vdev;

        pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

        dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
        if (!dst_vdev)
                return -1;

        if (vdev->vid == dst_vdev->vid) {
                RTE_LOG_DP(DEBUG, VHOST_DATA,
                        "(%d) TX: src and dst MAC is same. Dropping packet.\n",
                        vdev->vid);
                return 0;
        }

        RTE_LOG_DP(DEBUG, VHOST_DATA,
                "(%d) TX: MAC address is local\n", dst_vdev->vid);

        if (unlikely(dst_vdev->remove)) {
                RTE_LOG_DP(DEBUG, VHOST_DATA,
                        "(%d) device is marked for removal\n", dst_vdev->vid);
                return 0;
        }

        virtio_xmit(dst_vdev, vdev, m);
        return 0;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
        uint32_t *offset, uint16_t *vlan_tag)
{
        struct vhost_dev *dst_vdev;
        struct rte_ether_hdr *pkt_hdr =
                rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

        dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
        if (!dst_vdev)
                return 0;

        if (vdev->vid == dst_vdev->vid) {
                RTE_LOG_DP(DEBUG, VHOST_DATA,
                        "(%d) TX: src and dst MAC is same. Dropping packet.\n",
                        vdev->vid);
                return -1;
        }

        /*
         * HW VLAN strip reduces the packet length by the length of the
         * VLAN tag, so the packet length must be restored by adding the
         * tag length back.
         */
        *offset  = VLAN_HLEN;
        *vlan_tag = vlan_tags[vdev->vid];

        RTE_LOG_DP(DEBUG, VHOST_DATA,
                "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
                vdev->vid, dst_vdev->vid, *vlan_tag);

        return 0;
}

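/*
 * Compute the L4 pseudo-header checksum used to seed the TCP checksum
 * when checksum/TSO offload is requested for the packet.
 */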
static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
        if (ol_flags & PKT_TX_IPV4)
                return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
        else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
                return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

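/*
 * Prepare a TSO packet for the NIC: request IP checksum offload for IPv4
 * and seed the TCP checksum with the pseudo-header sum. This assumes
 * l2_len/l3_len and the offload flags were set when the packet was
 * dequeued from the guest (from the virtio-net header).
 */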
static void virtio_tx_offload(struct rte_mbuf *m)
{
        void *l3_hdr;
        struct rte_ipv4_hdr *ipv4_hdr = NULL;
        struct rte_tcp_hdr *tcp_hdr = NULL;
        struct rte_ether_hdr *eth_hdr =
                rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

        l3_hdr = (char *)eth_hdr + m->l2_len;

        if (m->ol_flags & PKT_TX_IPV4) {
                ipv4_hdr = l3_hdr;
                ipv4_hdr->hdr_checksum = 0;
                m->ol_flags |= PKT_TX_IP_CKSUM;
        }

        tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
        tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
        while (n--)
                rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
        uint16_t count;

        count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
                                 tx_q->m_table, tx_q->len);
        if (unlikely(count < tx_q->len))
                free_pkts(&tx_q->m_table[count], tx_q->len - count);

        tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
        struct mbuf_table *tx_q;
        unsigned offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct rte_ether_hdr *nh;

        nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
        if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
                struct vhost_dev *vdev2;

                TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
                        if (vdev2 != vdev)
                                virtio_xmit(vdev2, vdev, m);
                }
                goto queue2nic;
        }

        /* Check if destination is a local VM */
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
                rte_pktmbuf_free(m);
                return;
        }

        if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                if (unlikely(find_local_dest(vdev, m, &offset,
                                             &vlan_tag) != 0)) {
                        rte_pktmbuf_free(m);
                        return;
                }
        }

        RTE_LOG_DP(DEBUG, VHOST_DATA,
                "(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

        /* Add packet to the port TX queue */
        tx_q = &lcore_tx_queue[lcore_id];

        nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
        if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
                /* Guest has inserted the vlan tag. */
                struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
                uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
                if ((vm2vm_mode == VM2VM_HARDWARE) &&
                        (vh->vlan_tci != vlan_tag_be))
                        vh->vlan_tci = vlan_tag_be;
        } else {
                m->ol_flags |= PKT_TX_VLAN_PKT;

                /*
                 * Find the right seg to adjust the data len when offset is
                 * bigger than tail room size.
                 */
                if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                        if (likely(offset <= rte_pktmbuf_tailroom(m)))
                                m->data_len += offset;
                        else {
                                struct rte_mbuf *seg = m;

                                while ((seg->next != NULL) &&
                                        (offset > rte_pktmbuf_tailroom(seg)))
                                        seg = seg->next;

                                seg->data_len += offset;
                        }
                        m->pkt_len += offset;
                }

                m->vlan_tci = vlan_tag;
        }

        if (m->ol_flags & PKT_TX_TCP_SEG)
                virtio_tx_offload(m);

        tx_q->m_table[tx_q->len++] = m;
        if (enable_stats) {
                vdev->stats.tx_total++;
                vdev->stats.tx++;
        }

        if (unlikely(tx_q->len == MAX_PKT_BURST))
                do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
        static uint64_t prev_tsc;
        uint64_t cur_tsc;

        if (tx_q->len == 0)
                return;

        cur_tsc = rte_rdtsc();
        if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
                prev_tsc = cur_tsc;

                RTE_LOG_DP(DEBUG, VHOST_DATA,
                        "TX queue drained after timeout with burst size %u\n",
                        tx_q->len);
                do_drain_mbuf_table(tx_q);
        }
}

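/*
 * Reap completions of in-flight async enqueues on @qid and free the
 * mbufs whose copies the DMA engine has finished.
 */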
static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev, uint16_t qid)
{
        struct rte_mbuf *p_cpl[MAX_PKT_BURST];
        uint16_t complete_count;

        complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
                                                qid, p_cpl, MAX_PKT_BURST);
        vdev->nr_async_pkts -= complete_count;
        if (complete_count)
                free_pkts(p_cpl, complete_count);
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
        uint16_t rx_count, enqueue_count;
        struct rte_mbuf *pkts[MAX_PKT_BURST], *comp_pkts[MAX_PKT_BURST];
        uint32_t nr_comp = 0;

        rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
                                    pkts, MAX_PKT_BURST);

        while (likely(vdev->nr_async_pkts))
                complete_async_pkts(vdev, VIRTIO_RXQ);

        if (!rx_count)
                return;

        /*
         * When "enable_retry" is set, wait and retry when there are not
         * enough free slots in the queue to hold @rx_count packets, to
         * diminish packet loss.
         */
        if (enable_retry &&
            unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
                        VIRTIO_RXQ))) {
                uint32_t retry;

                for (retry = 0; retry < burst_rx_retry_num; retry++) {
                        rte_delay_us(burst_rx_delay_time);
                        if (rx_count <= rte_vhost_avail_entries(vdev->vid,
                                        VIRTIO_RXQ))
                                break;
                }
        }

        if (builtin_net_driver) {
                enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
                                                pkts, rx_count);
        } else if (async_vhost_driver) {
                enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
                                        VIRTIO_RXQ, pkts, rx_count, comp_pkts,
                                        &nr_comp);
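                /*
                 * Packets returned in comp_pkts were copied synchronously
                 * (the small-packet path) and can be freed right away; the
                 * rest are in flight on the DMA engine and stay accounted
                 * in nr_async_pkts until completion.
                 */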
                if (nr_comp > 0) {
                        free_pkts(comp_pkts, nr_comp);
                        enqueue_count -= nr_comp;
                }
                vdev->nr_async_pkts += enqueue_count;
        } else {
                enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
                                                pkts, rx_count);
        }

        if (enable_stats) {
                rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
                rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
        }

1148
1149         if (!async_vhost_driver)
1150                 free_pkts(pkts, rx_count);
1151 }
1152
1153 static __rte_always_inline void
1154 drain_virtio_tx(struct vhost_dev *vdev)
1155 {
1156         struct rte_mbuf *pkts[MAX_PKT_BURST];
1157         uint16_t count;
1158         uint16_t i;
1159
1160         if (builtin_net_driver) {
1161                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1162                                         pkts, MAX_PKT_BURST);
1163         } else {
1164                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1165                                         mbuf_pool, pkts, MAX_PKT_BURST);
1166         }
1167
1168         /* setup VMDq for the first packet */
1169         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1170                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1171                         free_pkts(pkts, count);
1172         }
1173
1174         for (i = 0; i < count; ++i)
1175                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1176 }
1177
/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The routing is done in virtio_tx_route().
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
        unsigned i;
        unsigned lcore_id = rte_lcore_id();
        struct vhost_dev *vdev;
        struct mbuf_table *tx_q;

1203         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);

        tx_q = &lcore_tx_queue[lcore_id];
        for (i = 0; i < rte_lcore_count(); i++) {
                if (lcore_ids[i] == lcore_id) {
                        tx_q->txq_id = i;
                        break;
                }
        }

        while (1) {
                drain_mbuf_table(tx_q);

                /*
                 * If requested, inform the configuration core that we have
                 * exited the linked list and that no devices are in use.
                 */
                if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
                        lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

                /*
                 * Process vhost devices
                 */
                TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
                              lcore_vdev_entry) {
                        if (unlikely(vdev->remove)) {
                                unlink_vmdq(vdev);
                                vdev->ready = DEVICE_SAFE_REMOVE;
                                continue;
                        }

                        if (likely(vdev->ready == DEVICE_RX))
                                drain_eth_rx(vdev);

                        if (likely(!vdev->remove))
                                drain_virtio_tx(vdev);
                }
        }

        return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. The device is made volatile here to avoid
 * re-ordering of dev->remove=1, which can cause an infinite loop in the
 * rte_pause loop.
 */
static void
destroy_device(int vid)
{
        struct vhost_dev *vdev = NULL;
        int lcore;

        TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
                if (vdev->vid == vid)
                        break;
        }
        if (!vdev)
                return;
        /* Set the remove flag. */
        vdev->remove = 1;
        while (vdev->ready != DEVICE_SAFE_REMOVE) {
                rte_pause();
        }

        if (builtin_net_driver)
                vs_vhost_net_remove(vdev);

        TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
                     lcore_vdev_entry);
        TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

        /* Set the dev_removal_flag on each lcore. */
        RTE_LCORE_FOREACH_WORKER(lcore)
                lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

        /*
         * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
         * we can be sure that they can no longer access the device removed
         * from the linked lists and that the devices are no longer in use.
         */
        RTE_LCORE_FOREACH_WORKER(lcore) {
                while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
                        rte_pause();
        }

        lcore_info[vdev->coreid].device_num--;

        RTE_LOG(INFO, VHOST_DATA,
                "(%d) device has been removed from data core\n",
                vdev->vid);

        if (async_vhost_driver)
                rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);

        rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
        int lcore, core_add = 0;
        uint32_t device_num_min = num_devices;
        struct vhost_dev *vdev;

        vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
        if (vdev == NULL) {
                RTE_LOG(INFO, VHOST_DATA,
                        "(%d) couldn't allocate memory for vhost dev\n",
                        vid);
                return -1;
        }
        vdev->vid = vid;

        if (builtin_net_driver)
                vs_vhost_net_setup(vdev);

        TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
        vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

        /* Reset ready flag */
        vdev->ready = DEVICE_MAC_LEARNING;
        vdev->remove = 0;

        /* Find a suitable lcore to add the device. */
        RTE_LCORE_FOREACH_WORKER(lcore) {
                if (lcore_info[lcore].device_num < device_num_min) {
                        device_num_min = lcore_info[lcore].device_num;
                        core_add = lcore;
                }
        }
        vdev->coreid = core_add;

        TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
                          lcore_vdev_entry);
        lcore_info[vdev->coreid].device_num++;

        /* Disable notifications. */
        rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
        rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

        RTE_LOG(INFO, VHOST_DATA,
                "(%d) device has been added to data core %d\n",
                vid, vdev->coreid);

        if (async_vhost_driver) {
                struct rte_vhost_async_features f;
                struct rte_vhost_async_channel_ops channel_ops;

                if (strncmp(dma_type, "ioat", 4) == 0) {
                        channel_ops.transfer_data = ioat_transfer_data_cb;
                        channel_ops.check_completed_copies =
                                ioat_check_completed_copies_cb;
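                        /*
                         * Request in-order completion; async_threshold is
                         * the packet length (in bytes) below which the
                         * enqueue falls back to a CPU copy instead of the
                         * DMA engine.
                         */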
                        f.async_inorder = 1;
                        f.async_threshold = 256;
                        return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
                                f.intval, &channel_ops);
                }
        }

        return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
        .new_device =  new_device,
        .destroy_device = destroy_device,
};

/*
 * This thread wakes up periodically to print stats, if the user has
 * enabled them.
 */
1384 static void *
1385 print_stats(__rte_unused void *arg)
1386 {
1387         struct vhost_dev *vdev;
1388         uint64_t tx_dropped, rx_dropped;
1389         uint64_t tx, tx_total, rx, rx_total;
1390         const char clr[] = { 27, '[', '2', 'J', '\0' };
1391         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1392
1393         while(1) {
1394                 sleep(enable_stats);
1395
1396                 /* Clear screen and move to top left */
1397                 printf("%s%s\n", clr, top_left);
1398                 printf("Device statistics =================================\n");
1399
1400                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1401                         tx_total   = vdev->stats.tx_total;
1402                         tx         = vdev->stats.tx;
1403                         tx_dropped = tx_total - tx;
1404
1405                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1406                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1407                         rx_dropped = rx_total - rx;
1408
1409                         printf("Statistics for device %d\n"
1410                                 "-----------------------\n"
1411                                 "TX total:              %" PRIu64 "\n"
1412                                 "TX dropped:            %" PRIu64 "\n"
1413                                 "TX successful:         %" PRIu64 "\n"
1414                                 "RX total:              %" PRIu64 "\n"
1415                                 "RX dropped:            %" PRIu64 "\n"
1416                                 "RX successful:         %" PRIu64 "\n",
1417                                 vdev->vid,
1418                                 tx_total, tx_dropped, tx,
1419                                 rx_total, rx_dropped, rx);
1420                 }
1421
1422                 printf("===================================================\n");
1423
1424                 fflush(stdout);
1425         }
1426
1427         return NULL;
1428 }
1429
1430 static void
1431 unregister_drivers(int socket_num)
1432 {
1433         int i, ret;
1434
1435         for (i = 0; i < socket_num; i++) {
1436                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1437                 if (ret != 0)
1438                         RTE_LOG(ERR, VHOST_CONFIG,
1439                                 "Failed to unregister vhost driver for %s.\n",
1440                                 socket_files + i * PATH_MAX);
1441         }
1442 }
1443
1444 /* When we receive a SIGINT, unregister the vhost driver. */
1445 static void
1446 sigint_handler(__rte_unused int signum)
1447 {
1448         /* Unregister vhost driver. */
1449         unregister_drivers(nb_sockets);
1450
1451         exit(0);
1452 }
1453
1454 /*
1455  * While creating an mbuf pool, one key thing is to figure out how
1456  * many mbuf entries are enough for our use. FYI, here are some
1457  * guidelines:
1458  *
1459  * - Each rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1460  *
1461  * - For each switch core (a CPU core that does the packet switching),
1462  *   we also need to reserve some mbufs for receiving packets from the
1463  *   virtio Tx queue. How many are enough depends on the usage; it is
1464  *   normally a simple calculation like the following:
1465  *
1466  *       MAX_PKT_BURST * max packet size / mbuf size
1467  *
1468  *   So we definitely need to allocate more mbufs when TSO is enabled.
1469  *
1470  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1471  *   mbufs for receiving packets from the physical NIC device.
1472  *
1473  * - We also need to make sure that, for each switch core, we have
1474  *   allocated enough mbufs to fill up the mbuf cache.
1475  */
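/*
 * As a worked example (illustrative numbers only, assuming the default
 * mbuf_size of 2176 bytes and a burst of 32 packets): with TSO enabled
 * (mtu = 64KB), each switch core needs roughly
 * (65536 + 2176) * 32 / (2176 - 128) ~= 1058 mbufs on top of the
 * @nr_rx_desc reservation.
 */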
1476 static void
1477 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1478         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1479 {
1480         uint32_t nr_mbufs;
1481         uint32_t nr_mbufs_per_core;
1482         uint32_t mtu = 1500;
1483
1484         if (mergeable)
1485                 mtu = 9000;
1486         if (enable_tso)
1487                 mtu = 64 * 1024;
1488
1489         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1490                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1491         nr_mbufs_per_core += nr_rx_desc;
1492         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1493
1494         nr_mbufs  = nr_queues * nr_rx_desc;
1495         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1496         nr_mbufs *= nr_port;
1497
1498         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1499                                             nr_mbuf_cache, 0, mbuf_size,
1500                                             rte_socket_id());
1501         if (mbuf_pool == NULL)
1502                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1503 }
1504
1505 /*
1506  * Main function, does initialisation and calls the per-lcore functions.
1507  */
1508 int
1509 main(int argc, char *argv[])
1510 {
1511         unsigned lcore_id, core_id = 0;
1512         unsigned nb_ports, valid_num_ports;
1513         int ret, i;
1514         uint16_t portid;
1515         static pthread_t tid;
1516         uint64_t flags = 0;
1517
1518         signal(SIGINT, sigint_handler);
1519
1520         /* init EAL */
1521         ret = rte_eal_init(argc, argv);
1522         if (ret < 0)
1523                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1524         argc -= ret;
1525         argv += ret;
1526
1527         /* parse app arguments */
1528         ret = us_vhost_parse_args(argc, argv);
1529         if (ret < 0)
1530                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1531
1532         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1533                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1534
1535                 if (rte_lcore_is_enabled(lcore_id))
1536                         lcore_ids[core_id++] = lcore_id;
1537         }
1538
1539         if (rte_lcore_count() > RTE_MAX_LCORE)
1540                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1541
1542         /* Get the number of physical ports. */
1543         nb_ports = rte_eth_dev_count_avail();
1544
1545         /*
1546          * Update the global variable num_ports and the global array ports,
1547          * and derive the number of valid ports from the system port count.
1548          */
1549         valid_num_ports = check_ports_num(nb_ports);
1550
1551         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1552                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1553                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1554                 return -1;
1555         }
1556
1557         /*
1558          * FIXME: here we are trying to allocate mbufs big enough for
1559          * @MAX_QUEUES, but the truth is we're never going to use that
1560          * many queues here. We probably should only do allocation for
1561          * those queues we are going to use.
1562          */
1563         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1564                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1565
1566         if (vm2vm_mode == VM2VM_HARDWARE) {
1567                 /* Enable VT loop back to let L2 switch to do it. */
1568                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1569                 RTE_LOG(DEBUG, VHOST_CONFIG,
1570                         "Enable loop back for L2 switch in vmdq.\n");
1571         }
1572
1573         /* initialize all ports */
1574         RTE_ETH_FOREACH_DEV(portid) {
1575                 /* skip ports that are not enabled */
1576                 if ((enabled_port_mask & (1 << portid)) == 0) {
1577                         RTE_LOG(INFO, VHOST_PORT,
1578                                 "Skipping disabled port %d\n", portid);
1579                         continue;
1580                 }
1581                 if (port_init(portid) != 0)
1582                         rte_exit(EXIT_FAILURE,
1583                                 "Cannot initialize network ports\n");
1584         }
1585
1586         /* Enable stats if the user option is set. */
1587         if (enable_stats) {
1588                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1589                                         print_stats, NULL);
1590                 if (ret < 0)
1591                         rte_exit(EXIT_FAILURE,
1592                                 "Cannot create print-stats thread\n");
1593         }
1594
1595         /* Launch all data cores. */
1596         RTE_LCORE_FOREACH_WORKER(lcore_id)
1597                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1598
1599         if (client_mode)
1600                 flags |= RTE_VHOST_USER_CLIENT;
1601
1602         /* Register vhost user driver to handle vhost messages. */
1603         for (i = 0; i < nb_sockets; i++) {
1604                 char *file = socket_files + i * PATH_MAX;
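		/*
		 * RTE_VHOST_USER_ASYNC_COPY marks this socket as using the
		 * asynchronous data path, where enqueue copies may be
		 * offloaded to a DMA device instead of done by the CPU.
		 */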
1605                 if (async_vhost_driver)
1606                         flags |= RTE_VHOST_USER_ASYNC_COPY;
1607
1608                 ret = rte_vhost_driver_register(file, flags);
1609                 if (ret != 0) {
1610                         unregister_drivers(i);
1611                         rte_exit(EXIT_FAILURE,
1612                                 "vhost driver register failure.\n");
1613                 }
1614
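		/*
		 * The builtin network driver (the example's own virtio-net
		 * implementation) supports only the feature bits in
		 * VIRTIO_NET_FEATURES, so advertise exactly that set.
		 */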
1615                 if (builtin_net_driver)
1616                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1617
1618                 if (mergeable == 0) {
1619                         rte_vhost_driver_disable_features(file,
1620                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1621                 }
1622
1623                 if (enable_tx_csum == 0) {
1624                         rte_vhost_driver_disable_features(file,
1625                                 1ULL << VIRTIO_NET_F_CSUM);
1626                 }
1627
1628                 if (enable_tso == 0) {
1629                         rte_vhost_driver_disable_features(file,
1630                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1631                         rte_vhost_driver_disable_features(file,
1632                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1633                         rte_vhost_driver_disable_features(file,
1634                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1635                         rte_vhost_driver_disable_features(file,
1636                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1637                 }
1638
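		/*
		 * VIRTIO_NET_F_CTRL_RX lets the guest control RX modes
		 * (e.g. promiscuous mode) through the virtio control queue.
		 */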
1639                 if (promiscuous) {
1640                         rte_vhost_driver_enable_features(file,
1641                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1642                 }
1643
1644                 ret = rte_vhost_driver_callback_register(file,
1645                         &virtio_net_device_ops);
1646                 if (ret != 0) {
1647                         rte_exit(EXIT_FAILURE,
1648                                 "failed to register vhost driver callbacks.\n");
1649                 }
1650
1651                 if (rte_vhost_driver_start(file) < 0) {
1652                         rte_exit(EXIT_FAILURE,
1653                                 "failed to start vhost driver.\n");
1654                 }
1655         }
1656
1657         RTE_LCORE_FOREACH_WORKER(lcore_id)
1658                 rte_eal_wait_lcore(lcore_id);
1659
1660         return 0;
1661
1662 }