[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "ioat.h"
29 #include "main.h"
30
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37
38 #define MBUF_CACHE_SIZE 128
39 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
40
41 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
42
43 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
45
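/* 0x2600 = 9728 bytes, a common jumbo-frame limit on Intel NICs. */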
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX                       1
51 #define DEVICE_SAFE_REMOVE      2
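/*
 * Device lifecycle: a device starts in DEVICE_MAC_LEARNING, moves to
 * DEVICE_RX once link_vmdq() has learned its MAC, and is parked in
 * DEVICE_SAFE_REMOVE by the worker core before destroy_device() frees it.
 */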
52
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56
57 #define INVALID_PORT_ID 0xFF
58
59 /* Maximum long option length for option parsing. */
60 #define MAX_LONG_OPT_SZ 64
61
62 /* mask of enabled ports */
63 static uint32_t enabled_port_mask = 0;
64
65 /* Promiscuous mode */
66 static uint32_t promiscuous;
67
68 /* number of devices/queues to support */
69 static uint32_t num_queues = 0;
70 static uint32_t num_devices;
71
72 static struct rte_mempool *mbuf_pool;
73 static int mergeable;
74
75 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
76 typedef enum {
77         VM2VM_DISABLED = 0,
78         VM2VM_SOFTWARE = 1,
79         VM2VM_HARDWARE = 2,
80         VM2VM_LAST
81 } vm2vm_type;
82 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
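/*
 * Mode summary (see virtio_tx_route()): VM2VM_SOFTWARE delivers packets
 * destined to a local MAC straight into the peer's virtio Rx ring;
 * VM2VM_HARDWARE tags them with the peer's VLAN and loops them back
 * through the NIC; VM2VM_DISABLED sends everything out the physical port.
 */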
83
84 /* Enable stats. */
85 static uint32_t enable_stats = 0;
86 /* Enable retries on RX. */
87 static uint32_t enable_retry = 1;
88
89 /* Disable TX checksum offload */
90 static uint32_t enable_tx_csum;
91
92 /* Disable TSO offload */
93 static uint32_t enable_tso;
94
95 static int client_mode;
96
97 static int builtin_net_driver;
98
99 static int async_vhost_driver;
100
101 static char dma_type[MAX_LONG_OPT_SZ];
102
103 /* Specify the timeout (in microseconds) between retries on RX. */
104 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
105 /* Specify the number of retries on RX. */
106 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
107
108 /* Socket file paths. Can be set by user */
109 static char *socket_files;
110 static int nb_sockets;
111
112 /* empty vmdq configuration structure. Filled in programmatically */
113 static struct rte_eth_conf vmdq_conf_default = {
114         .rxmode = {
115                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
116                 .split_hdr_size = 0,
117                 /*
118                  * VLAN strip is necessary for 1G NICs such as the I350;
119                  * it fixes a bug where IPv4 forwarding in the guest can't
120                  * forward packets from one virtio dev to another virtio dev.
121                  */
122                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
123         },
124
125         .txmode = {
126                 .mq_mode = ETH_MQ_TX_NONE,
127                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
128                              DEV_TX_OFFLOAD_TCP_CKSUM |
129                              DEV_TX_OFFLOAD_VLAN_INSERT |
130                              DEV_TX_OFFLOAD_MULTI_SEGS |
131                              DEV_TX_OFFLOAD_TCP_TSO),
132         },
133         .rx_adv_conf = {
134                 /*
135                  * should be overridden separately in code with
136                  * appropriate values
137                  */
138                 .vmdq_rx_conf = {
139                         .nb_queue_pools = ETH_8_POOLS,
140                         .enable_default_pool = 0,
141                         .default_pool = 0,
142                         .nb_pool_maps = 0,
143                         .pool_map = {{0, 0},},
144                 },
145         },
146 };
147
148
149 static unsigned lcore_ids[RTE_MAX_LCORE];
150 static uint16_t ports[RTE_MAX_ETHPORTS];
151 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
152 static uint16_t num_pf_queues, num_vmdq_queues;
153 static uint16_t vmdq_pool_base, vmdq_queue_base;
154 static uint16_t queues_per_pool;
155
156 const uint16_t vlan_tags[] = {
157         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
158         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
159         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
160         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
161         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
162         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
163         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
164         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
165 };
166
167 /* ethernet addresses of ports */
168 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
169
170 static struct vhost_dev_tailq_list vhost_dev_list =
171         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
172
173 static struct lcore_info lcore_info[RTE_MAX_LCORE];
174
175 /* Used for queueing bursts of TX packets. */
176 struct mbuf_table {
177         unsigned len;
178         unsigned txq_id;
179         struct rte_mbuf *m_table[MAX_PKT_BURST];
180 };
181
182 /* TX queue for each data core. */
183 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
184
185 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
186                                  / US_PER_S * BURST_TX_DRAIN_US)
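/*
 * Worked example: with a 2 GHz TSC, (2e9 + 1e6 - 1) / 1e6 rounds up to
 * 2000 cycles per microsecond, so the drain interval is 2000 * 100 =
 * 200000 cycles, i.e. the intended ~100us.
 */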
187 #define VLAN_HLEN       4
188
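/*
 * Helper for the --dmas option. The value string is handed verbatim to
 * open_ioat() (see ioat.c); per the sample guide it takes a form like
 * "[txd0@00:04.0,txd1@00:04.1]", binding one IOAT channel per Tx queue.
 */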
189 static inline int
190 open_dma(const char *value)
191 {
192         if (strncmp(dma_type, "ioat", 4) == 0)
193                 return open_ioat(value);
194
195         return -1;
196 }
197
198 /*
199  * Builds up the correct configuration for the VMDQ VLAN pool map
200  * according to the pool & queue limits.
201  */
202 static inline int
203 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
204 {
205         struct rte_eth_vmdq_rx_conf conf;
206         struct rte_eth_vmdq_rx_conf *def_conf =
207                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
208         unsigned i;
209
210         memset(&conf, 0, sizeof(conf));
211         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
212         conf.nb_pool_maps = num_devices;
213         conf.enable_loop_back = def_conf->enable_loop_back;
214         conf.rx_mode = def_conf->rx_mode;
215
216         for (i = 0; i < conf.nb_pool_maps; i++) {
217                 conf.pool_map[i].vlan_id = vlan_tags[i];
218                 conf.pool_map[i].pools = (1UL << i);
219         }
220
221         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
222         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
223                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
224         return 0;
225 }
226
227 /*
228  * Initialises a given port using global settings and with the rx buffers
229  * coming from the global mbuf_pool
230  */
231 static inline int
232 port_init(uint16_t port)
233 {
234         struct rte_eth_dev_info dev_info;
235         struct rte_eth_conf port_conf;
236         struct rte_eth_rxconf *rxconf;
237         struct rte_eth_txconf *txconf;
238         int16_t rx_rings, tx_rings;
239         uint16_t rx_ring_size, tx_ring_size;
240         int retval;
241         uint16_t q;
242
243         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
244         retval = rte_eth_dev_info_get(port, &dev_info);
245         if (retval != 0) {
246                 RTE_LOG(ERR, VHOST_PORT,
247                         "Error during getting device (port %u) info: %s\n",
248                         port, strerror(-retval));
249
250                 return retval;
251         }
252
253         rxconf = &dev_info.default_rxconf;
254         txconf = &dev_info.default_txconf;
255         rxconf->rx_drop_en = 1;
256
257         /* Configure the number of supported virtio devices based on VMDQ limits. */
258         num_devices = dev_info.max_vmdq_pools;
259
260         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
261         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
262
263         tx_rings = (uint16_t)rte_lcore_count();
264
265         /* Get port configuration. */
266         retval = get_eth_conf(&port_conf, num_devices);
267         if (retval < 0)
268                 return retval;
269         /* NIC queues are divided into pf queues and vmdq queues.  */
270         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
271         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
272         num_vmdq_queues = num_devices * queues_per_pool;
273         num_queues = num_pf_queues + num_vmdq_queues;
274         vmdq_queue_base = dev_info.vmdq_queue_base;
275         vmdq_pool_base  = dev_info.vmdq_pool_base;
276         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
277                 num_pf_queues, num_devices, queues_per_pool);
278
279         if (!rte_eth_dev_is_valid_port(port))
280                 return -1;
281
282         rx_rings = (uint16_t)dev_info.max_rx_queues;
283         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
284                 port_conf.txmode.offloads |=
285                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
286         /* Configure ethernet device. */
287         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
288         if (retval != 0) {
289                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
290                         port, strerror(-retval));
291                 return retval;
292         }
293
294         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
295                 &tx_ring_size);
296         if (retval != 0) {
297                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
298                         "for port %u: %s.\n", port, strerror(-retval));
299                 return retval;
300         }
301         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
302                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
303                         "for Rx queues on port %u.\n", port);
304                 return -1;
305         }
306
307         /* Setup the queues. */
308         rxconf->offloads = port_conf.rxmode.offloads;
309         for (q = 0; q < rx_rings; q++) {
310                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
311                                                 rte_eth_dev_socket_id(port),
312                                                 rxconf,
313                                                 mbuf_pool);
314                 if (retval < 0) {
315                         RTE_LOG(ERR, VHOST_PORT,
316                                 "Failed to setup rx queue %u of port %u: %s.\n",
317                                 q, port, strerror(-retval));
318                         return retval;
319                 }
320         }
321         txconf->offloads = port_conf.txmode.offloads;
322         for (q = 0; q < tx_rings; q++) {
323                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
324                                                 rte_eth_dev_socket_id(port),
325                                                 txconf);
326                 if (retval < 0) {
327                         RTE_LOG(ERR, VHOST_PORT,
328                                 "Failed to setup tx queue %u of port %u: %s.\n",
329                                 q, port, strerror(-retval));
330                         return retval;
331                 }
332         }
333
334         /* Start the device. */
335         retval  = rte_eth_dev_start(port);
336         if (retval < 0) {
337                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
338                         port, strerror(-retval));
339                 return retval;
340         }
341
342         if (promiscuous) {
343                 retval = rte_eth_promiscuous_enable(port);
344                 if (retval != 0) {
345                         RTE_LOG(ERR, VHOST_PORT,
346                                 "Failed to enable promiscuous mode on port %u: %s\n",
347                                 port, rte_strerror(-retval));
348                         return retval;
349                 }
350         }
351
352         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
353         if (retval < 0) {
354                 RTE_LOG(ERR, VHOST_PORT,
355                         "Failed to get MAC address on port %u: %s\n",
356                         port, rte_strerror(-retval));
357                 return retval;
358         }
359
360         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
361         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
362                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
363                         port,
364                         vmdq_ports_eth_addr[port].addr_bytes[0],
365                         vmdq_ports_eth_addr[port].addr_bytes[1],
366                         vmdq_ports_eth_addr[port].addr_bytes[2],
367                         vmdq_ports_eth_addr[port].addr_bytes[3],
368                         vmdq_ports_eth_addr[port].addr_bytes[4],
369                         vmdq_ports_eth_addr[port].addr_bytes[5]);
370
371         return 0;
372 }
373
374 /*
375  * Set socket file path.
376  */
377 static int
378 us_vhost_parse_socket_path(const char *q_arg)
379 {
380         char *old;
381
382         /* reject paths that don't fit in PATH_MAX */
383         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
384                 return -1;
385
386         old = socket_files;
387         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
388         if (socket_files == NULL) {
389                 free(old);
390                 return -1;
391         }
392
393         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
394         nb_sockets++;
395
396         return 0;
397 }
398
399 /*
400  * Parse the portmask provided at run time.
401  */
402 static int
403 parse_portmask(const char *portmask)
404 {
405         char *end = NULL;
406         unsigned long pm;
407
408         errno = 0;
409
410         /* parse hexadecimal string */
411         pm = strtoul(portmask, &end, 16);
412         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
413                 return 0;
414
415         return pm;
416
417 }
418
419 /*
420  * Parse num options at run time.
421  */
422 static int
423 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
424 {
425         char *end = NULL;
426         unsigned long num;
427
428         errno = 0;
429
430         /* parse unsigned int string */
431         num = strtoul(q_arg, &end, 10);
432         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
433                 return -1;
434
435         if (num > max_valid_value)
436                 return -1;
437
438         return num;
439
440 }
441
442 /*
443  * Display usage
444  */
445 static void
446 us_vhost_usage(const char *prgname)
447 {
448         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
449         "               --vm2vm [0|1|2]\n"
450         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
451         "               --socket-file <path>\n"
452         "               --nb-devices ND\n"
453         "               -p PORTMASK: Set mask for ports to be used by application\n"
454         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
455         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
456         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if rx retries are enabled\n"
457         "               --rx-retry-num [0-N]: the number of retries on rx. This takes effect only if rx retries are enabled\n"
458         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
459         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
460         "               --socket-file: The path of the socket file.\n"
461         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
462         "               --tso [0|1] disable/enable TCP segmentation offload.\n"
463         "               --client register a vhost-user socket in client mode.\n"
464         "               --dma-type register the DMA type for the vhost async driver. Only \"ioat\" is supported for now.\n"
465         "               --dmas register a DMA channel for a specific vhost device.\n",
466                prgname);
467 }
468
469 /*
470  * Parse the arguments given in the command line of the application.
471  */
472 static int
473 us_vhost_parse_args(int argc, char **argv)
474 {
475         int opt, ret;
476         int option_index;
477         unsigned i;
478         const char *prgname = argv[0];
479         static struct option long_option[] = {
480                 {"vm2vm", required_argument, NULL, 0},
481                 {"rx-retry", required_argument, NULL, 0},
482                 {"rx-retry-delay", required_argument, NULL, 0},
483                 {"rx-retry-num", required_argument, NULL, 0},
484                 {"mergeable", required_argument, NULL, 0},
485                 {"stats", required_argument, NULL, 0},
486                 {"socket-file", required_argument, NULL, 0},
487                 {"tx-csum", required_argument, NULL, 0},
488                 {"tso", required_argument, NULL, 0},
489                 {"client", no_argument, &client_mode, 1},
490                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
491                 {"dma-type", required_argument, NULL, 0},
492                 {"dmas", required_argument, NULL, 0},
493                 {NULL, 0, 0, 0},
494         };
495
496         /* Parse command line */
497         while ((opt = getopt_long(argc, argv, "p:P",
498                         long_option, &option_index)) != EOF) {
499                 switch (opt) {
500                 /* Portmask */
501                 case 'p':
502                         enabled_port_mask = parse_portmask(optarg);
503                         if (enabled_port_mask == 0) {
504                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
505                                 us_vhost_usage(prgname);
506                                 return -1;
507                         }
508                         break;
509
510                 case 'P':
511                         promiscuous = 1;
512                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
513                                 ETH_VMDQ_ACCEPT_BROADCAST |
514                                 ETH_VMDQ_ACCEPT_MULTICAST;
515
516                         break;
517
518                 case 0:
519                         /* Enable/disable vm2vm comms. */
520                         if (!strncmp(long_option[option_index].name, "vm2vm",
521                                 MAX_LONG_OPT_SZ)) {
522                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
523                                 if (ret == -1) {
524                                         RTE_LOG(INFO, VHOST_CONFIG,
525                                                 "Invalid argument for "
526                                                 "vm2vm [0|1|2]\n");
527                                         us_vhost_usage(prgname);
528                                         return -1;
529                                 } else {
530                                         vm2vm_mode = (vm2vm_type)ret;
531                                 }
532                         }
533
534                         /* Enable/disable retries on RX. */
535                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
536                                 ret = parse_num_opt(optarg, 1);
537                                 if (ret == -1) {
538                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
539                                         us_vhost_usage(prgname);
540                                         return -1;
541                                 } else {
542                                         enable_retry = ret;
543                                 }
544                         }
545
546                         /* Enable/disable TX checksum offload. */
547                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
548                                 ret = parse_num_opt(optarg, 1);
549                                 if (ret == -1) {
550                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
551                                         us_vhost_usage(prgname);
552                                         return -1;
553                                 } else
554                                         enable_tx_csum = ret;
555                         }
556
557                         /* Enable/disable TSO offload. */
558                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
559                                 ret = parse_num_opt(optarg, 1);
560                                 if (ret == -1) {
561                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
562                                         us_vhost_usage(prgname);
563                                         return -1;
564                                 } else
565                                         enable_tso = ret;
566                         }
567
568                         /* Specify the retries delay time (in useconds) on RX. */
569                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
570                                 ret = parse_num_opt(optarg, INT32_MAX);
571                                 if (ret == -1) {
572                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
573                                         us_vhost_usage(prgname);
574                                         return -1;
575                                 } else {
576                                         burst_rx_delay_time = ret;
577                                 }
578                         }
579
580                         /* Specify the retries number on RX. */
581                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
582                                 ret = parse_num_opt(optarg, INT32_MAX);
583                                 if (ret == -1) {
584                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
585                                         us_vhost_usage(prgname);
586                                         return -1;
587                                 } else {
588                                         burst_rx_retry_num = ret;
589                                 }
590                         }
591
592                         /* Enable/disable RX mergeable buffers. */
593                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
594                                 ret = parse_num_opt(optarg, 1);
595                                 if (ret == -1) {
596                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
597                                         us_vhost_usage(prgname);
598                                         return -1;
599                                 } else {
600                                         mergeable = !!ret;
601                                         if (ret) {
602                                                 vmdq_conf_default.rxmode.offloads |=
603                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
604                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
605                                                         = JUMBO_FRAME_MAX_SIZE;
606                                         }
607                                 }
608                         }
609
610                         /* Enable/disable stats. */
611                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
612                                 ret = parse_num_opt(optarg, INT32_MAX);
613                                 if (ret == -1) {
614                                         RTE_LOG(INFO, VHOST_CONFIG,
615                                                 "Invalid argument for stats [0..N]\n");
616                                         us_vhost_usage(prgname);
617                                         return -1;
618                                 } else {
619                                         enable_stats = ret;
620                                 }
621                         }
622
623                         /* Set socket file path. */
624                         if (!strncmp(long_option[option_index].name,
625                                                 "socket-file", MAX_LONG_OPT_SZ)) {
626                                 if (us_vhost_parse_socket_path(optarg) == -1) {
627                                         RTE_LOG(INFO, VHOST_CONFIG,
628                                         "Invalid argument for socket name (Max %d characters)\n",
629                                         PATH_MAX);
630                                         us_vhost_usage(prgname);
631                                         return -1;
632                                 }
633                         }
634
635                         if (!strncmp(long_option[option_index].name,
636                                                 "dma-type", MAX_LONG_OPT_SZ)) {
637                                 strlcpy(dma_type, optarg, MAX_LONG_OPT_SZ);
638                         }
639
640                         if (!strncmp(long_option[option_index].name,
641                                                 "dmas", MAX_LONG_OPT_SZ)) {
642                                 if (open_dma(optarg) == -1) {
643                                         RTE_LOG(INFO, VHOST_CONFIG,
644                                                 "Wrong DMA args\n");
645                                         us_vhost_usage(prgname);
646                                         return -1;
647                                 }
648                                 async_vhost_driver = 1;
649                         }
650
651                         break;
652
653                         /* Invalid option - print options. */
654                 default:
655                         us_vhost_usage(prgname);
656                         return -1;
657                 }
658         }
659
660         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
661                 if (enabled_port_mask & (1 << i))
662                         ports[num_ports++] = i;
663         }
664
665         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
666                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
667                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
668                 return -1;
669         }
670
671         return 0;
672 }
673
674 /*
675  * Update the global variable num_ports and the ports array according to the
676  * number of ports in the system, and return the number of valid ports.
677  */
678 static unsigned check_ports_num(unsigned nb_ports)
679 {
680         unsigned valid_num_ports = num_ports;
681         unsigned portid;
682
683         if (num_ports > nb_ports) {
684                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
685                         num_ports, nb_ports);
686                 num_ports = nb_ports;
687         }
688
689         for (portid = 0; portid < num_ports; portid++) {
690                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
691                         RTE_LOG(INFO, VHOST_PORT,
692                                 "\nSpecified port ID(%u) is not valid\n",
693                                 ports[portid]);
694                         ports[portid] = INVALID_PORT_ID;
695                         valid_num_ports--;
696                 }
697         }
698         return valid_num_ports;
699 }
700
701 static __rte_always_inline struct vhost_dev *
702 find_vhost_dev(struct rte_ether_addr *mac)
703 {
704         struct vhost_dev *vdev;
705
706         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
707                 if (vdev->ready == DEVICE_RX &&
708                     rte_is_same_ether_addr(mac, &vdev->mac_address))
709                         return vdev;
710         }
711
712         return NULL;
713 }
714
715 /*
716  * This function learns the MAC address of the device and registers this along with a
717  * vlan tag to a VMDQ.
718  */
719 static int
720 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
721 {
722         struct rte_ether_hdr *pkt_hdr;
723         int i, ret;
724
725         /* Learn MAC address of guest device from packet */
726         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
727
728         if (find_vhost_dev(&pkt_hdr->s_addr)) {
729                 RTE_LOG(ERR, VHOST_DATA,
730                         "(%d) device is using a registered MAC!\n",
731                         vdev->vid);
732                 return -1;
733         }
734
735         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
736                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
737
738         /* vlan_tag currently uses the device_id. */
739         vdev->vlan_tag = vlan_tags[vdev->vid];
740
741         /* Print out VMDQ registration info. */
742         RTE_LOG(INFO, VHOST_DATA,
743                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
744                 vdev->vid,
745                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
746                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
747                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
748                 vdev->vlan_tag);
749
750         /* Register the MAC address. */
751         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
752                                 (uint32_t)vdev->vid + vmdq_pool_base);
753         if (ret)
754                 RTE_LOG(ERR, VHOST_DATA,
755                         "(%d) failed to add device MAC address to VMDQ\n",
756                         vdev->vid);
757
758         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
759
760         /* Set device as ready for RX. */
761         vdev->ready = DEVICE_RX;
762
763         return 0;
764 }
765
766 /*
767  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
768  * queue before disabling RX on the device.
769  */
770 static inline void
771 unlink_vmdq(struct vhost_dev *vdev)
772 {
773         unsigned i = 0;
774         unsigned rx_count;
775         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
776
777         if (vdev->ready == DEVICE_RX) {
778                 /* clear MAC and VLAN settings */
779                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
780                 for (i = 0; i < 6; i++)
781                         vdev->mac_address.addr_bytes[i] = 0;
782
783                 vdev->vlan_tag = 0;
784
785                 /* Clear out the receive buffers */
786                 rx_count = rte_eth_rx_burst(ports[0],
787                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
788
789                 while (rx_count) {
790                         for (i = 0; i < rx_count; i++)
791                                 rte_pktmbuf_free(pkts_burst[i]);
792
793                         rx_count = rte_eth_rx_burst(ports[0],
794                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
795                 }
796
797                 vdev->ready = DEVICE_MAC_LEARNING;
798         }
799 }
800
801 static __rte_always_inline void
802 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
803             struct rte_mbuf *m)
804 {
805         uint16_t ret;
806         struct rte_mbuf *m_cpl[1];
807
808         if (builtin_net_driver) {
809                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
810         } else if (async_vhost_driver) {
811                 ret = rte_vhost_submit_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ,
812                                                 &m, 1);
813
814                 if (likely(ret))
815                         dst_vdev->nr_async_pkts++;
816
817                 while (likely(dst_vdev->nr_async_pkts)) {
818                         if (rte_vhost_poll_enqueue_completed(dst_vdev->vid,
819                                         VIRTIO_RXQ, m_cpl, 1))
820                                 dst_vdev->nr_async_pkts--;
821                 }
822         } else {
823                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
824         }
825
826         if (enable_stats) {
827                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
828                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
829                 src_vdev->stats.tx_total++;
830                 src_vdev->stats.tx += ret;
831         }
832 }
833
834 /*
835  * Check if the packet destination MAC address is for a local device. If so, put
836  * the packet on that device's RX queue. If not, return.
837  */
838 static __rte_always_inline int
839 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
840 {
841         struct rte_ether_hdr *pkt_hdr;
842         struct vhost_dev *dst_vdev;
843
844         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
845
846         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
847         if (!dst_vdev)
848                 return -1;
849
850         if (vdev->vid == dst_vdev->vid) {
851                 RTE_LOG_DP(DEBUG, VHOST_DATA,
852                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
853                         vdev->vid);
854                 return 0;
855         }
856
857         RTE_LOG_DP(DEBUG, VHOST_DATA,
858                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
859
860         if (unlikely(dst_vdev->remove)) {
861                 RTE_LOG_DP(DEBUG, VHOST_DATA,
862                         "(%d) device is marked for removal\n", dst_vdev->vid);
863                 return 0;
864         }
865
866         virtio_xmit(dst_vdev, vdev, m);
867         return 0;
868 }
869
870 /*
871  * Check if the destination MAC of a packet belongs to a local VM,
872  * and if so get its vlan tag and length offset.
873  */
874 static __rte_always_inline int
875 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
876         uint32_t *offset, uint16_t *vlan_tag)
877 {
878         struct vhost_dev *dst_vdev;
879         struct rte_ether_hdr *pkt_hdr =
880                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
881
882         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
883         if (!dst_vdev)
884                 return 0;
885
886         if (vdev->vid == dst_vdev->vid) {
887                 RTE_LOG_DP(DEBUG, VHOST_DATA,
888                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
889                         vdev->vid);
890                 return -1;
891         }
892
893         /*
894          * HW VLAN strip reduces the packet length by the
895          * length of the VLAN tag, so we need to restore the
896          * packet length by adding it back.
897          */
898         *offset  = VLAN_HLEN;
899         *vlan_tag = vlan_tags[vdev->vid];
900
901         RTE_LOG_DP(DEBUG, VHOST_DATA,
902                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
903                 vdev->vid, dst_vdev->vid, *vlan_tag);
904
905         return 0;
906 }
907
908 static uint16_t
909 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
910 {
911         if (ol_flags & PKT_TX_IPV4)
912                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
913         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
914                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
915 }
916
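/*
 * DPDK's TX offload convention: for PKT_TX_TCP_SEG (TSO) mbufs the TCP
 * checksum field must be pre-seeded with the pseudo-header checksum; the
 * NIC computes the remainder while segmenting. This helper sets that up.
 */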
917 static void virtio_tx_offload(struct rte_mbuf *m)
918 {
919         void *l3_hdr;
920         struct rte_ipv4_hdr *ipv4_hdr = NULL;
921         struct rte_tcp_hdr *tcp_hdr = NULL;
922         struct rte_ether_hdr *eth_hdr =
923                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
924
925         l3_hdr = (char *)eth_hdr + m->l2_len;
926
927         if (m->ol_flags & PKT_TX_IPV4) {
928                 ipv4_hdr = l3_hdr;
929                 ipv4_hdr->hdr_checksum = 0;
930                 m->ol_flags |= PKT_TX_IP_CKSUM;
931         }
932
933         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
934         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
935 }
936
937 static inline void
938 free_pkts(struct rte_mbuf **pkts, uint16_t n)
939 {
940         while (n--)
941                 rte_pktmbuf_free(pkts[n]);
942 }
943
944 static __rte_always_inline void
945 do_drain_mbuf_table(struct mbuf_table *tx_q)
946 {
947         uint16_t count;
948
949         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
950                                  tx_q->m_table, tx_q->len);
951         if (unlikely(count < tx_q->len))
952                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
953
954         tx_q->len = 0;
955 }
956
957 /*
958  * This function routes the TX packet to the correct interface. This
959  * may be a local device or the physical port.
960  */
961 static __rte_always_inline void
962 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
963 {
964         struct mbuf_table *tx_q;
965         unsigned offset = 0;
966         const uint16_t lcore_id = rte_lcore_id();
967         struct rte_ether_hdr *nh;
968
969
970         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
971         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
972                 struct vhost_dev *vdev2;
973
974                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
975                         if (vdev2 != vdev)
976                                 virtio_xmit(vdev2, vdev, m);
977                 }
978                 goto queue2nic;
979         }
980
981         /* check if destination is local VM */
982         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
983                 rte_pktmbuf_free(m);
984                 return;
985         }
986
987         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
988                 if (unlikely(find_local_dest(vdev, m, &offset,
989                                              &vlan_tag) != 0)) {
990                         rte_pktmbuf_free(m);
991                         return;
992                 }
993         }
994
995         RTE_LOG_DP(DEBUG, VHOST_DATA,
996                 "(%d) TX: MAC address is external\n", vdev->vid);
997
998 queue2nic:
999
1000         /* Add packet to the port tx queue */
1001         tx_q = &lcore_tx_queue[lcore_id];
1002
1003         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1004         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1005                 /* Guest has inserted the vlan tag. */
1006                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1007                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1008                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1009                         (vh->vlan_tci != vlan_tag_be))
1010                         vh->vlan_tci = vlan_tag_be;
1011         } else {
1012                 m->ol_flags |= PKT_TX_VLAN_PKT;
1013
1014                 /*
1015                  * Find the right seg to adjust the data len when offset is
1016                  * bigger than tail room size.
1017                  */
1018                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1019                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1020                                 m->data_len += offset;
1021                         else {
1022                                 struct rte_mbuf *seg = m;
1023
1024                                 while ((seg->next != NULL) &&
1025                                         (offset > rte_pktmbuf_tailroom(seg)))
1026                                         seg = seg->next;
1027
1028                                 seg->data_len += offset;
1029                         }
1030                         m->pkt_len += offset;
1031                 }
1032
1033                 m->vlan_tci = vlan_tag;
1034         }
1035
1036         if (m->ol_flags & PKT_TX_TCP_SEG)
1037                 virtio_tx_offload(m);
1038
1039         tx_q->m_table[tx_q->len++] = m;
1040         if (enable_stats) {
1041                 vdev->stats.tx_total++;
1042                 vdev->stats.tx++;
1043         }
1044
1045         if (unlikely(tx_q->len == MAX_PKT_BURST))
1046                 do_drain_mbuf_table(tx_q);
1047 }
1048
1049
1050 static __rte_always_inline void
1051 drain_mbuf_table(struct mbuf_table *tx_q)
1052 {
1053         static uint64_t prev_tsc;
1054         uint64_t cur_tsc;
1055
1056         if (tx_q->len == 0)
1057                 return;
1058
1059         cur_tsc = rte_rdtsc();
1060         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1061                 prev_tsc = cur_tsc;
1062
1063                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1064                         "TX queue drained after timeout with burst size %u\n",
1065                         tx_q->len);
1066                 do_drain_mbuf_table(tx_q);
1067         }
1068 }
1069
1070 static __rte_always_inline void
1071 complete_async_pkts(struct vhost_dev *vdev, uint16_t qid)
1072 {
1073         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1074         uint16_t complete_count;
1075
1076         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1077                                                 qid, p_cpl, MAX_PKT_BURST);
1078         vdev->nr_async_pkts -= complete_count;
1079         if (complete_count)
1080                 free_pkts(p_cpl, complete_count);
1081 }
1082
1083 static __rte_always_inline void
1084 drain_eth_rx(struct vhost_dev *vdev)
1085 {
1086         uint16_t rx_count, enqueue_count;
1087         struct rte_mbuf *pkts[MAX_PKT_BURST];
1088
1089         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1090                                     pkts, MAX_PKT_BURST);
1091
1092         while (likely(vdev->nr_async_pkts))
1093                 complete_async_pkts(vdev, VIRTIO_RXQ);
1094
1095         if (!rx_count)
1096                 return;
1097
1098         /*
1099          * When "enable_retry" is set, we wait and retry when there
1100          * are not enough free slots in the queue to hold @rx_count
1101          * packets, to reduce packet loss.
1102          */
1103         if (enable_retry &&
1104             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1105                         VIRTIO_RXQ))) {
1106                 uint32_t retry;
1107
1108                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1109                         rte_delay_us(burst_rx_delay_time);
1110                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1111                                         VIRTIO_RXQ))
1112                                 break;
1113                 }
1114         }
1115
1116         if (builtin_net_driver) {
1117                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1118                                                 pkts, rx_count);
1119         } else if (async_vhost_driver) {
1120                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1121                                         VIRTIO_RXQ, pkts, rx_count);
1122                 vdev->nr_async_pkts += enqueue_count;
1123         } else {
1124                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1125                                                 pkts, rx_count);
1126         }
1127
1128         if (enable_stats) {
1129                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1130                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1131         }
1132
1133         if (!async_vhost_driver)
1134                 free_pkts(pkts, rx_count);
1135 }
1136
1137 static __rte_always_inline void
1138 drain_virtio_tx(struct vhost_dev *vdev)
1139 {
1140         struct rte_mbuf *pkts[MAX_PKT_BURST];
1141         uint16_t count;
1142         uint16_t i;
1143
1144         if (builtin_net_driver) {
1145                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1146                                         pkts, MAX_PKT_BURST);
1147         } else {
1148                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1149                                         mbuf_pool, pkts, MAX_PKT_BURST);
1150         }
1151
1152         /* setup VMDq for the first packet */
1153         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1154                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
1155                         free_pkts(pkts, count);
                             return; /* the mbufs were freed; don't route them below */
                     }
1156         }
1157
1158         for (i = 0; i < count; ++i)
1159                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1160 }
1161
1162 /*
1163  * Main function of vhost-switch. It basically does:
1164  *
1165  * for each vhost device {
1166  *    - drain_eth_rx()
1167  *
1168  *      Which drains the host eth Rx queue linked to the vhost device,
1169  *      and delivers all of them to the guest virtio Rx ring associated with
1170  *      this vhost device.
1171  *
1172  *    - drain_virtio_tx()
1173  *
1174  *      Which drains the guest virtio Tx queue and delivers all of them
1175  *      to the target, which could be another vhost device, or the
1176  *      physical eth dev. The route is done in function "virtio_tx_route".
1177  * }
1178  */
1179 static int
1180 switch_worker(void *arg __rte_unused)
1181 {
1182         unsigned i;
1183         unsigned lcore_id = rte_lcore_id();
1184         struct vhost_dev *vdev;
1185         struct mbuf_table *tx_q;
1186
1187         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1188
1189         tx_q = &lcore_tx_queue[lcore_id];
1190         for (i = 0; i < rte_lcore_count(); i++) {
1191                 if (lcore_ids[i] == lcore_id) {
1192                         tx_q->txq_id = i;
1193                         break;
1194                 }
1195         }
1196
1197         while(1) {
1198                 drain_mbuf_table(tx_q);
1199
1200                 /*
1201                  * If requested, inform the configuration core that we have
1202                  * exited the linked list and that no devices are in use.
1203                  */
1204                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1205                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1206
1207                 /*
1208                  * Process vhost devices
1209                  */
1210                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1211                               lcore_vdev_entry) {
1212                         if (unlikely(vdev->remove)) {
1213                                 unlink_vmdq(vdev);
1214                                 vdev->ready = DEVICE_SAFE_REMOVE;
1215                                 continue;
1216                         }
1217
1218                         if (likely(vdev->ready == DEVICE_RX))
1219                                 drain_eth_rx(vdev);
1220
1221                         if (likely(!vdev->remove))
1222                                 drain_virtio_tx(vdev);
1223                 }
1224         }
1225
1226         return 0;
1227 }
1228
1229 /*
1230  * Remove a device from the specific data core linked list and from the
1231  * main linked list. Synchronization occurs through the use of the
1232  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1233  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1234  */
1235 static void
1236 destroy_device(int vid)
1237 {
1238         struct vhost_dev *vdev = NULL;
1239         int lcore;
1240
1241         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1242                 if (vdev->vid == vid)
1243                         break;
1244         }
1245         if (!vdev)
1246                 return;
1247         /* Set the remove flag. */
1248         vdev->remove = 1;
1249         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1250                 rte_pause();
1251         }
1252
1253         if (builtin_net_driver)
1254                 vs_vhost_net_remove(vdev);
1255
1256         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1257                      lcore_vdev_entry);
1258         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1259
1260
1261         /* Set the dev_removal_flag on each lcore. */
1262         RTE_LCORE_FOREACH_WORKER(lcore)
1263                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1264
1265         /*
1266          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1267          * we can be sure that they can no longer access the device removed
1268          * from the linked lists and that the devices are no longer in use.
1269          */
1270         RTE_LCORE_FOREACH_WORKER(lcore) {
1271                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1272                         rte_pause();
1273         }
1274
1275         lcore_info[vdev->coreid].device_num--;
1276
1277         RTE_LOG(INFO, VHOST_DATA,
1278                 "(%d) device has been removed from data core\n",
1279                 vdev->vid);
1280
1281         if (async_vhost_driver)
1282                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1283
1284         rte_free(vdev);
1285 }
1286
1287 /*
1288  * A new device is added to a data core. First the device is added to the main linked list
1289  * and then allocated to a specific data core.
1290  */
1291 static int
1292 new_device(int vid)
1293 {
1294         int lcore, core_add = 0;
1295         uint32_t device_num_min = num_devices;
1296         struct vhost_dev *vdev;
1297
1298         struct rte_vhost_async_channel_ops channel_ops = {
1299                 .transfer_data = ioat_transfer_data_cb,
1300                 .check_completed_copies = ioat_check_completed_copies_cb
1301         };
1302         struct rte_vhost_async_features f;
1303
1304         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1305         if (vdev == NULL) {
1306                 RTE_LOG(INFO, VHOST_DATA,
1307                         "(%d) couldn't allocate memory for vhost dev\n",
1308                         vid);
1309                 return -1;
1310         }
1311         vdev->vid = vid;
1312
1313         if (builtin_net_driver)
1314                 vs_vhost_net_setup(vdev);
1315
1316         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1317         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1318
1319         /* reset ready flag */
1320         vdev->ready = DEVICE_MAC_LEARNING;
1321         vdev->remove = 0;
1322
1323         /* Find a suitable lcore to add the device. */
1324         RTE_LCORE_FOREACH_WORKER(lcore) {
1325                 if (lcore_info[lcore].device_num < device_num_min) {
1326                         device_num_min = lcore_info[lcore].device_num;
1327                         core_add = lcore;
1328                 }
1329         }
1330         vdev->coreid = core_add;
1331
1332         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1333                           lcore_vdev_entry);
1334         lcore_info[vdev->coreid].device_num++;
1335
1336         /* Disable notifications. */
1337         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1338         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1339
1340         RTE_LOG(INFO, VHOST_DATA,
1341                 "(%d) device has been added to data core %d\n",
1342                 vid, vdev->coreid);
1343
1344         if (async_vhost_driver) {
1345                 f.async_inorder = 1;
1346                 f.async_threshold = 256;
1347                 return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1348                         f.intval, &channel_ops);
1349         }
1350
1351         return 0;
1352 }
1353
1354 /*
1355  * These callbacks allow devices to be added to the data core when configuration
1356  * has fully completed.
1357  */
1358 static const struct vhost_device_ops virtio_net_device_ops =
1359 {
1360         .new_device =  new_device,
1361         .destroy_device = destroy_device,
1362 };
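/*
 * Note: these ops are registered per vhost socket with
 * rte_vhost_driver_callback_register() later during setup in main().
 */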
1363
1364 /*
1365  * This thread wakes up periodically to print stats if the user has
1366  * enabled them.
1367  */
1368 static void *
1369 print_stats(__rte_unused void *arg)
1370 {
1371         struct vhost_dev *vdev;
1372         uint64_t tx_dropped, rx_dropped;
1373         uint64_t tx, tx_total, rx, rx_total;
1374         const char clr[] = { 27, '[', '2', 'J', '\0' };
1375         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1376
1377         while(1) {
1378                 sleep(enable_stats);
1379
1380                 /* Clear screen and move to top left */
1381                 printf("%s%s\n", clr, top_left);
1382                 printf("Device statistics =================================\n");
1383
1384                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1385                         tx_total   = vdev->stats.tx_total;
1386                         tx         = vdev->stats.tx;
1387                         tx_dropped = tx_total - tx;
1388
1389                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1390                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1391                         rx_dropped = rx_total - rx;
1392
1393                         printf("Statistics for device %d\n"
1394                                 "-----------------------\n"
1395                                 "TX total:              %" PRIu64 "\n"
1396                                 "TX dropped:            %" PRIu64 "\n"
1397                                 "TX successful:         %" PRIu64 "\n"
1398                                 "RX total:              %" PRIu64 "\n"
1399                                 "RX dropped:            %" PRIu64 "\n"
1400                                 "RX successful:         %" PRIu64 "\n",
1401                                 vdev->vid,
1402                                 tx_total, tx_dropped, tx,
1403                                 rx_total, rx_dropped, rx);
1404                 }
1405
1406                 printf("===================================================\n");
1407
1408                 fflush(stdout);
1409         }
1410
1411         return NULL;
1412 }
1413
1414 static void
1415 unregister_drivers(int socket_num)
1416 {
1417         int i, ret;
1418
1419         for (i = 0; i < socket_num; i++) {
1420                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1421                 if (ret != 0)
1422                         RTE_LOG(ERR, VHOST_CONFIG,
1423                                 "Failed to unregister the vhost driver for %s.\n",
1424                                 socket_files + i * PATH_MAX);
1425         }
1426 }
1427
1428 /* When we receive an INT signal, unregister the vhost driver. */
1429 static void
1430 sigint_handler(__rte_unused int signum)
1431 {
1432         /* Unregister vhost driver. */
1433         unregister_drivers(nb_sockets);
1434
1435         exit(0);
1436 }
1437
1438 /*
1439  * While creating an mbuf pool, one key thing is to figure out how
1440  * many mbuf entries are enough for our use. FYI, here are some
1441  * guidelines:
1442  *
1443  * - Each rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1444  *
1445  * - For each switch core (a CPU core that does the packet switching),
1446  *   we also need to reserve some mbufs for receiving the packets from
1447  *   the virtio Tx queue. How many are enough depends on the usage;
1448  *   it is normally a simple calculation like the following:
1449  *
1450  *       MAX_PKT_BURST * max packet size / mbuf size
1451  *
1452  *   So we definitely need to allocate more mbufs when TSO is enabled.
1453  *
1454  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1455  *   mbufs for receiving the packets from the physical NIC device.
1456  *
1457  * - We also need to make sure that, for each switch core, we have
1458  *   allocated enough mbufs to fill up the mbuf cache.
1459  */
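/*
 * A worked sizing example, for illustration only. Assuming the typical
 * defaults of MAX_PKT_BURST = 32, mbuf_size = 2176 bytes and
 * RTE_PKTMBUF_HEADROOM = 128 bytes, with mergeable buffers (mtu = 9000):
 *
 *     nr_mbufs_per_core = (9000 + 2176) * 32 / (2176 - 128)  ->  174
 *                         + nr_rx_desc (1024)                 -> 1198
 *
 * so one port with 128 queues and a single switch core needs about
 * 128 * 1024 + 1198 ~= 132K mbufs.
 */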
1460 static void
1461 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1462         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1463 {
1464         uint32_t nr_mbufs;
1465         uint32_t nr_mbufs_per_core;
1466         uint32_t mtu = 1500;
1467
1468         if (mergeable)
1469                 mtu = 9000;
1470         if (enable_tso)
1471                 mtu = 64 * 1024;
1472
1473         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1474                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1475         nr_mbufs_per_core += nr_rx_desc;
1476         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1477
1478         nr_mbufs  = nr_queues * nr_rx_desc;
1479         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1480         nr_mbufs *= nr_port;
1481
1482         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1483                                             nr_mbuf_cache, 0, mbuf_size,
1484                                             rte_socket_id());
1485         if (mbuf_pool == NULL)
1486                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1487 }
1488
1489 /*
1490  * Main function, does initialisation and calls the per-lcore functions.
1491  */
1492 int
1493 main(int argc, char *argv[])
1494 {
1495         unsigned lcore_id, core_id = 0;
1496         unsigned nb_ports, valid_num_ports;
1497         int ret, i;
1498         uint16_t portid;
1499         static pthread_t tid;
1500         uint64_t flags = 0;
1501
1502         signal(SIGINT, sigint_handler);
1503
1504         /* init EAL */
1505         ret = rte_eal_init(argc, argv);
1506         if (ret < 0)
1507                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1508         argc -= ret;
1509         argv += ret;
1510
1511         /* parse app arguments */
1512         ret = us_vhost_parse_args(argc, argv);
1513         if (ret < 0)
1514                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1515
1516         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1517                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1518
1519                 if (rte_lcore_is_enabled(lcore_id))
1520                         lcore_ids[core_id++] = lcore_id;
1521         }
1522
1523         if (rte_lcore_count() > RTE_MAX_LCORE)
1524                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1525
1526         /* Get the number of physical ports. */
1527         nb_ports = rte_eth_dev_count_avail();
1528
1529         /*
1530          * Update the global variable num_ports and the global ports array,
1531          * and derive valid_num_ports from the number of ports on the system.
1532          */
1533         valid_num_ports = check_ports_num(nb_ports);
1534
1535         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1536                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1537                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1538                 return -1;
1539         }
1540
1541         /*
1542          * FIXME: here we are trying to allocate mbufs big enough for
1543          * @MAX_QUEUES, but in truth we are never going to use that many
1544          * queues here. We should probably only allocate for those queues
1545          * we are actually going to use.
1546          */
1547         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1548                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1549
1550         if (vm2vm_mode == VM2VM_HARDWARE) {
1551                 /* Enable VT loopback so the NIC's L2 switch handles VM2VM forwarding. */
1552                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1553                 RTE_LOG(DEBUG, VHOST_CONFIG,
1554                         "Enable loop back for L2 switch in vmdq.\n");
1555         }
1556
1557         /* initialize all ports */
1558         RTE_ETH_FOREACH_DEV(portid) {
1559                 /* skip ports that are not enabled */
1560                 if ((enabled_port_mask & (1 << portid)) == 0) {
1561                         RTE_LOG(INFO, VHOST_PORT,
1562                                 "Skipping disabled port %d\n", portid);
1563                         continue;
1564                 }
1565                 if (port_init(portid) != 0)
1566                         rte_exit(EXIT_FAILURE,
1567                                 "Cannot initialize network ports\n");
1568         }
1569
1570         /* Enable stats if the user option is set. */
1571         if (enable_stats) {
1572                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1573                                         print_stats, NULL);
1574                 if (ret < 0)
1575                         rte_exit(EXIT_FAILURE,
1576                                 "Cannot create print-stats thread\n");
1577         }
1578
1579         /* Launch all data cores. */
1580         RTE_LCORE_FOREACH_WORKER(lcore_id)
1581                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1582
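        /*
         * In client mode, vhost connects to a socket created by the front end
         * (e.g. QEMU) and, by default, reconnects if the connection drops.
         */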
1583         if (client_mode)
1584                 flags |= RTE_VHOST_USER_CLIENT;
1585
1586         /* Register vhost user driver to handle vhost messages. */
1587         for (i = 0; i < nb_sockets; i++) {
1588                 char *file = socket_files + i * PATH_MAX;
1589                 if (async_vhost_driver)
1590                         flags |= RTE_VHOST_USER_ASYNC_COPY;
1591
1592                 ret = rte_vhost_driver_register(file, flags);
1593                 if (ret != 0) {
1594                         unregister_drivers(i);
1595                         rte_exit(EXIT_FAILURE,
1596                                 "vhost driver register failure.\n");
1597                 }
1598
1599                 if (builtin_net_driver)
1600                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1601
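                /*
                 * Trim the virtio features advertised on this socket
                 * according to the options parsed earlier.
                 */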
1602                 if (mergeable == 0) {
1603                         rte_vhost_driver_disable_features(file,
1604                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1605                 }
1606
1607                 if (enable_tx_csum == 0) {
1608                         rte_vhost_driver_disable_features(file,
1609                                 1ULL << VIRTIO_NET_F_CSUM);
1610                 }
1611
1612                 if (enable_tso == 0) {
1613                         rte_vhost_driver_disable_features(file,
1614                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1615                         rte_vhost_driver_disable_features(file,
1616                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1617                         rte_vhost_driver_disable_features(file,
1618                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1619                         rte_vhost_driver_disable_features(file,
1620                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1621                 }
1622
1623                 if (promiscuous) {
1624                         rte_vhost_driver_enable_features(file,
1625                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1626                 }
1627
1628                 ret = rte_vhost_driver_callback_register(file,
1629                         &virtio_net_device_ops);
1630                 if (ret != 0) {
1631                         rte_exit(EXIT_FAILURE,
1632                                 "failed to register vhost driver callbacks.\n");
1633                 }
1634
1635                 if (rte_vhost_driver_start(file) < 0) {
1636                         rte_exit(EXIT_FAILURE,
1637                                 "failed to start vhost driver.\n");
1638                 }
1639         }
1640
1641         RTE_LCORE_FOREACH_WORKER(lcore_id)
1642                 rte_eal_wait_lcore(lcore_id);
1643
1644         return 0;
1645
1646 }