ab649bf147e16b3267a6a2b4e337f47f0598408a
[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
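/* RTE_MBUF_DEFAULT_BUF_SIZE is a 2048-byte data room plus RTE_PKTMBUF_HEADROOM (128 bytes by default). */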
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66
67 /* Number of devices/queues to support. */
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76         VM2VM_DISABLED = 0,
77         VM2VM_SOFTWARE = 1,
78         VM2VM_HARDWARE = 2,
79         VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93
94 static int client_mode;
95 static int dequeue_zero_copy;
96
97 static int builtin_net_driver;
98
99 /* Specify timeout (in microseconds) between retries on RX. */
100 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
101 /* Specify the number of retries on RX. */
102 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
103
104 /* Socket file paths. Can be set by user */
105 static char *socket_files;
106 static int nb_sockets;
107
108 /* Empty VMDQ configuration structure. Filled in programmatically. */
109 static struct rte_eth_conf vmdq_conf_default = {
110         .rxmode = {
111                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
112                 .split_hdr_size = 0,
113                 /*
114                  * VLAN strip is necessary for 1G NICs such as I350;
115                  * it fixes a bug where IPv4 forwarding in the guest
116                  * could not forward packets from one virtio device to another.
117                  */
118                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
119         },
120
121         .txmode = {
122                 .mq_mode = ETH_MQ_TX_NONE,
123                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
124                              DEV_TX_OFFLOAD_TCP_CKSUM |
125                              DEV_TX_OFFLOAD_VLAN_INSERT |
126                              DEV_TX_OFFLOAD_MULTI_SEGS |
127                              DEV_TX_OFFLOAD_TCP_TSO),
128         },
129         .rx_adv_conf = {
130                 /*
131                  * should be overridden separately in code with
132                  * appropriate values
133                  */
134                 .vmdq_rx_conf = {
135                         .nb_queue_pools = ETH_8_POOLS,
136                         .enable_default_pool = 0,
137                         .default_pool = 0,
138                         .nb_pool_maps = 0,
139                         .pool_map = {{0, 0},},
140                 },
141         },
142 };
143
144
145 static unsigned lcore_ids[RTE_MAX_LCORE];
146 static uint16_t ports[RTE_MAX_ETHPORTS];
147 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
148 static uint16_t num_pf_queues, num_vmdq_queues;
149 static uint16_t vmdq_pool_base, vmdq_queue_base;
150 static uint16_t queues_per_pool;
151
152 const uint16_t vlan_tags[] = {
153         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
154         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
155         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
156         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
157         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
158         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
159         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
160         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
161 };
162
163 /* ethernet addresses of ports */
164 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
165
166 static struct vhost_dev_tailq_list vhost_dev_list =
167         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
168
169 static struct lcore_info lcore_info[RTE_MAX_LCORE];
170
171 /* Used for queueing bursts of TX packets. */
172 struct mbuf_table {
173         unsigned len;
174         unsigned txq_id;
175         struct rte_mbuf *m_table[MAX_PKT_BURST];
176 };
177
178 /* TX queue for each data core. */
179 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
180
181 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
182                                  / US_PER_S * BURST_TX_DRAIN_US)
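/*
 * Example: with a hypothetical 2 GHz TSC, the expression above rounds the
 * per-microsecond cycle count up to 2000, giving a drain interval of
 * 2000 * BURST_TX_DRAIN_US = 200,000 cycles, i.e. ~100us.
 */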
183 #define VLAN_HLEN       4
184
185 /*
186  * Builds up the correct configuration for VMDQ VLAN pool map
187  * according to the pool & queue limits.
188  */
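/*
 * Example: with num_devices = 8, pool i (i = 0..7) is keyed by VLAN id
 * vlan_tags[i] (1000 + i) and selected by the bitmask (1UL << i), so each
 * virtio device gets a dedicated VMDQ pool with its own VLAN tag.
 */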
189 static inline int
190 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
191 {
192         struct rte_eth_vmdq_rx_conf conf;
193         struct rte_eth_vmdq_rx_conf *def_conf =
194                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
195         unsigned i;
196
197         memset(&conf, 0, sizeof(conf));
198         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
199         conf.nb_pool_maps = num_devices;
200         conf.enable_loop_back = def_conf->enable_loop_back;
201         conf.rx_mode = def_conf->rx_mode;
202
203         for (i = 0; i < conf.nb_pool_maps; i++) {
204                 conf.pool_map[i].vlan_id = vlan_tags[i];
205                 conf.pool_map[i].pools = (1UL << i);
206         }
207
208         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
209         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
210                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
211         return 0;
212 }
213
214 /*
215  * Initialises a given port using global settings and with the rx buffers
216  * coming from the mbuf_pool passed as a parameter.
217  */
218 static inline int
219 port_init(uint16_t port)
220 {
221         struct rte_eth_dev_info dev_info;
222         struct rte_eth_conf port_conf;
223         struct rte_eth_rxconf *rxconf;
224         struct rte_eth_txconf *txconf;
225         int16_t rx_rings, tx_rings;
226         uint16_t rx_ring_size, tx_ring_size;
227         int retval;
228         uint16_t q;
229
230         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
231         retval = rte_eth_dev_info_get(port, &dev_info);
232         if (retval != 0) {
233                 RTE_LOG(ERR, VHOST_PORT,
234                         "Error during getting device (port %u) info: %s\n",
235                         port, strerror(-retval));
236
237                 return retval;
238         }
239
240         rxconf = &dev_info.default_rxconf;
241         txconf = &dev_info.default_txconf;
242         rxconf->rx_drop_en = 1;
243
244         /* Configure the number of supported virtio devices based on VMDQ limits. */
245         num_devices = dev_info.max_vmdq_pools;
246
247         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
248         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
249
250         /*
251          * When dequeue zero copy is enabled, guest Tx used vring will be
252          * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
253          * (tx_ring_size here) must be small enough so that the driver will
254          * hit the free threshold easily and free mbufs timely. Otherwise,
255          * guest Tx vring would be starved.
256          */
257         if (dequeue_zero_copy)
258                 tx_ring_size = 64;
259
260         tx_rings = (uint16_t)rte_lcore_count();
261
262         /* Get port configuration. */
263         retval = get_eth_conf(&port_conf, num_devices);
264         if (retval < 0)
265                 return retval;
266         /* NIC queues are divided into pf queues and vmdq queues.  */
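        /*
         * Example: a NIC reporting 128 Rx queues with a 64-queue VMDQ region
         * and 64 VMDQ pools gives 64 pf queues and queues_per_pool = 1,
         * i.e. one VMDQ Rx queue per virtio device.
         */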
267         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
268         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
269         num_vmdq_queues = num_devices * queues_per_pool;
270         num_queues = num_pf_queues + num_vmdq_queues;
271         vmdq_queue_base = dev_info.vmdq_queue_base;
272         vmdq_pool_base  = dev_info.vmdq_pool_base;
273         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
274                 num_pf_queues, num_devices, queues_per_pool);
275
276         if (!rte_eth_dev_is_valid_port(port))
277                 return -1;
278
279         rx_rings = (uint16_t)dev_info.max_rx_queues;
280         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
281                 port_conf.txmode.offloads |=
282                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
283         /* Configure ethernet device. */
284         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
285         if (retval != 0) {
286                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
287                         port, strerror(-retval));
288                 return retval;
289         }
290
291         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
292                 &tx_ring_size);
293         if (retval != 0) {
294                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
295                         "for port %u: %s.\n", port, strerror(-retval));
296                 return retval;
297         }
298         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
299                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
300                         "for Rx queues on port %u.\n", port);
301                 return -1;
302         }
303
304         /* Setup the queues. */
305         rxconf->offloads = port_conf.rxmode.offloads;
306         for (q = 0; q < rx_rings; q++) {
307                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
308                                                 rte_eth_dev_socket_id(port),
309                                                 rxconf,
310                                                 mbuf_pool);
311                 if (retval < 0) {
312                         RTE_LOG(ERR, VHOST_PORT,
313                                 "Failed to setup rx queue %u of port %u: %s.\n",
314                                 q, port, strerror(-retval));
315                         return retval;
316                 }
317         }
318         txconf->offloads = port_conf.txmode.offloads;
319         for (q = 0; q < tx_rings; q++) {
320                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
321                                                 rte_eth_dev_socket_id(port),
322                                                 txconf);
323                 if (retval < 0) {
324                         RTE_LOG(ERR, VHOST_PORT,
325                                 "Failed to setup tx queue %u of port %u: %s.\n",
326                                 q, port, strerror(-retval));
327                         return retval;
328                 }
329         }
330
331         /* Start the device. */
332         retval = rte_eth_dev_start(port);
333         if (retval < 0) {
334                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
335                         port, strerror(-retval));
336                 return retval;
337         }
338
339         if (promiscuous) {
340                 retval = rte_eth_promiscuous_enable(port);
341                 if (retval != 0) {
342                         RTE_LOG(ERR, VHOST_PORT,
343                                 "Failed to enable promiscuous mode on port %u: %s\n",
344                                 port, rte_strerror(-retval));
345                         return retval;
346                 }
347         }
348
349         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
350         if (retval < 0) {
351                 RTE_LOG(ERR, VHOST_PORT,
352                         "Failed to get MAC address on port %u: %s\n",
353                         port, rte_strerror(-retval));
354                 return retval;
355         }
356
357         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
358         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
359                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
360                         port,
361                         vmdq_ports_eth_addr[port].addr_bytes[0],
362                         vmdq_ports_eth_addr[port].addr_bytes[1],
363                         vmdq_ports_eth_addr[port].addr_bytes[2],
364                         vmdq_ports_eth_addr[port].addr_bytes[3],
365                         vmdq_ports_eth_addr[port].addr_bytes[4],
366                         vmdq_ports_eth_addr[port].addr_bytes[5]);
367
368         return 0;
369 }
370
371 /*
372  * Set socket file path.
373  */
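/*
 * Paths are stored in a flat array of PATH_MAX-sized slots: the path for
 * socket i lives at socket_files + i * PATH_MAX.
 */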
374 static int
375 us_vhost_parse_socket_path(const char *q_arg)
376 {
377         char *old;
378
379         /* Reject paths too long to fit in PATH_MAX bytes. */
380         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
381                 return -1;
382
383         old = socket_files;
384         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
385         if (socket_files == NULL) {
386                 free(old);
387                 return -1;
388         }
389
390         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
391         nb_sockets++;
392
393         return 0;
394 }
395
396 /*
397  * Parse the portmask provided at run time.
398  */
399 static int
400 parse_portmask(const char *portmask)
401 {
402         char *end = NULL;
403         unsigned long pm;
404
405         errno = 0;
406
407         /* parse hexadecimal string */
408         pm = strtoul(portmask, &end, 16);
409         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
410                 return 0;
411
412         if (pm == 0)
413                 return 0;
414
415         return pm;
416
417 }
418
419 /*
420  * Parse num options at run time.
421  */
422 static int
423 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
424 {
425         char *end = NULL;
426         unsigned long num;
427
428         errno = 0;
429
430         /* parse unsigned int string */
431         num = strtoul(q_arg, &end, 10);
432         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
433                 return -1;
434
435         if (num > max_valid_value)
436                 return -1;
437
438         return num;
439
440 }
441
442 /*
443  * Display usage
444  */
445 static void
446 us_vhost_usage(const char *prgname)
447 {
448         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
449         "               --vm2vm [0|1|2]\n"
450         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
451         "               --socket-file <path>\n"
452         "               --nb-devices ND\n"
453         "               -p PORTMASK: Set mask for ports to be used by application\n"
454         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
455         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
456         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
457         "               --rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
458         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
459         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
460         "               --socket-file: The path of the socket file.\n"
461         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
462         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
463         "               --client register a vhost-user socket as client mode.\n"
464         "               --dequeue-zero-copy enables dequeue zero copy\n",
465                prgname);
466 }
467
468 /*
469  * Parse the arguments given in the command line of the application.
470  */
471 static int
472 us_vhost_parse_args(int argc, char **argv)
473 {
474         int opt, ret;
475         int option_index;
476         unsigned i;
477         const char *prgname = argv[0];
478         static struct option long_option[] = {
479                 {"vm2vm", required_argument, NULL, 0},
480                 {"rx-retry", required_argument, NULL, 0},
481                 {"rx-retry-delay", required_argument, NULL, 0},
482                 {"rx-retry-num", required_argument, NULL, 0},
483                 {"mergeable", required_argument, NULL, 0},
484                 {"stats", required_argument, NULL, 0},
485                 {"socket-file", required_argument, NULL, 0},
486                 {"tx-csum", required_argument, NULL, 0},
487                 {"tso", required_argument, NULL, 0},
488                 {"client", no_argument, &client_mode, 1},
489                 {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
490                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
491                 {NULL, 0, 0, 0},
492         };
493
494         /* Parse command line */
495         while ((opt = getopt_long(argc, argv, "p:P",
496                         long_option, &option_index)) != EOF) {
497                 switch (opt) {
498                 /* Portmask */
499                 case 'p':
500                         enabled_port_mask = parse_portmask(optarg);
501                         if (enabled_port_mask == 0) {
502                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
503                                 us_vhost_usage(prgname);
504                                 return -1;
505                         }
506                         break;
507
508                 case 'P':
509                         promiscuous = 1;
510                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
511                                 ETH_VMDQ_ACCEPT_BROADCAST |
512                                 ETH_VMDQ_ACCEPT_MULTICAST;
513
514                         break;
515
516                 case 0:
517                         /* Enable/disable vm2vm comms. */
518                         if (!strncmp(long_option[option_index].name, "vm2vm",
519                                 MAX_LONG_OPT_SZ)) {
520                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
521                                 if (ret == -1) {
522                                         RTE_LOG(INFO, VHOST_CONFIG,
523                                                 "Invalid argument for "
524                                                 "vm2vm [0|1|2]\n");
525                                         us_vhost_usage(prgname);
526                                         return -1;
527                                 } else {
528                                         vm2vm_mode = (vm2vm_type)ret;
529                                 }
530                         }
531
532                         /* Enable/disable retries on RX. */
533                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
534                                 ret = parse_num_opt(optarg, 1);
535                                 if (ret == -1) {
536                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
537                                         us_vhost_usage(prgname);
538                                         return -1;
539                                 } else {
540                                         enable_retry = ret;
541                                 }
542                         }
543
544                         /* Enable/disable TX checksum offload. */
545                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
546                                 ret = parse_num_opt(optarg, 1);
547                                 if (ret == -1) {
548                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
549                                         us_vhost_usage(prgname);
550                                         return -1;
551                                 } else
552                                         enable_tx_csum = ret;
553                         }
554
555                         /* Enable/disable TSO offload. */
556                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
557                                 ret = parse_num_opt(optarg, 1);
558                                 if (ret == -1) {
559                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
560                                         us_vhost_usage(prgname);
561                                         return -1;
562                                 } else
563                                         enable_tso = ret;
564                         }
565
566                         /* Specify the retry delay time (in microseconds) on RX. */
567                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
568                                 ret = parse_num_opt(optarg, INT32_MAX);
569                                 if (ret == -1) {
570                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
571                                         us_vhost_usage(prgname);
572                                         return -1;
573                                 } else {
574                                         burst_rx_delay_time = ret;
575                                 }
576                         }
577
578                         /* Specify the retries number on RX. */
579                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
580                                 ret = parse_num_opt(optarg, INT32_MAX);
581                                 if (ret == -1) {
582                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
583                                         us_vhost_usage(prgname);
584                                         return -1;
585                                 } else {
586                                         burst_rx_retry_num = ret;
587                                 }
588                         }
589
590                         /* Enable/disable RX mergeable buffers. */
591                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
592                                 ret = parse_num_opt(optarg, 1);
593                                 if (ret == -1) {
594                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
595                                         us_vhost_usage(prgname);
596                                         return -1;
597                                 } else {
598                                         mergeable = !!ret;
599                                         if (ret) {
600                                                 vmdq_conf_default.rxmode.offloads |=
601                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
602                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
603                                                         = JUMBO_FRAME_MAX_SIZE;
604                                         }
605                                 }
606                         }
607
608                         /* Enable/disable stats. */
609                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
610                                 ret = parse_num_opt(optarg, INT32_MAX);
611                                 if (ret == -1) {
612                                         RTE_LOG(INFO, VHOST_CONFIG,
613                                                 "Invalid argument for stats [0..N]\n");
614                                         us_vhost_usage(prgname);
615                                         return -1;
616                                 } else {
617                                         enable_stats = ret;
618                                 }
619                         }
620
621                         /* Set socket file path. */
622                         if (!strncmp(long_option[option_index].name,
623                                                 "socket-file", MAX_LONG_OPT_SZ)) {
624                                 if (us_vhost_parse_socket_path(optarg) == -1) {
625                                         RTE_LOG(INFO, VHOST_CONFIG,
626                                         "Invalid argument for socket name (Max %d characters)\n",
627                                         PATH_MAX);
628                                         us_vhost_usage(prgname);
629                                         return -1;
630                                 }
631                         }
632
633                         break;
634
635                         /* Invalid option - print options. */
636                 default:
637                         us_vhost_usage(prgname);
638                         return -1;
639                 }
640         }
641
642         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
643                 if (enabled_port_mask & (1 << i))
644                         ports[num_ports++] = i;
645         }
646
647         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
648                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
649                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
650                 return -1;
651         }
652
653         return 0;
654 }
655
656 /*
657  * Update the global variable num_ports and the ports array according to the
658  * number of system ports, and return the number of valid ports.
659  */
660 static unsigned check_ports_num(unsigned nb_ports)
661 {
662         unsigned valid_num_ports = num_ports;
663         unsigned portid;
664
665         if (num_ports > nb_ports) {
666                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
667                         num_ports, nb_ports);
668                 num_ports = nb_ports;
669         }
670
671         for (portid = 0; portid < num_ports; portid++) {
672                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
673                         RTE_LOG(INFO, VHOST_PORT,
674                                 "\nSpecified port ID(%u) is not valid\n",
675                                 ports[portid]);
676                         ports[portid] = INVALID_PORT_ID;
677                         valid_num_ports--;
678                 }
679         }
680         return valid_num_ports;
681 }
682
683 static __rte_always_inline struct vhost_dev *
684 find_vhost_dev(struct rte_ether_addr *mac)
685 {
686         struct vhost_dev *vdev;
687
688         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
689                 if (vdev->ready == DEVICE_RX &&
690                     rte_is_same_ether_addr(mac, &vdev->mac_address))
691                         return vdev;
692         }
693
694         return NULL;
695 }
696
697 /*
698  * This function learns the MAC address of the guest device and registers it,
699  * along with a VLAN tag, with a VMDQ pool.
700  */
701 static int
702 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
703 {
704         struct rte_ether_hdr *pkt_hdr;
705         int i, ret;
706
707         /* Learn MAC address of guest device from packet */
708         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
709
710         if (find_vhost_dev(&pkt_hdr->s_addr)) {
711                 RTE_LOG(ERR, VHOST_DATA,
712                         "(%d) device is using a registered MAC!\n",
713                         vdev->vid);
714                 return -1;
715         }
716
717         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
718                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
719
720         /* vlan_tag currently uses the device_id. */
721         vdev->vlan_tag = vlan_tags[vdev->vid];
722
723         /* Print out VMDQ registration info. */
724         RTE_LOG(INFO, VHOST_DATA,
725                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
726                 vdev->vid,
727                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
728                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
729                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
730                 vdev->vlan_tag);
731
732         /* Register the MAC address. */
733         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
734                                 (uint32_t)vdev->vid + vmdq_pool_base);
735         if (ret)
736                 RTE_LOG(ERR, VHOST_DATA,
737                         "(%d) failed to add device MAC address to VMDQ\n",
738                         vdev->vid);
739
740         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
741
742         /* Set device as ready for RX. */
743         vdev->ready = DEVICE_RX;
744
745         return 0;
746 }
747
748 /*
749  * Removes the MAC address and VLAN tag from the VMDQ pool, and drains any
750  * buffers remaining on the RX queue before disabling RX on the device.
751  */
752 static inline void
753 unlink_vmdq(struct vhost_dev *vdev)
754 {
755         unsigned i = 0;
756         unsigned rx_count;
757         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
758
759         if (vdev->ready == DEVICE_RX) {
760                 /* Clear MAC and VLAN settings. */
761                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
762                 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
763                         vdev->mac_address.addr_bytes[i] = 0;
764
765                 vdev->vlan_tag = 0;
766
767                 /* Clear out the receive buffers. */
768                 rx_count = rte_eth_rx_burst(ports[0],
769                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
770
771                 while (rx_count) {
772                         for (i = 0; i < rx_count; i++)
773                                 rte_pktmbuf_free(pkts_burst[i]);
774
775                         rx_count = rte_eth_rx_burst(ports[0],
776                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
777                 }
778
779                 vdev->ready = DEVICE_MAC_LEARNING;
780         }
781 }
782
783 static __rte_always_inline void
784 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
785             struct rte_mbuf *m)
786 {
787         uint16_t ret;
788
789         if (builtin_net_driver) {
790                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
791         } else {
792                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
793         }
794
795         if (enable_stats) {
796                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
797                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
798                 src_vdev->stats.tx_total++;
799                 src_vdev->stats.tx += ret;
800         }
801 }
802
803 /*
804  * Check if the packet destination MAC address is for a local device. If so,
805  * put the packet on that device's RX queue. If not, return.
806  */
807 static __rte_always_inline int
808 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
809 {
810         struct rte_ether_hdr *pkt_hdr;
811         struct vhost_dev *dst_vdev;
812
813         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
814
815         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
816         if (!dst_vdev)
817                 return -1;
818
819         if (vdev->vid == dst_vdev->vid) {
820                 RTE_LOG_DP(DEBUG, VHOST_DATA,
821                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
822                         vdev->vid);
823                 return 0;
824         }
825
826         RTE_LOG_DP(DEBUG, VHOST_DATA,
827                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
828
829         if (unlikely(dst_vdev->remove)) {
830                 RTE_LOG_DP(DEBUG, VHOST_DATA,
831                         "(%d) device is marked for removal\n", dst_vdev->vid);
832                 return 0;
833         }
834
835         virtio_xmit(dst_vdev, vdev, m);
836         return 0;
837 }
838
839 /*
840  * Check if the destination MAC of a packet belongs to a local VM; if so,
841  * return its VLAN tag and the length offset.
842  */
843 static __rte_always_inline int
844 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
845         uint32_t *offset, uint16_t *vlan_tag)
846 {
847         struct vhost_dev *dst_vdev;
848         struct rte_ether_hdr *pkt_hdr =
849                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
850
851         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
852         if (!dst_vdev)
853                 return 0;
854
855         if (vdev->vid == dst_vdev->vid) {
856                 RTE_LOG_DP(DEBUG, VHOST_DATA,
857                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
858                         vdev->vid);
859                 return -1;
860         }
861
862         /*
863          * HW VLAN strip reduces the packet length by the length
864          * of the VLAN tag, so the packet length must be restored
865          * by adding it back.
866          */
867         *offset  = VLAN_HLEN;
868         *vlan_tag = vlan_tags[vdev->vid];
869
870         RTE_LOG_DP(DEBUG, VHOST_DATA,
871                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
872                 vdev->vid, dst_vdev->vid, *vlan_tag);
873
874         return 0;
875 }
876
877 static uint16_t
878 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
879 {
880         if (ol_flags & PKT_TX_IPV4)
881                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
882         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
883                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
884 }
885
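/*
 * Prepare a packet for TSO: zero the IPv4 header checksum (the NIC
 * recomputes it for each segment) and seed the TCP checksum field with
 * the pseudo-header sum, as the mbuf TX offload convention requires
 * when PKT_TX_TCP_SEG is set.
 */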
886 static void virtio_tx_offload(struct rte_mbuf *m)
887 {
888         void *l3_hdr;
889         struct rte_ipv4_hdr *ipv4_hdr = NULL;
890         struct rte_tcp_hdr *tcp_hdr = NULL;
891         struct rte_ether_hdr *eth_hdr =
892                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
893
894         l3_hdr = (char *)eth_hdr + m->l2_len;
895
896         if (m->ol_flags & PKT_TX_IPV4) {
897                 ipv4_hdr = l3_hdr;
898                 ipv4_hdr->hdr_checksum = 0;
899                 m->ol_flags |= PKT_TX_IP_CKSUM;
900         }
901
902         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
903         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
904 }
905
906 static inline void
907 free_pkts(struct rte_mbuf **pkts, uint16_t n)
908 {
909         while (n--)
910                 rte_pktmbuf_free(pkts[n]);
911 }
912
913 static __rte_always_inline void
914 do_drain_mbuf_table(struct mbuf_table *tx_q)
915 {
916         uint16_t count;
917
918         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
919                                  tx_q->m_table, tx_q->len);
920         if (unlikely(count < tx_q->len))
921                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
922
923         tx_q->len = 0;
924 }
925
926 /*
927  * This function routes the TX packet to the correct interface. This
928  * may be a local device or the physical port.
929  */
930 static __rte_always_inline void
931 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
932 {
933         struct mbuf_table *tx_q;
934         unsigned offset = 0;
935         const uint16_t lcore_id = rte_lcore_id();
936         struct rte_ether_hdr *nh;
937
938
939         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
940         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
941                 struct vhost_dev *vdev2;
942
943                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
944                         if (vdev2 != vdev)
945                                 virtio_xmit(vdev2, vdev, m);
946                 }
947                 goto queue2nic;
948         }
949
950         /* Check if the destination is a local VM. */
951         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
952                 rte_pktmbuf_free(m);
953                 return;
954         }
955
956         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
957                 if (unlikely(find_local_dest(vdev, m, &offset,
958                                              &vlan_tag) != 0)) {
959                         rte_pktmbuf_free(m);
960                         return;
961                 }
962         }
963
964         RTE_LOG_DP(DEBUG, VHOST_DATA,
965                 "(%d) TX: MAC address is external\n", vdev->vid);
966
967 queue2nic:
968
969         /* Add packet to the port TX queue. */
970         tx_q = &lcore_tx_queue[lcore_id];
971
972         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
973         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
974                 /* Guest has inserted the vlan tag. */
975                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
976                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
977                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
978                         (vh->vlan_tci != vlan_tag_be))
979                         vh->vlan_tci = vlan_tag_be;
980         } else {
981                 m->ol_flags |= PKT_TX_VLAN_PKT;
982
983                 /*
984                  * Find the right seg to adjust the data len when offset is
985                  * bigger than tail room size.
986                  */
987                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
988                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
989                                 m->data_len += offset;
990                         else {
991                                 struct rte_mbuf *seg = m;
992
993                                 while ((seg->next != NULL) &&
994                                         (offset > rte_pktmbuf_tailroom(seg)))
995                                         seg = seg->next;
996
997                                 seg->data_len += offset;
998                         }
999                         m->pkt_len += offset;
1000                 }
1001
1002                 m->vlan_tci = vlan_tag;
1003         }
1004
1005         if (m->ol_flags & PKT_TX_TCP_SEG)
1006                 virtio_tx_offload(m);
1007
1008         tx_q->m_table[tx_q->len++] = m;
1009         if (enable_stats) {
1010                 vdev->stats.tx_total++;
1011                 vdev->stats.tx++;
1012         }
1013
1014         if (unlikely(tx_q->len == MAX_PKT_BURST))
1015                 do_drain_mbuf_table(tx_q);
1016 }
1017
1018
1019 static __rte_always_inline void
1020 drain_mbuf_table(struct mbuf_table *tx_q)
1021 {
1022         static uint64_t prev_tsc;
1023         uint64_t cur_tsc;
1024
1025         if (tx_q->len == 0)
1026                 return;
1027
1028         cur_tsc = rte_rdtsc();
1029         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1030                 prev_tsc = cur_tsc;
1031
1032                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1033                         "TX queue drained after timeout with burst size %u\n",
1034                         tx_q->len);
1035                 do_drain_mbuf_table(tx_q);
1036         }
1037 }
1038
1039 static __rte_always_inline void
1040 drain_eth_rx(struct vhost_dev *vdev)
1041 {
1042         uint16_t rx_count, enqueue_count;
1043         struct rte_mbuf *pkts[MAX_PKT_BURST];
1044
1045         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1046                                     pkts, MAX_PKT_BURST);
1047         if (!rx_count)
1048                 return;
1049
1050         /*
1051          * When "enable_retry" is set, wait and retry when there are
1052          * not enough free slots in the queue to hold @rx_count packets,
1053          * to diminish packet loss.
1054          */
1055         if (enable_retry &&
1056             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1057                         VIRTIO_RXQ))) {
1058                 uint32_t retry;
1059
1060                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1061                         rte_delay_us(burst_rx_delay_time);
1062                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1063                                         VIRTIO_RXQ))
1064                                 break;
1065                 }
1066         }
1067
1068         if (builtin_net_driver) {
1069                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1070                                                 pkts, rx_count);
1071         } else {
1072                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1073                                                 pkts, rx_count);
1074         }
1075         if (enable_stats) {
1076                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1077                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1078         }
1079
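        /*
         * The enqueue path copies packets into the guest ring, so the host
         * mbufs can be freed unconditionally here.
         */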
1080         free_pkts(pkts, rx_count);
1081 }
1082
1083 static __rte_always_inline void
1084 drain_virtio_tx(struct vhost_dev *vdev)
1085 {
1086         struct rte_mbuf *pkts[MAX_PKT_BURST];
1087         uint16_t count;
1088         uint16_t i;
1089
1090         if (builtin_net_driver) {
1091                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1092                                         pkts, MAX_PKT_BURST);
1093         } else {
1094                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1095                                         mbuf_pool, pkts, MAX_PKT_BURST);
1096         }
1097
1098         /* Setup VMDQ for the first packet. */
1099         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1100                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
1101                         free_pkts(pkts, count);
                        return;
                }
1102         }
1103
1104         for (i = 0; i < count; ++i)
1105                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1106 }
1107
1108 /*
1109  * Main function of vhost-switch. It basically does:
1110  *
1111  * for each vhost device {
1112  *    - drain_eth_rx()
1113  *
1114  *      Which drains the host eth Rx queue linked to the vhost device,
1115  *      and delivers the packets to the guest virtio Rx ring associated
1116  *      with this vhost device.
1117  *
1118  *    - drain_virtio_tx()
1119  *
1120  *      Which drains the guest virtio Tx queue and delivers the packets
1121  *      to the target, which could be another vhost device or the
1122  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1123  * }
1124  */
1125 static int
1126 switch_worker(void *arg __rte_unused)
1127 {
1128         unsigned i;
1129         unsigned lcore_id = rte_lcore_id();
1130         struct vhost_dev *vdev;
1131         struct mbuf_table *tx_q;
1132
1133         RTE_LOG(INFO, VHOST_DATA, "Processing on core %u started\n", lcore_id);
1134
1135         tx_q = &lcore_tx_queue[lcore_id];
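        /* Use this lcore's position in the enabled-core list as its TX queue id. */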
1136         for (i = 0; i < rte_lcore_count(); i++) {
1137                 if (lcore_ids[i] == lcore_id) {
1138                         tx_q->txq_id = i;
1139                         break;
1140                 }
1141         }
1142
1143         while (1) {
1144                 drain_mbuf_table(tx_q);
1145
1146                 /*
1147                  * If requested, inform the configuration core that we have
1148                  * exited the linked list and that no devices are in use.
1149                  */
1150                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1151                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1152
1153                 /*
1154                  * Process vhost devices
1155                  */
1156                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1157                               lcore_vdev_entry) {
1158                         if (unlikely(vdev->remove)) {
1159                                 unlink_vmdq(vdev);
1160                                 vdev->ready = DEVICE_SAFE_REMOVE;
1161                                 continue;
1162                         }
1163
1164                         if (likely(vdev->ready == DEVICE_RX))
1165                                 drain_eth_rx(vdev);
1166
1167                         if (likely(!vdev->remove))
1168                                 drain_virtio_tx(vdev);
1169                 }
1170         }
1171
1172         return 0;
1173 }
1174
1175 /*
1176  * Remove a device from the specific data core linked list and from the
1177  * main linked list. Synchronization occurs through the use of the
1178  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1179  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1180  */
1181 static void
1182 destroy_device(int vid)
1183 {
1184         struct vhost_dev *vdev = NULL;
1185         int lcore;
1186
1187         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1188                 if (vdev->vid == vid)
1189                         break;
1190         }
1191         if (!vdev)
1192                 return;
1193         /* Set the remove flag. */
1194         vdev->remove = 1;
1195         while (vdev->ready != DEVICE_SAFE_REMOVE) {
1196                 rte_pause();
1197         }
1198
1199         if (builtin_net_driver)
1200                 vs_vhost_net_remove(vdev);
1201
1202         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1203                      lcore_vdev_entry);
1204         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1205
1206
1207         /* Set the dev_removal_flag on each lcore. */
1208         RTE_LCORE_FOREACH_SLAVE(lcore)
1209                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1210
1211         /*
1212          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1213          * we can be sure that they can no longer access the device removed
1214          * from the linked lists and that the devices are no longer in use.
1215          */
1216         RTE_LCORE_FOREACH_SLAVE(lcore) {
1217                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1218                         rte_pause();
1219         }
1220
1221         lcore_info[vdev->coreid].device_num--;
1222
1223         RTE_LOG(INFO, VHOST_DATA,
1224                 "(%d) device has been removed from data core\n",
1225                 vdev->vid);
1226
1227         rte_free(vdev);
1228 }
1229
1230 /*
1231  * A new device is added to a data core. First the device is added to the main linked list
1232  * and then allocated to a specific data core.
1233  */
1234 static int
1235 new_device(int vid)
1236 {
1237         int lcore, core_add = 0;
1238         uint32_t device_num_min = num_devices;
1239         struct vhost_dev *vdev;
1240
1241         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1242         if (vdev == NULL) {
1243                 RTE_LOG(INFO, VHOST_DATA,
1244                         "(%d) couldn't allocate memory for vhost dev\n",
1245                         vid);
1246                 return -1;
1247         }
1248         vdev->vid = vid;
1249
1250         if (builtin_net_driver)
1251                 vs_vhost_net_setup(vdev);
1252
1253         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
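        /*
         * Each device owns one VMDQ pool; its Rx queue index is
         * vmdq_queue_base + vid * queues_per_pool (see port_init()).
         */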
1254         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1255
1256         /* Reset the ready flag. */
1257         vdev->ready = DEVICE_MAC_LEARNING;
1258         vdev->remove = 0;
1259
1260         /* Find a suitable lcore to add the device. */
1261         RTE_LCORE_FOREACH_SLAVE(lcore) {
1262                 if (lcore_info[lcore].device_num < device_num_min) {
1263                         device_num_min = lcore_info[lcore].device_num;
1264                         core_add = lcore;
1265                 }
1266         }
1267         vdev->coreid = core_add;
1268
1269         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1270                           lcore_vdev_entry);
1271         lcore_info[vdev->coreid].device_num++;
1272
1273         /* Disable notifications. */
1274         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1275         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1276
1277         RTE_LOG(INFO, VHOST_DATA,
1278                 "(%d) device has been added to data core %d\n",
1279                 vid, vdev->coreid);
1280
1281         return 0;
1282 }
1283
1284 /*
1285  * These callbacks allow devices to be added to the data core when
1286  * configuration has fully completed.
1287  */
1288 static const struct vhost_device_ops virtio_net_device_ops =
1289 {
1290         .new_device =  new_device,
1291         .destroy_device = destroy_device,
1292 };
1293
1294 /*
1295  * This thread wakes up periodically to print stats if the user has
1296  * enabled them.
1297  */
1298 static void *
1299 print_stats(__rte_unused void *arg)
1300 {
1301         struct vhost_dev *vdev;
1302         uint64_t tx_dropped, rx_dropped;
1303         uint64_t tx, tx_total, rx, rx_total;
1304         const char clr[] = { 27, '[', '2', 'J', '\0' };
1305         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1306
1307         while (1) {
1308                 sleep(enable_stats);
1309
1310                 /* Clear screen and move to top left */
1311                 printf("%s%s\n", clr, top_left);
1312                 printf("Device statistics =================================\n");
1313
1314                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1315                         tx_total   = vdev->stats.tx_total;
1316                         tx         = vdev->stats.tx;
1317                         tx_dropped = tx_total - tx;
1318
1319                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1320                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1321                         rx_dropped = rx_total - rx;
1322
1323                         printf("Statistics for device %d\n"
1324                                 "-----------------------\n"
1325                                 "TX total:              %" PRIu64 "\n"
1326                                 "TX dropped:            %" PRIu64 "\n"
1327                                 "TX successful:         %" PRIu64 "\n"
1328                                 "RX total:              %" PRIu64 "\n"
1329                                 "RX dropped:            %" PRIu64 "\n"
1330                                 "RX successful:         %" PRIu64 "\n",
1331                                 vdev->vid,
1332                                 tx_total, tx_dropped, tx,
1333                                 rx_total, rx_dropped, rx);
1334                 }
1335
1336                 printf("===================================================\n");
1337         }
1338
1339         return NULL;
1340 }
1341
1342 static void
1343 unregister_drivers(int socket_num)
1344 {
1345         int i, ret;
1346
1347         for (i = 0; i < socket_num; i++) {
1348                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1349                 if (ret != 0)
1350                         RTE_LOG(ERR, VHOST_CONFIG,
1351                                 "Failed to unregister vhost driver for %s.\n",
1352                                 socket_files + i * PATH_MAX);
1353         }
1354 }
1355
1356 /* When we receive a SIGINT, unregister the vhost driver. */
1357 static void
1358 sigint_handler(__rte_unused int signum)
1359 {
1360         /* Unregister vhost driver. */
1361         unregister_drivers(nb_sockets);
1362
1363         exit(0);
1364 }
1365
1366 /*
1367  * While creating an mbuf pool, one key thing is to figure out how
1368  * many mbuf entries are enough for our use. FYI, here are some
1369  * guidelines:
1370  *
1371  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1372  *
1373  * - For each switch core (a CPU core that does the packet switching),
1374  *   we also need to reserve some mbufs for receiving packets from the
1375  *   virtio Tx queue. How many are enough depends on the usage; it is
1376  *   normally a simple calculation like the following:
1377  *
1378  *       MAX_PKT_BURST * max packet size / mbuf size
1379  *
1380  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1381  *
1382  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1383  *   mbufs for receiving the packets from the physical NIC device.
1384  *
1385  * - We also need to make sure that, for each switch core, we have
1386  *   allocated enough mbufs to fill up the mbuf cache.
1387  */
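/*
 * Worked example (assuming MAX_PKT_BURST is 32 and the default ~2KB mbuf
 * data room): with TSO enabled, "max packet size" is 64KB, so each switch
 * core may hold roughly 32 * 64K / 2K, i.e. about a thousand mbufs in
 * flight, on top of the @nr_rx_desc reserved per Rx queue and the
 * @nr_mbuf_cache entries kept per core.
 */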
1388 static void
1389 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1390         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1391 {
1392         uint32_t nr_mbufs;
1393         uint32_t nr_mbufs_per_core;
1394         uint32_t mtu = 1500;
1395
1396         if (mergeable)
1397                 mtu = 9000;
1398         if (enable_tso)
1399                 mtu = 64 * 1024;
1400
1401         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1402                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1403         nr_mbufs_per_core += nr_rx_desc;
1404         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1405
1406         nr_mbufs  = nr_queues * nr_rx_desc;
1407         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1408         nr_mbufs *= nr_port;
1409
1410         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1411                                             nr_mbuf_cache, 0, mbuf_size,
1412                                             rte_socket_id());
1413         if (mbuf_pool == NULL)
1414                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1415 }
1416
1417 /*
1418  * Main function, does initialisation and calls the per-lcore functions.
1419  */
1420 int
1421 main(int argc, char *argv[])
1422 {
1423         unsigned lcore_id, core_id = 0;
1424         unsigned nb_ports, valid_num_ports;
1425         int ret, i;
1426         uint16_t portid;
1427         static pthread_t tid;
1428         uint64_t flags = 0;
1429
1430         signal(SIGINT, sigint_handler);
1431
1432         /* init EAL */
1433         ret = rte_eal_init(argc, argv);
1434         if (ret < 0)
1435                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1436         argc -= ret;
1437         argv += ret;
1438
1439         /* parse app arguments */
1440         ret = us_vhost_parse_args(argc, argv);
1441         if (ret < 0)
1442                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1443
1444         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1445                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1446
1447                 if (rte_lcore_is_enabled(lcore_id))
1448                         lcore_ids[core_id++] = lcore_id;
1449         }
1450
1451         if (rte_lcore_count() > RTE_MAX_LCORE)
1452                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1453
1454         /* Get the number of physical ports. */
1455         nb_ports = rte_eth_dev_count_avail();
1456
1457         /*
1458          * Update the global variable num_ports and the global array ports,
1459          * and get the number of valid ports according to the system port count.
1460          */
1461         valid_num_ports = check_ports_num(nb_ports);
1462
1463         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1464                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1465                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1466                 return -1;
1467         }
1468
1469         /*
1470          * FIXME: here we are trying to allocate mbufs big enough for
1471          * @MAX_QUEUES, but the truth is we're never going to use that
1472          * many queues here. We probably should only do allocation for
1473          * those queues we are going to use.
1474          */
1475         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1476                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1477
1478         if (vm2vm_mode == VM2VM_HARDWARE) {
1479                 /* Enable VT loopback so the NIC's embedded L2 switch forwards VM-to-VM traffic. */
1480                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1481                 RTE_LOG(DEBUG, VHOST_CONFIG,
1482                         "Enable loop back for L2 switch in vmdq.\n");
1483         }
1484
1485         /* initialize all ports */
1486         RTE_ETH_FOREACH_DEV(portid) {
1487                 /* skip ports that are not enabled */
1488                 if ((enabled_port_mask & (1 << portid)) == 0) {
1489                         RTE_LOG(INFO, VHOST_PORT,
1490                                 "Skipping disabled port %d\n", portid);
1491                         continue;
1492                 }
1493                 if (port_init(portid) != 0)
1494                         rte_exit(EXIT_FAILURE,
1495                                 "Cannot initialize network ports\n");
1496         }
1497
1498         /* Enable stats if the user option is set. */
1499         if (enable_stats) {
1500                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1501                                         print_stats, NULL);
1502                 if (ret < 0)
1503                         rte_exit(EXIT_FAILURE,
1504                                 "Cannot create print-stats thread\n");
1505         }
1506
1507         /* Launch all data cores. */
1508         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1509                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1510
1511         if (client_mode)
1512                 flags |= RTE_VHOST_USER_CLIENT;
1513
1514         if (dequeue_zero_copy)
1515                 flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1516
1517         /* Register vhost user driver to handle vhost messages. */
1518         for (i = 0; i < nb_sockets; i++) {
1519                 char *file = socket_files + i * PATH_MAX;
1520                 ret = rte_vhost_driver_register(file, flags);
1521                 if (ret != 0) {
1522                         unregister_drivers(i);
1523                         rte_exit(EXIT_FAILURE,
1524                                 "vhost driver register failure.\n");
1525                 }
1526
1527                 if (builtin_net_driver)
1528                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1529
1530                 if (mergeable == 0) {
1531                         rte_vhost_driver_disable_features(file,
1532                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1533                 }
1534
1535                 if (enable_tx_csum == 0) {
1536                         rte_vhost_driver_disable_features(file,
1537                                 1ULL << VIRTIO_NET_F_CSUM);
1538                 }
1539
1540                 if (enable_tso == 0) {
1541                         rte_vhost_driver_disable_features(file,
1542                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1543                         rte_vhost_driver_disable_features(file,
1544                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1545                         rte_vhost_driver_disable_features(file,
1546                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1547                         rte_vhost_driver_disable_features(file,
1548                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1549                 }
1550
1551                 if (promiscuous) {
1552                         rte_vhost_driver_enable_features(file,
1553                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1554                 }
1555
1556                 ret = rte_vhost_driver_callback_register(file,
1557                         &virtio_net_device_ops);
1558                 if (ret != 0) {
1559                         rte_exit(EXIT_FAILURE,
1560                                 "failed to register vhost driver callbacks.\n");
1561                 }
1562
1563                 if (rte_vhost_driver_start(file) < 0) {
1564                         rte_exit(EXIT_FAILURE,
1565                                 "failed to start vhost driver.\n");
1566                 }
1567         }
1568
1569         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1570                 rte_eal_wait_lcore(lcore_id);
1571
1572         return 0;
1573
1574 }