examples/vhost/main.c (dpdk.git)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
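/* 0x2600 = 9728 bytes: room for a 9000-byte jumbo MTU plus Ethernet/VLAN headers. */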
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66
67 /* number of devices/queues to support */
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76         VM2VM_DISABLED = 0,
77         VM2VM_SOFTWARE = 1,
78         VM2VM_HARDWARE = 2,
79         VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93
94 static int client_mode;
95 static int dequeue_zero_copy;
96
97 static int builtin_net_driver;
98
99 /* Specify timeout (in microseconds) between retries on RX. */
100 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
101 /* Specify the number of retries on RX. */
102 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
103
104 /* Socket file paths. Can be set by user */
105 static char *socket_files;
106 static int nb_sockets;
107
108 /* empty vmdq configuration structure. Filled in programmatically */
109 static struct rte_eth_conf vmdq_conf_default = {
110         .rxmode = {
111                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
112                 .split_hdr_size = 0,
113                 /*
114                  * VLAN strip is necessary for 1G NICs such as the I350;
115                  * it fixes a bug where IPv4 forwarding in the guest could
116                  * not forward packets from one virtio dev to another.
117                  */
118                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
119         },
120
121         .txmode = {
122                 .mq_mode = ETH_MQ_TX_NONE,
123                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
124                              DEV_TX_OFFLOAD_TCP_CKSUM |
125                              DEV_TX_OFFLOAD_VLAN_INSERT |
126                              DEV_TX_OFFLOAD_MULTI_SEGS |
127                              DEV_TX_OFFLOAD_TCP_TSO),
128         },
129         .rx_adv_conf = {
130                 /*
131                  * should be overridden separately in code with
132                  * appropriate values
133                  */
134                 .vmdq_rx_conf = {
135                         .nb_queue_pools = ETH_8_POOLS,
136                         .enable_default_pool = 0,
137                         .default_pool = 0,
138                         .nb_pool_maps = 0,
139                         .pool_map = {{0, 0},},
140                 },
141         },
142 };
143
144
145 static unsigned lcore_ids[RTE_MAX_LCORE];
146 static uint16_t ports[RTE_MAX_ETHPORTS];
147 static unsigned num_ports = 0; /**< The number of ports specified in command line */
148 static uint16_t num_pf_queues, num_vmdq_queues;
149 static uint16_t vmdq_pool_base, vmdq_queue_base;
150 static uint16_t queues_per_pool;
151
152 const uint16_t vlan_tags[] = {
153         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
154         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
155         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
156         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
157         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
158         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
159         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
160         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
161 };
162
163 /* ethernet addresses of ports */
164 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
165
166 static struct vhost_dev_tailq_list vhost_dev_list =
167         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
168
169 static struct lcore_info lcore_info[RTE_MAX_LCORE];
170
171 /* Used for queueing bursts of TX packets. */
172 struct mbuf_table {
173         unsigned len;
174         unsigned txq_id;
175         struct rte_mbuf *m_table[MAX_PKT_BURST];
176 };
177
178 /* TX queue for each data core. */
179 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
180
181 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
182                                  / US_PER_S * BURST_TX_DRAIN_US)
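/*
 * Illustrative arithmetic, assuming a 2 GHz TSC (the frequency here is a
 * made-up value for this sketch, not something this file measures):
 *
 *   MBUF_TABLE_DRAIN_TSC = (2000000000 + 1000000 - 1) / 1000000 * 100
 *                        = 2000 * 100 = 200000 cycles
 *
 * i.e. drain_mbuf_table() flushes a partially filled TX burst roughly
 * every BURST_TX_DRAIN_US (100) microseconds.
 */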
183 #define VLAN_HLEN       4
184
185 /*
186  * Builds up the correct configuration for VMDQ VLAN pool map
187  * according to the pool & queue limits.
188  */
189 static inline int
190 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
191 {
192         struct rte_eth_vmdq_rx_conf conf;
193         struct rte_eth_vmdq_rx_conf *def_conf =
194                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
195         unsigned i;
196
197         memset(&conf, 0, sizeof(conf));
198         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
199         conf.nb_pool_maps = num_devices;
200         conf.enable_loop_back = def_conf->enable_loop_back;
201         conf.rx_mode = def_conf->rx_mode;
202
203         for (i = 0; i < conf.nb_pool_maps; i++) {
204                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
205                 conf.pool_map[i].pools = (1UL << i);
206         }
207
208         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
209         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
210                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
211         return 0;
212 }
213
214 /*
215  * Initialises a given port using global settings, with the RX buffers
216  * coming from the mbuf_pool passed as a parameter.
217  */
218 static inline int
219 port_init(uint16_t port)
220 {
221         struct rte_eth_dev_info dev_info;
222         struct rte_eth_conf port_conf;
223         struct rte_eth_rxconf *rxconf;
224         struct rte_eth_txconf *txconf;
225         int16_t rx_rings, tx_rings;
226         uint16_t rx_ring_size, tx_ring_size;
227         int retval;
228         uint16_t q;
229
230         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
231         retval = rte_eth_dev_info_get(port, &dev_info);
232         if (retval != 0) {
233                 RTE_LOG(ERR, VHOST_PORT,
234                         "Error during getting device (port %u) info: %s\n",
235                         port, strerror(-retval));
236
237                 return retval;
238         }
239
240         rxconf = &dev_info.default_rxconf;
241         txconf = &dev_info.default_txconf;
242         rxconf->rx_drop_en = 1;
243
244         /* configure the number of supported virtio devices based on VMDQ limits */
245         num_devices = dev_info.max_vmdq_pools;
246
247         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
248         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
249
250         /*
251          * When dequeue zero copy is enabled, guest Tx used vring will be
252          * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
253          * (tx_ring_size here) must be small enough so that the driver will
254          * hit the free threshold easily and free mbufs promptly. Otherwise,
255          * guest Tx vring would be starved.
256          */
257         if (dequeue_zero_copy)
258                 tx_ring_size = 64;
259
260         tx_rings = (uint16_t)rte_lcore_count();
261
262         /* Get port configuration. */
263         retval = get_eth_conf(&port_conf, num_devices);
264         if (retval < 0)
265                 return retval;
266         /* NIC queues are divided into pf queues and vmdq queues.  */
267         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
268         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
269         num_vmdq_queues = num_devices * queues_per_pool;
270         num_queues = num_pf_queues + num_vmdq_queues;
271         vmdq_queue_base = dev_info.vmdq_queue_base;
272         vmdq_pool_base  = dev_info.vmdq_pool_base;
273         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
274                 num_pf_queues, num_devices, queues_per_pool);
275
276         if (!rte_eth_dev_is_valid_port(port))
277                 return -1;
278
279         rx_rings = (uint16_t)dev_info.max_rx_queues;
280         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
281                 port_conf.txmode.offloads |=
282                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
283         /* Configure ethernet device. */
284         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
285         if (retval != 0) {
286                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
287                         port, strerror(-retval));
288                 return retval;
289         }
290
291         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
292                 &tx_ring_size);
293         if (retval != 0) {
294                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
295                         "for port %u: %s.\n", port, strerror(-retval));
296                 return retval;
297         }
298         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
299                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
300                         "for Rx queues on port %u.\n", port);
301                 return -1;
302         }
303
304         /* Setup the queues. */
305         rxconf->offloads = port_conf.rxmode.offloads;
306         for (q = 0; q < rx_rings; q ++) {
307                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
308                                                 rte_eth_dev_socket_id(port),
309                                                 rxconf,
310                                                 mbuf_pool);
311                 if (retval < 0) {
312                         RTE_LOG(ERR, VHOST_PORT,
313                                 "Failed to setup rx queue %u of port %u: %s.\n",
314                                 q, port, strerror(-retval));
315                         return retval;
316                 }
317         }
318         txconf->offloads = port_conf.txmode.offloads;
319         for (q = 0; q < tx_rings; q ++) {
320                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
321                                                 rte_eth_dev_socket_id(port),
322                                                 txconf);
323                 if (retval < 0) {
324                         RTE_LOG(ERR, VHOST_PORT,
325                                 "Failed to setup tx queue %u of port %u: %s.\n",
326                                 q, port, strerror(-retval));
327                         return retval;
328                 }
329         }
330
331         /* Start the device. */
332         retval  = rte_eth_dev_start(port);
333         if (retval < 0) {
334                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
335                         port, strerror(-retval));
336                 return retval;
337         }
338
339         if (promiscuous)
340                 rte_eth_promiscuous_enable(port);
341
342         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
343         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
344         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
345                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
346                         port,
347                         vmdq_ports_eth_addr[port].addr_bytes[0],
348                         vmdq_ports_eth_addr[port].addr_bytes[1],
349                         vmdq_ports_eth_addr[port].addr_bytes[2],
350                         vmdq_ports_eth_addr[port].addr_bytes[3],
351                         vmdq_ports_eth_addr[port].addr_bytes[4],
352                         vmdq_ports_eth_addr[port].addr_bytes[5]);
353
354         return 0;
355 }
356
357 /*
358  * Set socket file path.
359  */
360 static int
361 us_vhost_parse_socket_path(const char *q_arg)
362 {
363         char *old;
364
365         /* reject paths that are too long */
366         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
367                 return -1;
368
369         old = socket_files;
370         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
371         if (socket_files == NULL) {
372                 free(old);
373                 return -1;
374         }
375
376         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
377         nb_sockets++;
378
379         return 0;
380 }
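/*
 * Buffer layout sketch (documenting the scheme above, not new behaviour):
 * socket_files is one flat allocation of nb_sockets fixed-size slots, so
 * the i-th path lives at (socket_files + i * PATH_MAX):
 *
 *   [ path0 '\0' padding ][ path1 '\0' padding ] ...
 *   |<----- PATH_MAX ---->|<----- PATH_MAX ---->|
 *
 * unregister_drivers() and main() index into it the same way.
 */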
381
382 /*
383  * Parse the portmask provided at run time.
384  */
385 static int
386 parse_portmask(const char *portmask)
387 {
388         char *end = NULL;
389         unsigned long pm;
390
391         errno = 0;
392
393         /* parse hexadecimal string */
394         pm = strtoul(portmask, &end, 16);
395         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
396                 return -1;
397
398         if (pm == 0)
399                 return -1;
400
401         return pm;
402
403 }
404
405 /*
406  * Parse num options at run time.
407  */
408 static int
409 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
410 {
411         char *end = NULL;
412         unsigned long num;
413
414         errno = 0;
415
416         /* parse unsigned int string */
417         num = strtoul(q_arg, &end, 10);
418         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
419                 return -1;
420
421         if (num > max_valid_value)
422                 return -1;
423
424         return num;
425
426 }
427
428 /*
429  * Display usage
430  */
431 static void
432 us_vhost_usage(const char *prgname)
433 {
434         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
435         "               --vm2vm [0|1|2]\n"
436         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
437         "               --socket-file <path>\n"
438         "               --nb-devices ND\n"
439         "               -p PORTMASK: Set mask for ports to be used by application\n"
440         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
441         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
442         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on RX are enabled\n"
443         "               --rx-retry-num [0-N]: the number of retries on RX. Takes effect only if retries on RX are enabled\n"
444         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
445         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
446         "               --socket-file: The path of the socket file.\n"
447         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
448         "               --tso [0|1] disable/enable TCP segmentation offload.\n"
449         "               --client register a vhost-user socket as client mode.\n"
450         "               --dequeue-zero-copy enables dequeue zero copy\n",
451                prgname);
452 }
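/*
 * A hypothetical invocation, for reference only (the binary name, core
 * list and socket path below are assumptions, not taken from this file):
 *
 *   ./vhost-switch -l 0-3 -n 4 -- -p 0x1 \
 *           --socket-file /tmp/sock0 --mergeable 1 --stats 2
 *
 * EAL options come first, then "--", then the options listed above.
 */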
453
454 /*
455  * Parse the arguments given in the command line of the application.
456  */
457 static int
458 us_vhost_parse_args(int argc, char **argv)
459 {
460         int opt, ret;
461         int option_index;
462         unsigned i;
463         const char *prgname = argv[0];
464         static struct option long_option[] = {
465                 {"vm2vm", required_argument, NULL, 0},
466                 {"rx-retry", required_argument, NULL, 0},
467                 {"rx-retry-delay", required_argument, NULL, 0},
468                 {"rx-retry-num", required_argument, NULL, 0},
469                 {"mergeable", required_argument, NULL, 0},
470                 {"stats", required_argument, NULL, 0},
471                 {"socket-file", required_argument, NULL, 0},
472                 {"tx-csum", required_argument, NULL, 0},
473                 {"tso", required_argument, NULL, 0},
474                 {"client", no_argument, &client_mode, 1},
475                 {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
476                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
477                 {NULL, 0, 0, 0},
478         };
479
480         /* Parse command line */
481         while ((opt = getopt_long(argc, argv, "p:P",
482                         long_option, &option_index)) != EOF) {
483                 switch (opt) {
484                 /* Portmask */
485                 case 'p':
486                         enabled_port_mask = parse_portmask(optarg);
487                         if (enabled_port_mask == 0) {
488                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
489                                 us_vhost_usage(prgname);
490                                 return -1;
491                         }
492                         break;
493
494                 case 'P':
495                         promiscuous = 1;
496                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
497                                 ETH_VMDQ_ACCEPT_BROADCAST |
498                                 ETH_VMDQ_ACCEPT_MULTICAST;
499
500                         break;
501
502                 case 0:
503                         /* Enable/disable vm2vm comms. */
504                         if (!strncmp(long_option[option_index].name, "vm2vm",
505                                 MAX_LONG_OPT_SZ)) {
506                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
507                                 if (ret == -1) {
508                                         RTE_LOG(INFO, VHOST_CONFIG,
509                                                 "Invalid argument for "
510                                                 "vm2vm [0|1|2]\n");
511                                         us_vhost_usage(prgname);
512                                         return -1;
513                                 } else {
514                                         vm2vm_mode = (vm2vm_type)ret;
515                                 }
516                         }
517
518                         /* Enable/disable retries on RX. */
519                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
520                                 ret = parse_num_opt(optarg, 1);
521                                 if (ret == -1) {
522                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
523                                         us_vhost_usage(prgname);
524                                         return -1;
525                                 } else {
526                                         enable_retry = ret;
527                                 }
528                         }
529
530                         /* Enable/disable TX checksum offload. */
531                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
532                                 ret = parse_num_opt(optarg, 1);
533                                 if (ret == -1) {
534                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
535                                         us_vhost_usage(prgname);
536                                         return -1;
537                                 } else
538                                         enable_tx_csum = ret;
539                         }
540
541                         /* Enable/disable TSO offload. */
542                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
543                                 ret = parse_num_opt(optarg, 1);
544                                 if (ret == -1) {
545                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
546                                         us_vhost_usage(prgname);
547                                         return -1;
548                                 } else
549                                         enable_tso = ret;
550                         }
551
552                         /* Specify the retry delay time (in microseconds) on RX. */
553                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
554                                 ret = parse_num_opt(optarg, INT32_MAX);
555                                 if (ret == -1) {
556                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
557                                         us_vhost_usage(prgname);
558                                         return -1;
559                                 } else {
560                                         burst_rx_delay_time = ret;
561                                 }
562                         }
563
564                         /* Specify the retries number on RX. */
565                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
566                                 ret = parse_num_opt(optarg, INT32_MAX);
567                                 if (ret == -1) {
568                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
569                                         us_vhost_usage(prgname);
570                                         return -1;
571                                 } else {
572                                         burst_rx_retry_num = ret;
573                                 }
574                         }
575
576                         /* Enable/disable RX mergeable buffers. */
577                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
578                                 ret = parse_num_opt(optarg, 1);
579                                 if (ret == -1) {
580                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
581                                         us_vhost_usage(prgname);
582                                         return -1;
583                                 } else {
584                                         mergeable = !!ret;
585                                         if (ret) {
586                                                 vmdq_conf_default.rxmode.offloads |=
587                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
588                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
589                                                         = JUMBO_FRAME_MAX_SIZE;
590                                         }
591                                 }
592                         }
593
594                         /* Enable/disable stats. */
595                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
596                                 ret = parse_num_opt(optarg, INT32_MAX);
597                                 if (ret == -1) {
598                                         RTE_LOG(INFO, VHOST_CONFIG,
599                                                 "Invalid argument for stats [0..N]\n");
600                                         us_vhost_usage(prgname);
601                                         return -1;
602                                 } else {
603                                         enable_stats = ret;
604                                 }
605                         }
606
607                         /* Set socket file path. */
608                         if (!strncmp(long_option[option_index].name,
609                                                 "socket-file", MAX_LONG_OPT_SZ)) {
610                                 if (us_vhost_parse_socket_path(optarg) == -1) {
611                                         RTE_LOG(INFO, VHOST_CONFIG,
612                                         "Invalid argument for socket name (Max %d characters)\n",
613                                         PATH_MAX);
614                                         us_vhost_usage(prgname);
615                                         return -1;
616                                 }
617                         }
618
619                         break;
620
621                         /* Invalid option - print options. */
622                 default:
623                         us_vhost_usage(prgname);
624                         return -1;
625                 }
626         }
627
628         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
629                 if (enabled_port_mask & (1 << i))
630                         ports[num_ports++] = i;
631         }
632
633         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
634                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
635                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
636                 return -1;
637         }
638
639         return 0;
640 }
641
642 /*
643  * Update the global variable NUM_PORTS and the PORTS array according to the
644  * number of ports on the system, and return the number of valid ports.
645  */
646 static unsigned check_ports_num(unsigned nb_ports)
647 {
648         unsigned valid_num_ports = num_ports;
649         unsigned portid;
650
651         if (num_ports > nb_ports) {
652                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
653                         num_ports, nb_ports);
654                 num_ports = nb_ports;
655         }
656
657         for (portid = 0; portid < num_ports; portid ++) {
658                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
659                         RTE_LOG(INFO, VHOST_PORT,
660                                 "\nSpecified port ID(%u) is not valid\n",
661                                 ports[portid]);
662                         ports[portid] = INVALID_PORT_ID;
663                         valid_num_ports--;
664                 }
665         }
666         return valid_num_ports;
667 }
668
669 static __rte_always_inline struct vhost_dev *
670 find_vhost_dev(struct rte_ether_addr *mac)
671 {
672         struct vhost_dev *vdev;
673
674         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
675                 if (vdev->ready == DEVICE_RX &&
676                     rte_is_same_ether_addr(mac, &vdev->mac_address))
677                         return vdev;
678         }
679
680         return NULL;
681 }
682
683 /*
684  * This function learns the MAC address of the device and registers it, along
685  * with a VLAN tag, in a VMDQ pool.
686  */
687 static int
688 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
689 {
690         struct rte_ether_hdr *pkt_hdr;
691         int i, ret;
692
693         /* Learn MAC address of guest device from packet */
694         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
695
696         if (find_vhost_dev(&pkt_hdr->s_addr)) {
697                 RTE_LOG(ERR, VHOST_DATA,
698                         "(%d) device is using a registered MAC!\n",
699                         vdev->vid);
700                 return -1;
701         }
702
703         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
704                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
705
706         /* vlan_tag currently uses the device_id. */
707         vdev->vlan_tag = vlan_tags[vdev->vid];
708
709         /* Print out VMDQ registration info. */
710         RTE_LOG(INFO, VHOST_DATA,
711                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
712                 vdev->vid,
713                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
714                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
715                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
716                 vdev->vlan_tag);
717
718         /* Register the MAC address. */
719         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
720                                 (uint32_t)vdev->vid + vmdq_pool_base);
721         if (ret)
722                 RTE_LOG(ERR, VHOST_DATA,
723                         "(%d) failed to add device MAC address to VMDQ\n",
724                         vdev->vid);
725
726         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
727
728         /* Set device as ready for RX. */
729         vdev->ready = DEVICE_RX;
730
731         return 0;
732 }
733
734 /*
735  * Removes the MAC address and VLAN tag from the VMDQ. Ensures that nothing is adding buffers to the RX
736  * queue before disabling RX on the device.
737  */
738 static inline void
739 unlink_vmdq(struct vhost_dev *vdev)
740 {
741         unsigned i = 0;
742         unsigned rx_count;
743         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
744
745         if (vdev->ready == DEVICE_RX) {
746                 /* clear MAC and VLAN settings */
747                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
748                 for (i = 0; i < 6; i++)
749                         vdev->mac_address.addr_bytes[i] = 0;
750
751                 vdev->vlan_tag = 0;
752
753                 /* Clear out the receive buffers */
754                 rx_count = rte_eth_rx_burst(ports[0],
755                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
756
757                 while (rx_count) {
758                         for (i = 0; i < rx_count; i++)
759                                 rte_pktmbuf_free(pkts_burst[i]);
760
761                         rx_count = rte_eth_rx_burst(ports[0],
762                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
763                 }
764
765                 vdev->ready = DEVICE_MAC_LEARNING;
766         }
767 }
768
769 static __rte_always_inline void
770 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
771             struct rte_mbuf *m)
772 {
773         uint16_t ret;
774
775         if (builtin_net_driver) {
776                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
777         } else {
778                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
779         }
780
781         if (enable_stats) {
782                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
783                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
784                 src_vdev->stats.tx_total++;
785                 src_vdev->stats.tx += ret;
786         }
787 }
788
789 /*
790  * Check if the packet destination MAC address is for a local device. If so then put
791  * the packet on that device's RX queue. If not then return.
792  */
793 static __rte_always_inline int
794 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
795 {
796         struct rte_ether_hdr *pkt_hdr;
797         struct vhost_dev *dst_vdev;
798
799         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
800
801         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
802         if (!dst_vdev)
803                 return -1;
804
805         if (vdev->vid == dst_vdev->vid) {
806                 RTE_LOG_DP(DEBUG, VHOST_DATA,
807                         "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
808                         vdev->vid);
809                 return 0;
810         }
811
812         RTE_LOG_DP(DEBUG, VHOST_DATA,
813                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
814
815         if (unlikely(dst_vdev->remove)) {
816                 RTE_LOG_DP(DEBUG, VHOST_DATA,
817                         "(%d) device is marked for removal\n", dst_vdev->vid);
818                 return 0;
819         }
820
821         virtio_xmit(dst_vdev, vdev, m);
822         return 0;
823 }
824
825 /*
826  * Check if the destination MAC of a packet belongs to a local VM, and if
827  * so, get its VLAN tag and the length offset.
828  */
829 static __rte_always_inline int
830 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
831         uint32_t *offset, uint16_t *vlan_tag)
832 {
833         struct vhost_dev *dst_vdev;
834         struct rte_ether_hdr *pkt_hdr =
835                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
836
837         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
838         if (!dst_vdev)
839                 return 0;
840
841         if (vdev->vid == dst_vdev->vid) {
842                 RTE_LOG_DP(DEBUG, VHOST_DATA,
843                         "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
844                         vdev->vid);
845                 return -1;
846         }
847
848         /*
849          * HW VLAN strip reduces the packet length by the length
850          * of the VLAN tag, so we need to restore the packet
851          * length by adding it back.
852          */
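        /*
         * Worked example (frame sizes are illustrative): a 68-byte tagged
         * frame arrives as 64 bytes after HW VLAN strip; virtio_tx_route()
         * later grows data_len/pkt_len by offset = VLAN_HLEN (4) so the
         * re-inserted tag is accounted for.
         */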
853         *offset  = VLAN_HLEN;
854         *vlan_tag = vlan_tags[vdev->vid];
855
856         RTE_LOG_DP(DEBUG, VHOST_DATA,
857                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
858                 vdev->vid, dst_vdev->vid, *vlan_tag);
859
860         return 0;
861 }
862
863 static uint16_t
864 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
865 {
866         if (ol_flags & PKT_TX_IPV4)
867                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
868         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
869                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
870 }
871
872 static void virtio_tx_offload(struct rte_mbuf *m)
873 {
874         void *l3_hdr;
875         struct rte_ipv4_hdr *ipv4_hdr = NULL;
876         struct rte_tcp_hdr *tcp_hdr = NULL;
877         struct rte_ether_hdr *eth_hdr =
878                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
879
880         l3_hdr = (char *)eth_hdr + m->l2_len;
881
882         if (m->ol_flags & PKT_TX_IPV4) {
883                 ipv4_hdr = l3_hdr;
884                 ipv4_hdr->hdr_checksum = 0;
885                 m->ol_flags |= PKT_TX_IP_CKSUM;
886         }
887
888         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
889         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
890 }
891
892 static inline void
893 free_pkts(struct rte_mbuf **pkts, uint16_t n)
894 {
895         while (n--)
896                 rte_pktmbuf_free(pkts[n]);
897 }
898
899 static __rte_always_inline void
900 do_drain_mbuf_table(struct mbuf_table *tx_q)
901 {
902         uint16_t count;
903
904         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
905                                  tx_q->m_table, tx_q->len);
906         if (unlikely(count < tx_q->len))
907                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
908
909         tx_q->len = 0;
910 }
911
912 /*
913  * This function routes the TX packet to the correct interface. This
914  * may be a local device or the physical port.
915  */
916 static __rte_always_inline void
917 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
918 {
919         struct mbuf_table *tx_q;
920         unsigned offset = 0;
921         const uint16_t lcore_id = rte_lcore_id();
922         struct rte_ether_hdr *nh;
923
924
925         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
926         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
927                 struct vhost_dev *vdev2;
928
929                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
930                         if (vdev2 != vdev)
931                                 virtio_xmit(vdev2, vdev, m);
932                 }
933                 goto queue2nic;
934         }
935
936         /* check if destination is a local VM */
937         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
938                 rte_pktmbuf_free(m);
939                 return;
940         }
941
942         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
943                 if (unlikely(find_local_dest(vdev, m, &offset,
944                                              &vlan_tag) != 0)) {
945                         rte_pktmbuf_free(m);
946                         return;
947                 }
948         }
949
950         RTE_LOG_DP(DEBUG, VHOST_DATA,
951                 "(%d) TX: MAC address is external\n", vdev->vid);
952
953 queue2nic:
954
955         /* Add packet to the port tx queue */
956         tx_q = &lcore_tx_queue[lcore_id];
957
958         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
959         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
960                 /* Guest has inserted the vlan tag. */
961                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
962                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
963                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
964                         (vh->vlan_tci != vlan_tag_be))
965                         vh->vlan_tci = vlan_tag_be;
966         } else {
967                 m->ol_flags |= PKT_TX_VLAN_PKT;
968
969                 /*
970                  * Find the right seg to adjust the data len when offset is
971                  * bigger than tail room size.
972                  */
973                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
974                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
975                                 m->data_len += offset;
976                         else {
977                                 struct rte_mbuf *seg = m;
978
979                                 while ((seg->next != NULL) &&
980                                         (offset > rte_pktmbuf_tailroom(seg)))
981                                         seg = seg->next;
982
983                                 seg->data_len += offset;
984                         }
985                         m->pkt_len += offset;
986                 }
987
988                 m->vlan_tci = vlan_tag;
989         }
990
991         if (m->ol_flags & PKT_TX_TCP_SEG)
992                 virtio_tx_offload(m);
993
994         tx_q->m_table[tx_q->len++] = m;
995         if (enable_stats) {
996                 vdev->stats.tx_total++;
997                 vdev->stats.tx++;
998         }
999
1000         if (unlikely(tx_q->len == MAX_PKT_BURST))
1001                 do_drain_mbuf_table(tx_q);
1002 }
1003
1004
1005 static __rte_always_inline void
1006 drain_mbuf_table(struct mbuf_table *tx_q)
1007 {
1008         static uint64_t prev_tsc;
1009         uint64_t cur_tsc;
1010
1011         if (tx_q->len == 0)
1012                 return;
1013
1014         cur_tsc = rte_rdtsc();
1015         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1016                 prev_tsc = cur_tsc;
1017
1018                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1019                         "TX queue drained after timeout with burst size %u\n",
1020                         tx_q->len);
1021                 do_drain_mbuf_table(tx_q);
1022         }
1023 }
1024
1025 static __rte_always_inline void
1026 drain_eth_rx(struct vhost_dev *vdev)
1027 {
1028         uint16_t rx_count, enqueue_count;
1029         struct rte_mbuf *pkts[MAX_PKT_BURST];
1030
1031         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1032                                     pkts, MAX_PKT_BURST);
1033         if (!rx_count)
1034                 return;
1035
1036         /*
1037          * When "enable_retry" is set, wait and retry when there are
1038          * not enough free slots in the queue to hold @rx_count packets,
1039          * to diminish packet loss.
1040          */
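        /*
         * With the defaults above, a back-of-the-envelope bound (not a
         * guarantee): at most BURST_RX_RETRIES (4) waits of
         * BURST_RX_WAIT_US (15) us each, i.e. up to ~60 us of busy
         * waiting per burst before we enqueue whatever fits anyway.
         */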
1041         if (enable_retry &&
1042             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1043                         VIRTIO_RXQ))) {
1044                 uint32_t retry;
1045
1046                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1047                         rte_delay_us(burst_rx_delay_time);
1048                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1049                                         VIRTIO_RXQ))
1050                                 break;
1051                 }
1052         }
1053
1054         if (builtin_net_driver) {
1055                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1056                                                 pkts, rx_count);
1057         } else {
1058                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1059                                                 pkts, rx_count);
1060         }
1061         if (enable_stats) {
1062                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1063                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1064         }
1065
1066         free_pkts(pkts, rx_count);
1067 }
1068
1069 static __rte_always_inline void
1070 drain_virtio_tx(struct vhost_dev *vdev)
1071 {
1072         struct rte_mbuf *pkts[MAX_PKT_BURST];
1073         uint16_t count;
1074         uint16_t i;
1075
1076         if (builtin_net_driver) {
1077                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1078                                         pkts, MAX_PKT_BURST);
1079         } else {
1080                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1081                                         mbuf_pool, pkts, MAX_PKT_BURST);
1082         }
1083
1084         /* setup VMDq for the first packet */
1085         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1086                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1087                         free_pkts(pkts, count);
1088         }
1089
1090         for (i = 0; i < count; ++i)
1091                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1092 }
1093
1094 /*
1095  * Main function of vhost-switch. It basically does:
1096  *
1097  * for each vhost device {
1098  *    - drain_eth_rx()
1099  *
1100  *      Which drains the host eth Rx queue linked to the vhost device,
1101  *      and delivers the packets to the guest virtio Rx ring associated
1102  *      with this vhost device.
1103  *
1104  *    - drain_virtio_tx()
1105  *
1106  *      Which drains the guest virtio Tx queue and delivers the packets
1107  *      to the target, which could be another vhost device or the
1108  *      physical eth dev. The routing is done in "virtio_tx_route".
1109  * }
1110  */
1111 static int
1112 switch_worker(void *arg __rte_unused)
1113 {
1114         unsigned i;
1115         unsigned lcore_id = rte_lcore_id();
1116         struct vhost_dev *vdev;
1117         struct mbuf_table *tx_q;
1118
1119         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1120
1121         tx_q = &lcore_tx_queue[lcore_id];
1122         for (i = 0; i < rte_lcore_count(); i++) {
1123                 if (lcore_ids[i] == lcore_id) {
1124                         tx_q->txq_id = i;
1125                         break;
1126                 }
1127         }
1128
1129         while(1) {
1130                 drain_mbuf_table(tx_q);
1131
1132                 /*
1133                  * Inform the configuration core that we have exited the
1134                  * linked list and that no devices are in use if requested.
1135                  */
1136                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1137                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1138
1139                 /*
1140                  * Process vhost devices
1141                  */
1142                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1143                               lcore_vdev_entry) {
1144                         if (unlikely(vdev->remove)) {
1145                                 unlink_vmdq(vdev);
1146                                 vdev->ready = DEVICE_SAFE_REMOVE;
1147                                 continue;
1148                         }
1149
1150                         if (likely(vdev->ready == DEVICE_RX))
1151                                 drain_eth_rx(vdev);
1152
1153                         if (likely(!vdev->remove))
1154                                 drain_virtio_tx(vdev);
1155                 }
1156         }
1157
1158         return 0;
1159 }
1160
1161 /*
1162  * Remove a device from the specific data core linked list and from the
1163  * main linked list. Synchronization occurs through the use of the
1164  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1165  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1166  */
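/*
 * Removal handshake, sketched from the code below and switch_worker()
 * (a summary of existing behaviour, not additional protocol):
 *
 *   destroy_device()                 switch_worker() on each lcore
 *   ----------------                 -----------------------------
 *   vdev->remove = 1                 sees remove, unlink_vmdq(),
 *   wait ready == SAFE_REMOVE   <--  ready = DEVICE_SAFE_REMOVE
 *   flag = REQUEST_DEV_REMOVAL  -->  flag = ACK_DEV_REMOVAL
 *   wait flag == ACK_DEV_REMOVAL
 */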
1167 static void
1168 destroy_device(int vid)
1169 {
1170         struct vhost_dev *vdev = NULL;
1171         int lcore;
1172
1173         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1174                 if (vdev->vid == vid)
1175                         break;
1176         }
1177         if (!vdev)
1178                 return;
1179         /* set the remove flag. */
1180         vdev->remove = 1;
1181         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1182                 rte_pause();
1183         }
1184
1185         if (builtin_net_driver)
1186                 vs_vhost_net_remove(vdev);
1187
1188         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1189                      lcore_vdev_entry);
1190         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1191
1192
1193         /* Set the dev_removal_flag on each lcore. */
1194         RTE_LCORE_FOREACH_SLAVE(lcore)
1195                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1196
1197         /*
1198          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1199          * we can be sure that they can no longer access the device removed
1200          * from the linked lists and that the devices are no longer in use.
1201          */
1202         RTE_LCORE_FOREACH_SLAVE(lcore) {
1203                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1204                         rte_pause();
1205         }
1206
1207         lcore_info[vdev->coreid].device_num--;
1208
1209         RTE_LOG(INFO, VHOST_DATA,
1210                 "(%d) device has been removed from data core\n",
1211                 vdev->vid);
1212
1213         rte_free(vdev);
1214 }
1215
1216 /*
1217  * A new device is added to a data core. First the device is added to the main linked list
1218  * and then allocated to a specific data core.
1219  */
1220 static int
1221 new_device(int vid)
1222 {
1223         int lcore, core_add = 0;
1224         uint32_t device_num_min = num_devices;
1225         struct vhost_dev *vdev;
1226
1227         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1228         if (vdev == NULL) {
1229                 RTE_LOG(INFO, VHOST_DATA,
1230                         "(%d) couldn't allocate memory for vhost dev\n",
1231                         vid);
1232                 return -1;
1233         }
1234         vdev->vid = vid;
1235
1236         if (builtin_net_driver)
1237                 vs_vhost_net_setup(vdev);
1238
1239         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1240         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1241
1242         /* reset ready flag */
1243         vdev->ready = DEVICE_MAC_LEARNING;
1244         vdev->remove = 0;
1245
1246         /* Find a suitable lcore to add the device. */
1247         RTE_LCORE_FOREACH_SLAVE(lcore) {
1248                 if (lcore_info[lcore].device_num < device_num_min) {
1249                         device_num_min = lcore_info[lcore].device_num;
1250                         core_add = lcore;
1251                 }
1252         }
1253         vdev->coreid = core_add;
1254
1255         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1256                           lcore_vdev_entry);
1257         lcore_info[vdev->coreid].device_num++;
1258
1259         /* Disable notifications. */
1260         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1261         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1262
1263         RTE_LOG(INFO, VHOST_DATA,
1264                 "(%d) device has been added to data core %d\n",
1265                 vid, vdev->coreid);
1266
1267         return 0;
1268 }
1269
1270 /*
1271  * These callbacks allow devices to be added to and removed from the data core
1272  * once configuration has fully completed.
1273  */
1274 static const struct vhost_device_ops virtio_net_device_ops =
1275 {
1276         .new_device =  new_device,
1277         .destroy_device = destroy_device,
1278 };
1279
1280 /*
1281  * This thread wakes up periodically to print statistics if the user has
1282  * enabled them.
1283  */
1284 static void *
1285 print_stats(__rte_unused void *arg)
1286 {
1287         struct vhost_dev *vdev;
1288         uint64_t tx_dropped, rx_dropped;
1289         uint64_t tx, tx_total, rx, rx_total;
1290         const char clr[] = { 27, '[', '2', 'J', '\0' };
1291         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1292
1293         while(1) {
1294                 sleep(enable_stats);
1295
1296                 /* Clear screen and move to top left */
1297                 printf("%s%s\n", clr, top_left);
1298                 printf("Device statistics =================================\n");
1299
1300                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1301                         tx_total   = vdev->stats.tx_total;
1302                         tx         = vdev->stats.tx;
1303                         tx_dropped = tx_total - tx;
1304
1305                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1306                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1307                         rx_dropped = rx_total - rx;
1308
1309                         printf("Statistics for device %d\n"
1310                                 "-----------------------\n"
1311                                 "TX total:              %" PRIu64 "\n"
1312                                 "TX dropped:            %" PRIu64 "\n"
1313                                 "TX successful:         %" PRIu64 "\n"
1314                                 "RX total:              %" PRIu64 "\n"
1315                                 "RX dropped:            %" PRIu64 "\n"
1316                                 "RX successful:         %" PRIu64 "\n",
1317                                 vdev->vid,
1318                                 tx_total, tx_dropped, tx,
1319                                 rx_total, rx_dropped, rx);
1320                 }
1321
1322                 printf("===================================================\n");
1323         }
1324
1325         return NULL;
1326 }
1327
1328 static void
1329 unregister_drivers(int socket_num)
1330 {
1331         int i, ret;
1332
1333         for (i = 0; i < socket_num; i++) {
1334                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1335                 if (ret != 0)
1336                         RTE_LOG(ERR, VHOST_CONFIG,
1337                                 "Failed to unregister vhost driver for %s.\n",
1338                                 socket_files + i * PATH_MAX);
1339         }
1340 }
1341
1342 /* When we receive a SIGINT, unregister the vhost driver */
1343 static void
1344 sigint_handler(__rte_unused int signum)
1345 {
1346         /* Unregister vhost driver. */
1347         unregister_drivers(nb_sockets);
1348
1349         exit(0);
1350 }
1351
1352 /*
1353  * While creating an mbuf pool, one key thing is to figure out how
1354  * many mbuf entries are enough for our use. FYI, here are some
1355  * guidelines:
1356  *
1357  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1358  *
1359  * - For each switch core (a CPU core that does the packet switching), we
1360  *   also need to reserve some mbufs for receiving packets from the virtio
1361  *   Tx queue. How many are enough depends on the usage; it's normally
1362  *   a simple calculation like the following:
1363  *
1364  *       MAX_PKT_BURST * max packet size / mbuf size
1365  *
1366  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1367  *
1368  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1369  *   mbufs for receiving packets from the physical NIC device.
1370  *
1371  * - We also need to make sure, for each switch core, we have allocated
1372  *   enough mbufs to fill up the mbuf cache.
1373  */
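/*
 * Worked example with this file's defaults, assuming MAX_PKT_BURST is 32
 * and MBUF_DATA_SIZE is 2176 (2048 bytes of data room plus 128 bytes of
 * headroom); both values are assumptions of this sketch. For the
 * non-mergeable, non-TSO case (mtu = 1500):
 *
 *   nr_mbufs_per_core  = (1500 + 2176) * 32 / (2176 - 128) ~= 57
 *   nr_mbufs_per_core += 1024 (nr_rx_desc)                  = 1081
 *
 *   nr_mbufs = 128 * 1024 (nr_queues * nr_rx_desc)
 *            + 1081 * nr_switch_core, then multiplied by nr_port.
 */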
1374 static void
1375 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1376         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1377 {
1378         uint32_t nr_mbufs;
1379         uint32_t nr_mbufs_per_core;
1380         uint32_t mtu = 1500;
1381
1382         if (mergeable)
1383                 mtu = 9000;
1384         if (enable_tso)
1385                 mtu = 64 * 1024;
1386
1387         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1388                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1389         nr_mbufs_per_core += nr_rx_desc;
1390         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1391
1392         nr_mbufs  = nr_queues * nr_rx_desc;
1393         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1394         nr_mbufs *= nr_port;
1395
1396         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1397                                             nr_mbuf_cache, 0, mbuf_size,
1398                                             rte_socket_id());
1399         if (mbuf_pool == NULL)
1400                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1401 }
1402
1403 /*
1404  * Main function, does initialisation and calls the per-lcore functions.
1405  */
1406 int
1407 main(int argc, char *argv[])
1408 {
1409         unsigned lcore_id, core_id = 0;
1410         unsigned nb_ports, valid_num_ports;
1411         int ret, i;
1412         uint16_t portid;
1413         static pthread_t tid;
1414         uint64_t flags = 0;
1415
1416         signal(SIGINT, sigint_handler);
1417
1418         /* init EAL */
1419         ret = rte_eal_init(argc, argv);
1420         if (ret < 0)
1421                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1422         argc -= ret;
1423         argv += ret;
1424
1425         /* parse app arguments */
1426         ret = us_vhost_parse_args(argc, argv);
1427         if (ret < 0)
1428                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1429
1430         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1431                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1432
1433                 if (rte_lcore_is_enabled(lcore_id))
1434                         lcore_ids[core_id++] = lcore_id;
1435         }
1436
1437         if (rte_lcore_count() > RTE_MAX_LCORE)
1438                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1439
1440         /* Get the number of physical ports. */
1441         nb_ports = rte_eth_dev_count_avail();
1442
1443         /*
1444          * Update the global variable NUM_PORTS and the global PORTS array,
1445          * and compute VALID_NUM_PORTS from the number of ports on the system.
1446          */
1447         valid_num_ports = check_ports_num(nb_ports);
1448
1449         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1450                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1451                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1452                 return -1;
1453         }
1454
1455         /*
1456          * FIXME: here we are trying to allocate mbufs big enough for
1457          * @MAX_QUEUES, but the truth is we're never going to use that
1458          * many queues here. We probably should only do allocation for
1459          * those queues we are going to use.
1460          */
1461         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1462                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1463
1464         if (vm2vm_mode == VM2VM_HARDWARE) {
1465                 /* Enable VT loop back to let L2 switch to do it. */
1466                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1467                 RTE_LOG(DEBUG, VHOST_CONFIG,
1468                         "Enable loop back for L2 switch in vmdq.\n");
1469         }
1470
1471         /* initialize all ports */
1472         RTE_ETH_FOREACH_DEV(portid) {
1473                 /* skip ports that are not enabled */
1474                 if ((enabled_port_mask & (1 << portid)) == 0) {
1475                         RTE_LOG(INFO, VHOST_PORT,
1476                                 "Skipping disabled port %d\n", portid);
1477                         continue;
1478                 }
1479                 if (port_init(portid) != 0)
1480                         rte_exit(EXIT_FAILURE,
1481                                 "Cannot initialize network ports\n");
1482         }
1483
1484         /* Enable stats if the user option is set. */
1485         if (enable_stats) {
1486                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1487                                         print_stats, NULL);
1488                 if (ret < 0)
1489                         rte_exit(EXIT_FAILURE,
1490                                 "Cannot create print-stats thread\n");
1491         }
1492
1493         /* Launch all data cores. */
1494         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1495                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1496
1497         if (client_mode)
1498                 flags |= RTE_VHOST_USER_CLIENT;
1499
1500         if (dequeue_zero_copy)
1501                 flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1502
1503         /* Register vhost user driver to handle vhost messages. */
1504         for (i = 0; i < nb_sockets; i++) {
1505                 char *file = socket_files + i * PATH_MAX;
1506                 ret = rte_vhost_driver_register(file, flags);
1507                 if (ret != 0) {
1508                         unregister_drivers(i);
1509                         rte_exit(EXIT_FAILURE,
1510                                 "vhost driver register failure.\n");
1511                 }
1512
1513                 if (builtin_net_driver)
1514                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1515
1516                 if (mergeable == 0) {
1517                         rte_vhost_driver_disable_features(file,
1518                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1519                 }
1520
1521                 if (enable_tx_csum == 0) {
1522                         rte_vhost_driver_disable_features(file,
1523                                 1ULL << VIRTIO_NET_F_CSUM);
1524                 }
1525
1526                 if (enable_tso == 0) {
1527                         rte_vhost_driver_disable_features(file,
1528                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1529                         rte_vhost_driver_disable_features(file,
1530                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1531                         rte_vhost_driver_disable_features(file,
1532                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1533                         rte_vhost_driver_disable_features(file,
1534                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1535                 }
1536
1537                 if (promiscuous) {
1538                         rte_vhost_driver_enable_features(file,
1539                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1540                 }
1541
1542                 ret = rte_vhost_driver_callback_register(file,
1543                         &virtio_net_device_ops);
1544                 if (ret != 0) {
1545                         rte_exit(EXIT_FAILURE,
1546                                 "failed to register vhost driver callbacks.\n");
1547                 }
1548
1549                 if (rte_vhost_driver_start(file) < 0) {
1550                         rte_exit(EXIT_FAILURE,
1551                                 "failed to start vhost driver.\n");
1552                 }
1553         }
1554
1555         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1556                 rte_eal_wait_lcore(lcore_id);
1557
1558         return 0;
1559
1560 }