eal: rename lcore master and slave
[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66
67 /* Number of devices/queues to support. */
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76         VM2VM_DISABLED = 0,
77         VM2VM_SOFTWARE = 1,
78         VM2VM_HARDWARE = 2,
79         VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93
94 static int client_mode;
95
96 static int builtin_net_driver;
97
98 /* Specify the timeout (in microseconds) between retries on RX. */
99 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
100 /* Specify the number of retries on RX. */
101 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
102
103 /* Socket file paths. Can be set by user */
104 static char *socket_files;
105 static int nb_sockets;
106
107 /* Empty VMDQ configuration structure. Filled in programmatically. */
108 static struct rte_eth_conf vmdq_conf_default = {
109         .rxmode = {
110                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
111                 .split_hdr_size = 0,
112                 /*
113                  * VLAN strip is necessary for 1G NICs such as the I350;
114                  * it fixes a bug where IPv4 forwarding in the guest cannot
115                  * forward packets from one virtio dev to another virtio dev.
116                  */
117                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
118         },
119
120         .txmode = {
121                 .mq_mode = ETH_MQ_TX_NONE,
122                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
123                              DEV_TX_OFFLOAD_TCP_CKSUM |
124                              DEV_TX_OFFLOAD_VLAN_INSERT |
125                              DEV_TX_OFFLOAD_MULTI_SEGS |
126                              DEV_TX_OFFLOAD_TCP_TSO),
127         },
128         .rx_adv_conf = {
129                 /*
130                  * should be overridden separately in code with
131                  * appropriate values
132                  */
133                 .vmdq_rx_conf = {
134                         .nb_queue_pools = ETH_8_POOLS,
135                         .enable_default_pool = 0,
136                         .default_pool = 0,
137                         .nb_pool_maps = 0,
138                         .pool_map = {{0, 0},},
139                 },
140         },
141 };
142
143
144 static unsigned lcore_ids[RTE_MAX_LCORE];
145 static uint16_t ports[RTE_MAX_ETHPORTS];
146 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
147 static uint16_t num_pf_queues, num_vmdq_queues;
148 static uint16_t vmdq_pool_base, vmdq_queue_base;
149 static uint16_t queues_per_pool;
150
151 const uint16_t vlan_tags[] = {
152         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
153         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
154         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
155         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
156         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
157         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
158         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
159         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
160 };
161
162 /* ethernet addresses of ports */
163 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
164
165 static struct vhost_dev_tailq_list vhost_dev_list =
166         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
167
168 static struct lcore_info lcore_info[RTE_MAX_LCORE];
169
170 /* Used for queueing bursts of TX packets. */
171 struct mbuf_table {
172         unsigned len;
173         unsigned txq_id;
174         struct rte_mbuf *m_table[MAX_PKT_BURST];
175 };
176
177 /* TX queue for each data core. */
178 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
179
180 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
181                                  / US_PER_S * BURST_TX_DRAIN_US)
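/*
 * For illustration only: with a TSC running at 2.0 GHz (an assumed value,
 * not read from this file), MBUF_TABLE_DRAIN_TSC works out to roughly
 * 2,000,000,000 / 1,000,000 * 100 = 200,000 cycles per 100 us drain interval.
 */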
182 #define VLAN_HLEN       4
183
184 /*
185  * Builds up the correct configuration for VMDQ VLAN pool map
186  * according to the pool & queue limits.
187  */
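/*
 * For illustration (following the loop below, assuming num_devices = 8):
 * pool 0 maps VLAN 1000 with pool mask 0x01, pool 1 maps VLAN 1001 with
 * mask 0x02, ..., pool 7 maps VLAN 1007 with mask 0x80.
 */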
188 static inline int
189 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
190 {
191         struct rte_eth_vmdq_rx_conf conf;
192         struct rte_eth_vmdq_rx_conf *def_conf =
193                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
194         unsigned i;
195
196         memset(&conf, 0, sizeof(conf));
197         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
198         conf.nb_pool_maps = num_devices;
199         conf.enable_loop_back = def_conf->enable_loop_back;
200         conf.rx_mode = def_conf->rx_mode;
201
202         for (i = 0; i < conf.nb_pool_maps; i++) {
203                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
204                 conf.pool_map[i].pools = (1UL << i);
205         }
206
207         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
208         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
209                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
210         return 0;
211 }
212
213 /*
214  * Initialises a given port using global settings and with the RX buffers
215  * coming from the global mbuf_pool
216  */
217 static inline int
218 port_init(uint16_t port)
219 {
220         struct rte_eth_dev_info dev_info;
221         struct rte_eth_conf port_conf;
222         struct rte_eth_rxconf *rxconf;
223         struct rte_eth_txconf *txconf;
224         int16_t rx_rings, tx_rings;
225         uint16_t rx_ring_size, tx_ring_size;
226         int retval;
227         uint16_t q;
228
229         /* The max pool number from dev_info is used to validate the pool number specified on the command line */
230         retval = rte_eth_dev_info_get(port, &dev_info);
231         if (retval != 0) {
232                 RTE_LOG(ERR, VHOST_PORT,
233                         "Error during getting device (port %u) info: %s\n",
234                         port, strerror(-retval));
235
236                 return retval;
237         }
238
239         rxconf = &dev_info.default_rxconf;
240         txconf = &dev_info.default_txconf;
241         rxconf->rx_drop_en = 1;
242
243         /* Configure the number of supported virtio devices based on VMDQ limits */
244         num_devices = dev_info.max_vmdq_pools;
245
246         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
247         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
248
249         tx_rings = (uint16_t)rte_lcore_count();
250
251         /* Get port configuration. */
252         retval = get_eth_conf(&port_conf, num_devices);
253         if (retval < 0)
254                 return retval;
255         /* NIC queues are divided into pf queues and vmdq queues.  */
256         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
257         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
258         num_vmdq_queues = num_devices * queues_per_pool;
259         num_queues = num_pf_queues + num_vmdq_queues;
260         vmdq_queue_base = dev_info.vmdq_queue_base;
261         vmdq_pool_base  = dev_info.vmdq_pool_base;
262         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
263                 num_pf_queues, num_devices, queues_per_pool);
264
265         if (!rte_eth_dev_is_valid_port(port))
266                 return -1;
267
268         rx_rings = (uint16_t)dev_info.max_rx_queues;
269         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
270                 port_conf.txmode.offloads |=
271                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
272         /* Configure ethernet device. */
273         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
274         if (retval != 0) {
275                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
276                         port, strerror(-retval));
277                 return retval;
278         }
279
280         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
281                 &tx_ring_size);
282         if (retval != 0) {
283                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
284                         "for port %u: %s.\n", port, strerror(-retval));
285                 return retval;
286         }
287         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
288                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
289                         "for Rx queues on port %u.\n", port);
290                 return -1;
291         }
292
293         /* Setup the queues. */
294         rxconf->offloads = port_conf.rxmode.offloads;
295         for (q = 0; q < rx_rings; q ++) {
296                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
297                                                 rte_eth_dev_socket_id(port),
298                                                 rxconf,
299                                                 mbuf_pool);
300                 if (retval < 0) {
301                         RTE_LOG(ERR, VHOST_PORT,
302                                 "Failed to setup rx queue %u of port %u: %s.\n",
303                                 q, port, strerror(-retval));
304                         return retval;
305                 }
306         }
307         txconf->offloads = port_conf.txmode.offloads;
308         for (q = 0; q < tx_rings; q ++) {
309                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
310                                                 rte_eth_dev_socket_id(port),
311                                                 txconf);
312                 if (retval < 0) {
313                         RTE_LOG(ERR, VHOST_PORT,
314                                 "Failed to setup tx queue %u of port %u: %s.\n",
315                                 q, port, strerror(-retval));
316                         return retval;
317                 }
318         }
319
320         /* Start the device. */
321         retval  = rte_eth_dev_start(port);
322         if (retval < 0) {
323                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
324                         port, strerror(-retval));
325                 return retval;
326         }
327
328         if (promiscuous) {
329                 retval = rte_eth_promiscuous_enable(port);
330                 if (retval != 0) {
331                         RTE_LOG(ERR, VHOST_PORT,
332                                 "Failed to enable promiscuous mode on port %u: %s\n",
333                                 port, rte_strerror(-retval));
334                         return retval;
335                 }
336         }
337
338         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
339         if (retval < 0) {
340                 RTE_LOG(ERR, VHOST_PORT,
341                         "Failed to get MAC address on port %u: %s\n",
342                         port, rte_strerror(-retval));
343                 return retval;
344         }
345
346         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
347         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
348                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
349                         port,
350                         vmdq_ports_eth_addr[port].addr_bytes[0],
351                         vmdq_ports_eth_addr[port].addr_bytes[1],
352                         vmdq_ports_eth_addr[port].addr_bytes[2],
353                         vmdq_ports_eth_addr[port].addr_bytes[3],
354                         vmdq_ports_eth_addr[port].addr_bytes[4],
355                         vmdq_ports_eth_addr[port].addr_bytes[5]);
356
357         return 0;
358 }
359
360 /*
361  * Set socket file path.
362  */
363 static int
364 us_vhost_parse_socket_path(const char *q_arg)
365 {
366         char *old;
367
368         /* parse number string */
369         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
370                 return -1;
371
372         old = socket_files;
373         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
374         if (socket_files == NULL) {
375                 free(old);
376                 return -1;
377         }
378
379         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
380         nb_sockets++;
381
382         return 0;
383 }
384
385 /*
386  * Parse the portmask provided at run time.
387  */
388 static int
389 parse_portmask(const char *portmask)
390 {
391         char *end = NULL;
392         unsigned long pm;
393
394         errno = 0;
395
396         /* parse hexadecimal string */
397         pm = strtoul(portmask, &end, 16);
398         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
399                 return 0;
400
401         return pm;
402
403 }
404
405 /*
406  * Parse num options at run time.
407  */
408 static int
409 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
410 {
411         char *end = NULL;
412         unsigned long num;
413
414         errno = 0;
415
416         /* parse unsigned int string */
417         num = strtoul(q_arg, &end, 10);
418         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
419                 return -1;
420
421         if (num > max_valid_value)
422                 return -1;
423
424         return num;
425
426 }
427
428 /*
429  * Display usage
430  */
431 static void
432 us_vhost_usage(const char *prgname)
433 {
434         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
435         "               --vm2vm [0|1|2]\n"
436         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
437         "               --socket-file <path>\n"
438         "               --nb-devices ND\n"
439         "               -p PORTMASK: Set mask for ports to be used by application\n"
440         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
441         "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if the destination queue is full\n"
442         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
443         "               --rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
444         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
445         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
446         "               --socket-file: The path of the socket file.\n"
447         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
448         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
449         "               --client register a vhost-user socket in client mode.\n",
450                prgname);
451 }
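/*
 * An illustrative invocation (a sketch only: the binary name, EAL core list
 * and socket path are placeholders, not taken from this file):
 *
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *                --mergeable 1 --stats 2 --client
 */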
452
453 /*
454  * Parse the arguments given in the command line of the application.
455  */
456 static int
457 us_vhost_parse_args(int argc, char **argv)
458 {
459         int opt, ret;
460         int option_index;
461         unsigned i;
462         const char *prgname = argv[0];
463         static struct option long_option[] = {
464                 {"vm2vm", required_argument, NULL, 0},
465                 {"rx-retry", required_argument, NULL, 0},
466                 {"rx-retry-delay", required_argument, NULL, 0},
467                 {"rx-retry-num", required_argument, NULL, 0},
468                 {"mergeable", required_argument, NULL, 0},
469                 {"stats", required_argument, NULL, 0},
470                 {"socket-file", required_argument, NULL, 0},
471                 {"tx-csum", required_argument, NULL, 0},
472                 {"tso", required_argument, NULL, 0},
473                 {"client", no_argument, &client_mode, 1},
474                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
475                 {NULL, 0, 0, 0},
476         };
477
478         /* Parse command line */
479         while ((opt = getopt_long(argc, argv, "p:P",
480                         long_option, &option_index)) != EOF) {
481                 switch (opt) {
482                 /* Portmask */
483                 case 'p':
484                         enabled_port_mask = parse_portmask(optarg);
485                         if (enabled_port_mask == 0) {
486                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
487                                 us_vhost_usage(prgname);
488                                 return -1;
489                         }
490                         break;
491
492                 case 'P':
493                         promiscuous = 1;
494                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
495                                 ETH_VMDQ_ACCEPT_BROADCAST |
496                                 ETH_VMDQ_ACCEPT_MULTICAST;
497
498                         break;
499
500                 case 0:
501                         /* Enable/disable vm2vm comms. */
502                         if (!strncmp(long_option[option_index].name, "vm2vm",
503                                 MAX_LONG_OPT_SZ)) {
504                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
505                                 if (ret == -1) {
506                                         RTE_LOG(INFO, VHOST_CONFIG,
507                                                 "Invalid argument for "
508                                                 "vm2vm [0|1|2]\n");
509                                         us_vhost_usage(prgname);
510                                         return -1;
511                                 } else {
512                                         vm2vm_mode = (vm2vm_type)ret;
513                                 }
514                         }
515
516                         /* Enable/disable retries on RX. */
517                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
518                                 ret = parse_num_opt(optarg, 1);
519                                 if (ret == -1) {
520                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
521                                         us_vhost_usage(prgname);
522                                         return -1;
523                                 } else {
524                                         enable_retry = ret;
525                                 }
526                         }
527
528                         /* Enable/disable TX checksum offload. */
529                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
530                                 ret = parse_num_opt(optarg, 1);
531                                 if (ret == -1) {
532                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
533                                         us_vhost_usage(prgname);
534                                         return -1;
535                                 } else
536                                         enable_tx_csum = ret;
537                         }
538
539                         /* Enable/disable TSO offload. */
540                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
541                                 ret = parse_num_opt(optarg, 1);
542                                 if (ret == -1) {
543                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
544                                         us_vhost_usage(prgname);
545                                         return -1;
546                                 } else
547                                         enable_tso = ret;
548                         }
549
550                         /* Specify the retry delay time (in microseconds) on RX. */
551                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
552                                 ret = parse_num_opt(optarg, INT32_MAX);
553                                 if (ret == -1) {
554                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
555                                         us_vhost_usage(prgname);
556                                         return -1;
557                                 } else {
558                                         burst_rx_delay_time = ret;
559                                 }
560                         }
561
562                         /* Specify the retries number on RX. */
563                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
564                                 ret = parse_num_opt(optarg, INT32_MAX);
565                                 if (ret == -1) {
566                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
567                                         us_vhost_usage(prgname);
568                                         return -1;
569                                 } else {
570                                         burst_rx_retry_num = ret;
571                                 }
572                         }
573
574                         /* Enable/disable RX mergeable buffers. */
575                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
576                                 ret = parse_num_opt(optarg, 1);
577                                 if (ret == -1) {
578                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
579                                         us_vhost_usage(prgname);
580                                         return -1;
581                                 } else {
582                                         mergeable = !!ret;
583                                         if (ret) {
584                                                 vmdq_conf_default.rxmode.offloads |=
585                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
586                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
587                                                         = JUMBO_FRAME_MAX_SIZE;
588                                         }
589                                 }
590                         }
591
592                         /* Enable/disable stats. */
593                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
594                                 ret = parse_num_opt(optarg, INT32_MAX);
595                                 if (ret == -1) {
596                                         RTE_LOG(INFO, VHOST_CONFIG,
597                                                 "Invalid argument for stats [0..N]\n");
598                                         us_vhost_usage(prgname);
599                                         return -1;
600                                 } else {
601                                         enable_stats = ret;
602                                 }
603                         }
604
605                         /* Set socket file path. */
606                         if (!strncmp(long_option[option_index].name,
607                                                 "socket-file", MAX_LONG_OPT_SZ)) {
608                                 if (us_vhost_parse_socket_path(optarg) == -1) {
609                                         RTE_LOG(INFO, VHOST_CONFIG,
610                                         "Invalid argument for socket name (Max %d characters)\n",
611                                         PATH_MAX);
612                                         us_vhost_usage(prgname);
613                                         return -1;
614                                 }
615                         }
616
617                         break;
618
619                         /* Invalid option - print options. */
620                 default:
621                         us_vhost_usage(prgname);
622                         return -1;
623                 }
624         }
625
626         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
627                 if (enabled_port_mask & (1 << i))
628                         ports[num_ports++] = i;
629         }
630
631         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
632                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
633                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
634                 return -1;
635         }
636
637         return 0;
638 }
639
640 /*
641  * Update the global variable num_ports and the array ports according to the
642  * number of system ports, and return the number of valid ports.
643  */
644 static unsigned check_ports_num(unsigned nb_ports)
645 {
646         unsigned valid_num_ports = num_ports;
647         unsigned portid;
648
649         if (num_ports > nb_ports) {
650                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
651                         num_ports, nb_ports);
652                 num_ports = nb_ports;
653         }
654
655         for (portid = 0; portid < num_ports; portid ++) {
656                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
657                         RTE_LOG(INFO, VHOST_PORT,
658                                 "\nSpecified port ID(%u) is not valid\n",
659                                 ports[portid]);
660                         ports[portid] = INVALID_PORT_ID;
661                         valid_num_ports--;
662                 }
663         }
664         return valid_num_ports;
665 }
666
667 static __rte_always_inline struct vhost_dev *
668 find_vhost_dev(struct rte_ether_addr *mac)
669 {
670         struct vhost_dev *vdev;
671
672         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
673                 if (vdev->ready == DEVICE_RX &&
674                     rte_is_same_ether_addr(mac, &vdev->mac_address))
675                         return vdev;
676         }
677
678         return NULL;
679 }
680
681 /*
682  * This function learns the MAC address of the device and registers it, along with a
683  * VLAN tag, with a VMDQ pool.
684  */
685 static int
686 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
687 {
688         struct rte_ether_hdr *pkt_hdr;
689         int i, ret;
690
691         /* Learn MAC address of guest device from packet */
692         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
693
694         if (find_vhost_dev(&pkt_hdr->s_addr)) {
695                 RTE_LOG(ERR, VHOST_DATA,
696                         "(%d) device is using a registered MAC!\n",
697                         vdev->vid);
698                 return -1;
699         }
700
701         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
702                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
703
704         /* vlan_tag currently uses the device_id. */
705         vdev->vlan_tag = vlan_tags[vdev->vid];
706
707         /* Print out VMDQ registration info. */
708         RTE_LOG(INFO, VHOST_DATA,
709                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
710                 vdev->vid,
711                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
712                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
713                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
714                 vdev->vlan_tag);
715
716         /* Register the MAC address. */
717         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
718                                 (uint32_t)vdev->vid + vmdq_pool_base);
719         if (ret)
720                 RTE_LOG(ERR, VHOST_DATA,
721                         "(%d) failed to add device MAC address to VMDQ\n",
722                         vdev->vid);
723
724         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
725
726         /* Set device as ready for RX. */
727         vdev->ready = DEVICE_RX;
728
729         return 0;
730 }
731
732 /*
733  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
734  * queue before disabling RX on the device.
735  */
736 static inline void
737 unlink_vmdq(struct vhost_dev *vdev)
738 {
739         unsigned i = 0;
740         unsigned rx_count;
741         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
742
743         if (vdev->ready == DEVICE_RX) {
744                 /*clear MAC and VLAN settings*/
745                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
746                 for (i = 0; i < 6; i++)
747                         vdev->mac_address.addr_bytes[i] = 0;
748
749                 vdev->vlan_tag = 0;
750
751                 /*Clear out the receive buffers*/
752                 rx_count = rte_eth_rx_burst(ports[0],
753                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
754
755                 while (rx_count) {
756                         for (i = 0; i < rx_count; i++)
757                                 rte_pktmbuf_free(pkts_burst[i]);
758
759                         rx_count = rte_eth_rx_burst(ports[0],
760                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
761                 }
762
763                 vdev->ready = DEVICE_MAC_LEARNING;
764         }
765 }
766
767 static __rte_always_inline void
768 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
769             struct rte_mbuf *m)
770 {
771         uint16_t ret;
772
773         if (builtin_net_driver) {
774                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
775         } else {
776                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
777         }
778
779         if (enable_stats) {
780                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
781                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
782                 src_vdev->stats.tx_total++;
783                 src_vdev->stats.tx += ret;
784         }
785 }
786
787 /*
788  * Check if the packet destination MAC address is for a local device. If so, put
789  * the packet on that device's RX queue. If not, return.
790  */
791 static __rte_always_inline int
792 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
793 {
794         struct rte_ether_hdr *pkt_hdr;
795         struct vhost_dev *dst_vdev;
796
797         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
798
799         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
800         if (!dst_vdev)
801                 return -1;
802
803         if (vdev->vid == dst_vdev->vid) {
804                 RTE_LOG_DP(DEBUG, VHOST_DATA,
805                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
806                         vdev->vid);
807                 return 0;
808         }
809
810         RTE_LOG_DP(DEBUG, VHOST_DATA,
811                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
812
813         if (unlikely(dst_vdev->remove)) {
814                 RTE_LOG_DP(DEBUG, VHOST_DATA,
815                         "(%d) device is marked for removal\n", dst_vdev->vid);
816                 return 0;
817         }
818
819         virtio_xmit(dst_vdev, vdev, m);
820         return 0;
821 }
822
823 /*
824  * Check if the destination MAC of a packet belongs to a local VM,
825  * and if so get its VLAN tag and offset.
826  */
827 static __rte_always_inline int
828 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
829         uint32_t *offset, uint16_t *vlan_tag)
830 {
831         struct vhost_dev *dst_vdev;
832         struct rte_ether_hdr *pkt_hdr =
833                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
834
835         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
836         if (!dst_vdev)
837                 return 0;
838
839         if (vdev->vid == dst_vdev->vid) {
840                 RTE_LOG_DP(DEBUG, VHOST_DATA,
841                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
842                         vdev->vid);
843                 return -1;
844         }
845
846         /*
847          * HW VLAN strip reduces the packet length by the
848          * length of the VLAN tag, so restore the packet
849          * length by adding it back.
850          */
851         *offset  = VLAN_HLEN;
852         *vlan_tag = vlan_tags[vdev->vid];
853
854         RTE_LOG_DP(DEBUG, VHOST_DATA,
855                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
856                 vdev->vid, dst_vdev->vid, *vlan_tag);
857
858         return 0;
859 }
860
861 static uint16_t
862 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
863 {
864         if (ol_flags & PKT_TX_IPV4)
865                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
866         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
867                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
868 }
869
870 static void virtio_tx_offload(struct rte_mbuf *m)
871 {
872         void *l3_hdr;
873         struct rte_ipv4_hdr *ipv4_hdr = NULL;
874         struct rte_tcp_hdr *tcp_hdr = NULL;
875         struct rte_ether_hdr *eth_hdr =
876                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
877
878         l3_hdr = (char *)eth_hdr + m->l2_len;
879
880         if (m->ol_flags & PKT_TX_IPV4) {
881                 ipv4_hdr = l3_hdr;
882                 ipv4_hdr->hdr_checksum = 0;
883                 m->ol_flags |= PKT_TX_IP_CKSUM;
884         }
885
886         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
887         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
888 }
889
890 static inline void
891 free_pkts(struct rte_mbuf **pkts, uint16_t n)
892 {
893         while (n--)
894                 rte_pktmbuf_free(pkts[n]);
895 }
896
897 static __rte_always_inline void
898 do_drain_mbuf_table(struct mbuf_table *tx_q)
899 {
900         uint16_t count;
901
902         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
903                                  tx_q->m_table, tx_q->len);
904         if (unlikely(count < tx_q->len))
905                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
906
907         tx_q->len = 0;
908 }
909
910 /*
911  * This function routes the TX packet to the correct interface. This
912  * may be a local device or the physical port.
913  */
914 static __rte_always_inline void
915 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
916 {
917         struct mbuf_table *tx_q;
918         unsigned offset = 0;
919         const uint16_t lcore_id = rte_lcore_id();
920         struct rte_ether_hdr *nh;
921
922
923         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
924         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
925                 struct vhost_dev *vdev2;
926
927                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
928                         if (vdev2 != vdev)
929                                 virtio_xmit(vdev2, vdev, m);
930                 }
931                 goto queue2nic;
932         }
933
934         /*check if destination is local VM*/
935         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
936                 rte_pktmbuf_free(m);
937                 return;
938         }
939
940         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
941                 if (unlikely(find_local_dest(vdev, m, &offset,
942                                              &vlan_tag) != 0)) {
943                         rte_pktmbuf_free(m);
944                         return;
945                 }
946         }
947
948         RTE_LOG_DP(DEBUG, VHOST_DATA,
949                 "(%d) TX: MAC address is external\n", vdev->vid);
950
951 queue2nic:
952
953         /*Add packet to the port tx queue*/
954         tx_q = &lcore_tx_queue[lcore_id];
955
956         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
957         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
958                 /* Guest has inserted the vlan tag. */
959                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
960                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
961                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
962                         (vh->vlan_tci != vlan_tag_be))
963                         vh->vlan_tci = vlan_tag_be;
964         } else {
965                 m->ol_flags |= PKT_TX_VLAN_PKT;
966
967                 /*
968                  * Find the right seg to adjust the data len when offset is
969                  * bigger than tail room size.
970                  */
971                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
972                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
973                                 m->data_len += offset;
974                         else {
975                                 struct rte_mbuf *seg = m;
976
977                                 while ((seg->next != NULL) &&
978                                         (offset > rte_pktmbuf_tailroom(seg)))
979                                         seg = seg->next;
980
981                                 seg->data_len += offset;
982                         }
983                         m->pkt_len += offset;
984                 }
985
986                 m->vlan_tci = vlan_tag;
987         }
988
989         if (m->ol_flags & PKT_TX_TCP_SEG)
990                 virtio_tx_offload(m);
991
992         tx_q->m_table[tx_q->len++] = m;
993         if (enable_stats) {
994                 vdev->stats.tx_total++;
995                 vdev->stats.tx++;
996         }
997
998         if (unlikely(tx_q->len == MAX_PKT_BURST))
999                 do_drain_mbuf_table(tx_q);
1000 }
1001
1002
1003 static __rte_always_inline void
1004 drain_mbuf_table(struct mbuf_table *tx_q)
1005 {
1006         static uint64_t prev_tsc;
1007         uint64_t cur_tsc;
1008
1009         if (tx_q->len == 0)
1010                 return;
1011
1012         cur_tsc = rte_rdtsc();
1013         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1014                 prev_tsc = cur_tsc;
1015
1016                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1017                         "TX queue drained after timeout with burst size %u\n",
1018                         tx_q->len);
1019                 do_drain_mbuf_table(tx_q);
1020         }
1021 }
1022
1023 static __rte_always_inline void
1024 drain_eth_rx(struct vhost_dev *vdev)
1025 {
1026         uint16_t rx_count, enqueue_count;
1027         struct rte_mbuf *pkts[MAX_PKT_BURST];
1028
1029         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1030                                     pkts, MAX_PKT_BURST);
1031         if (!rx_count)
1032                 return;
1033
1034         /*
1035          * When "enable_retry" is set, wait and retry when there
1036          * are not enough free slots in the queue to hold @rx_count
1037          * packets, to reduce packet loss.
1038          */
1039         if (enable_retry &&
1040             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1041                         VIRTIO_RXQ))) {
1042                 uint32_t retry;
1043
1044                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1045                         rte_delay_us(burst_rx_delay_time);
1046                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1047                                         VIRTIO_RXQ))
1048                                 break;
1049                 }
1050         }
1051
1052         if (builtin_net_driver) {
1053                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1054                                                 pkts, rx_count);
1055         } else {
1056                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1057                                                 pkts, rx_count);
1058         }
1059         if (enable_stats) {
1060                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1061                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1062         }
1063
1064         free_pkts(pkts, rx_count);
1065 }
1066
1067 static __rte_always_inline void
1068 drain_virtio_tx(struct vhost_dev *vdev)
1069 {
1070         struct rte_mbuf *pkts[MAX_PKT_BURST];
1071         uint16_t count;
1072         uint16_t i;
1073
1074         if (builtin_net_driver) {
1075                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1076                                         pkts, MAX_PKT_BURST);
1077         } else {
1078                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1079                                         mbuf_pool, pkts, MAX_PKT_BURST);
1080         }
1081
1082         /* setup VMDq for the first packet */
1083         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1084                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1085                         free_pkts(pkts, count);
1086         }
1087
1088         for (i = 0; i < count; ++i)
1089                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1090 }
1091
1092 /*
1093  * Main function of vhost-switch. It basically does:
1094  *
1095  * for each vhost device {
1096  *    - drain_eth_rx()
1097  *
1098  *      Which drains the host eth Rx queue linked to the vhost device,
1099  *      and delivers all of the packets to the guest virtio Rx ring
1100  *      associated with this vhost device.
1101  *
1102  *    - drain_virtio_tx()
1103  *
1104  *      Which drains the guest virtio Tx queue and delivers all of the
1105  *      packets to the target, which could be another vhost device or the
1106  *      physical eth dev. The routing is done in function "virtio_tx_route".
1107  * }
1108  */
1109 static int
1110 switch_worker(void *arg __rte_unused)
1111 {
1112         unsigned i;
1113         unsigned lcore_id = rte_lcore_id();
1114         struct vhost_dev *vdev;
1115         struct mbuf_table *tx_q;
1116
1117         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1118
1119         tx_q = &lcore_tx_queue[lcore_id];
1120         for (i = 0; i < rte_lcore_count(); i++) {
1121                 if (lcore_ids[i] == lcore_id) {
1122                         tx_q->txq_id = i;
1123                         break;
1124                 }
1125         }
1126
1127         while(1) {
1128                 drain_mbuf_table(tx_q);
1129
1130                 /*
1131                  * Inform the configuration core that we have exited the
1132                  * linked list and that no devices are in use if requested.
1133                  */
1134                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1135                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1136
1137                 /*
1138                  * Process vhost devices
1139                  */
1140                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1141                               lcore_vdev_entry) {
1142                         if (unlikely(vdev->remove)) {
1143                                 unlink_vmdq(vdev);
1144                                 vdev->ready = DEVICE_SAFE_REMOVE;
1145                                 continue;
1146                         }
1147
1148                         if (likely(vdev->ready == DEVICE_RX))
1149                                 drain_eth_rx(vdev);
1150
1151                         if (likely(!vdev->remove))
1152                                 drain_virtio_tx(vdev);
1153                 }
1154         }
1155
1156         return 0;
1157 }
1158
1159 /*
1160  * Remove a device from the specific data core linked list and from the
1161  * main linked list. Synchronization occurs through the use of the
1162  * lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
1163  * of dev->remove=1, which could cause an infinite loop in the rte_pause loop.
1164  */
1165 static void
1166 destroy_device(int vid)
1167 {
1168         struct vhost_dev *vdev = NULL;
1169         int lcore;
1170
1171         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1172                 if (vdev->vid == vid)
1173                         break;
1174         }
1175         if (!vdev)
1176                 return;
1177         /*set the remove flag. */
1178         vdev->remove = 1;
1179         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1180                 rte_pause();
1181         }
1182
1183         if (builtin_net_driver)
1184                 vs_vhost_net_remove(vdev);
1185
1186         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1187                      lcore_vdev_entry);
1188         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1189
1190
1191         /* Set the dev_removal_flag on each lcore. */
1192         RTE_LCORE_FOREACH_WORKER(lcore)
1193                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1194
1195         /*
1196          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1197          * we can be sure that they can no longer access the device removed
1198          * from the linked lists and that the devices are no longer in use.
1199          */
1200         RTE_LCORE_FOREACH_WORKER(lcore) {
1201                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1202                         rte_pause();
1203         }
1204
1205         lcore_info[vdev->coreid].device_num--;
1206
1207         RTE_LOG(INFO, VHOST_DATA,
1208                 "(%d) device has been removed from data core\n",
1209                 vdev->vid);
1210
1211         rte_free(vdev);
1212 }
1213
1214 /*
1215  * A new device is added to a data core. First the device is added to the main linked list
1216  * and then allocated to a specific data core.
1217  */
1218 static int
1219 new_device(int vid)
1220 {
1221         int lcore, core_add = 0;
1222         uint32_t device_num_min = num_devices;
1223         struct vhost_dev *vdev;
1224
1225         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1226         if (vdev == NULL) {
1227                 RTE_LOG(INFO, VHOST_DATA,
1228                         "(%d) couldn't allocate memory for vhost dev\n",
1229                         vid);
1230                 return -1;
1231         }
1232         vdev->vid = vid;
1233
1234         if (builtin_net_driver)
1235                 vs_vhost_net_setup(vdev);
1236
1237         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1238         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1239
1240         /*reset ready flag*/
1241         vdev->ready = DEVICE_MAC_LEARNING;
1242         vdev->remove = 0;
1243
1244         /* Find a suitable lcore to add the device. */
1245         RTE_LCORE_FOREACH_WORKER(lcore) {
1246                 if (lcore_info[lcore].device_num < device_num_min) {
1247                         device_num_min = lcore_info[lcore].device_num;
1248                         core_add = lcore;
1249                 }
1250         }
1251         vdev->coreid = core_add;
1252
1253         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1254                           lcore_vdev_entry);
1255         lcore_info[vdev->coreid].device_num++;
1256
1257         /* Disable notifications. */
1258         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1259         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1260
1261         RTE_LOG(INFO, VHOST_DATA,
1262                 "(%d) device has been added to data core %d\n",
1263                 vid, vdev->coreid);
1264
1265         return 0;
1266 }
1267
1268 /*
1269  * These callbacks allow devices to be added to the data core when configuration
1270  * is fully complete.
1271  */
1272 static const struct vhost_device_ops virtio_net_device_ops =
1273 {
1274         .new_device =  new_device,
1275         .destroy_device = destroy_device,
1276 };
1277
1278 /*
1279  * This thread wakes up periodically to print stats if the user has
1280  * enabled them.
1281  */
1282 static void *
1283 print_stats(__rte_unused void *arg)
1284 {
1285         struct vhost_dev *vdev;
1286         uint64_t tx_dropped, rx_dropped;
1287         uint64_t tx, tx_total, rx, rx_total;
1288         const char clr[] = { 27, '[', '2', 'J', '\0' };
1289         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1290
1291         while(1) {
1292                 sleep(enable_stats);
1293
1294                 /* Clear screen and move to top left */
1295                 printf("%s%s\n", clr, top_left);
1296                 printf("Device statistics =================================\n");
1297
1298                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1299                         tx_total   = vdev->stats.tx_total;
1300                         tx         = vdev->stats.tx;
1301                         tx_dropped = tx_total - tx;
1302
1303                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1304                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1305                         rx_dropped = rx_total - rx;
1306
1307                         printf("Statistics for device %d\n"
1308                                 "-----------------------\n"
1309                                 "TX total:              %" PRIu64 "\n"
1310                                 "TX dropped:            %" PRIu64 "\n"
1311                                 "TX successful:         %" PRIu64 "\n"
1312                                 "RX total:              %" PRIu64 "\n"
1313                                 "RX dropped:            %" PRIu64 "\n"
1314                                 "RX successful:         %" PRIu64 "\n",
1315                                 vdev->vid,
1316                                 tx_total, tx_dropped, tx,
1317                                 rx_total, rx_dropped, rx);
1318                 }
1319
1320                 printf("===================================================\n");
1321
1322                 fflush(stdout);
1323         }
1324
1325         return NULL;
1326 }
1327
1328 static void
1329 unregister_drivers(int socket_num)
1330 {
1331         int i, ret;
1332
1333         for (i = 0; i < socket_num; i++) {
1334                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1335                 if (ret != 0)
1336                         RTE_LOG(ERR, VHOST_CONFIG,
1337                                 "Failed to unregister vhost driver for %s.\n",
1338                                 socket_files + i * PATH_MAX);
1339         }
1340 }
1341
1342 /* When we receive an INT signal, unregister the vhost driver. */
1343 static void
1344 sigint_handler(__rte_unused int signum)
1345 {
1346         /* Unregister vhost driver. */
1347         unregister_drivers(nb_sockets);
1348
1349         exit(0);
1350 }
1351
1352 /*
1353  * While creating an mbuf pool, one key thing is to figure out how
1354  * many mbuf entries are enough for our use. FYI, here are some
1355  * guidelines:
1356  *
1357  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1358  *
1359  * - For each switch core (a CPU core that does the packet switching), we
1360  *   also need to reserve some mbufs for receiving the packets from the
1361  *   virtio Tx queue. How many are enough depends on the usage. It's
1362  *   normally a simple calculation like the following:
1363  *
1364  *       MAX_PKT_BURST * max packet size / mbuf size
1365  *
1366  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1367  *
1368  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1369  *   mbufs for receiving the packets from the physical NIC device.
1370  *
1371  * - We also need to make sure, for each switch core, we have allocated
1372  *   enough mbufs to fill up the mbuf cache.
1373  */
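/*
 * A rough worked example (a sketch only: it assumes MAX_PKT_BURST is 32 and
 * mbuf_size is MBUF_DATA_SIZE = 2048 data room + 128 bytes headroom = 2176,
 * values that may differ in a given build). With --mergeable 1 the mtu below
 * is 9000, so nr_mbufs_per_core = (9000 + 2176) * 32 / (2176 - 128) = 174
 * mbufs, to which nr_rx_desc is added before clamping the result to at least
 * nr_mbuf_cache.
 */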
1374 static void
1375 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1376         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1377 {
1378         uint32_t nr_mbufs;
1379         uint32_t nr_mbufs_per_core;
1380         uint32_t mtu = 1500;
1381
1382         if (mergeable)
1383                 mtu = 9000;
1384         if (enable_tso)
1385                 mtu = 64 * 1024;
1386
1387         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1388                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1389         nr_mbufs_per_core += nr_rx_desc;
1390         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1391
1392         nr_mbufs  = nr_queues * nr_rx_desc;
1393         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1394         nr_mbufs *= nr_port;
1395
1396         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1397                                             nr_mbuf_cache, 0, mbuf_size,
1398                                             rte_socket_id());
1399         if (mbuf_pool == NULL)
1400                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1401 }
1402
1403 /*
1404  * Main function, does initialisation and calls the per-lcore functions.
1405  */
1406 int
1407 main(int argc, char *argv[])
1408 {
1409         unsigned lcore_id, core_id = 0;
1410         unsigned nb_ports, valid_num_ports;
1411         int ret, i;
1412         uint16_t portid;
1413         static pthread_t tid;
1414         uint64_t flags = 0;
1415
1416         signal(SIGINT, sigint_handler);
1417
1418         /* init EAL */
1419         ret = rte_eal_init(argc, argv);
1420         if (ret < 0)
1421                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1422         argc -= ret;
1423         argv += ret;
1424
1425         /* parse app arguments */
1426         ret = us_vhost_parse_args(argc, argv);
1427         if (ret < 0)
1428                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1429
1430         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1431                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1432
1433                 if (rte_lcore_is_enabled(lcore_id))
1434                         lcore_ids[core_id++] = lcore_id;
1435         }
1436
1437         if (rte_lcore_count() > RTE_MAX_LCORE)
1438                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1439
1440         /* Get the number of physical ports. */
1441         nb_ports = rte_eth_dev_count_avail();
1442
1443         /*
1444          * Update the global variable num_ports and the global array ports,
1445          * and get the number of valid ports according to the system port count.
1446          */
1447         valid_num_ports = check_ports_num(nb_ports);
1448
1449         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1450                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1451                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1452                 return -1;
1453         }
1454
1455         /*
1456          * FIXME: here we are trying to allocate mbufs big enough for
1457          * @MAX_QUEUES, but the truth is we're never going to use that
1458          * many queues here. We probably should only do allocation for
1459          * those queues we are going to use.
1460          */
1461         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1462                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1463
1464         if (vm2vm_mode == VM2VM_HARDWARE) {
1465                 /* Enable VT loop back to let L2 switch to do it. */
1466                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1467                 RTE_LOG(DEBUG, VHOST_CONFIG,
1468                         "Enable loop back for L2 switch in vmdq.\n");
1469         }
1470
1471         /* initialize all ports */
1472         RTE_ETH_FOREACH_DEV(portid) {
1473                 /* skip ports that are not enabled */
1474                 if ((enabled_port_mask & (1 << portid)) == 0) {
1475                         RTE_LOG(INFO, VHOST_PORT,
1476                                 "Skipping disabled port %d\n", portid);
1477                         continue;
1478                 }
1479                 if (port_init(portid) != 0)
1480                         rte_exit(EXIT_FAILURE,
1481                                 "Cannot initialize network ports\n");
1482         }
1483
1484         /* Enable stats if the user option is set. */
1485         if (enable_stats) {
1486                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1487                                         print_stats, NULL);
1488                 if (ret < 0)
1489                         rte_exit(EXIT_FAILURE,
1490                                 "Cannot create print-stats thread\n");
1491         }
1492
1493         /* Launch all data cores. */
1494         RTE_LCORE_FOREACH_WORKER(lcore_id)
1495                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1496
1497         if (client_mode)
1498                 flags |= RTE_VHOST_USER_CLIENT;
1499
1500         /* Register vhost user driver to handle vhost messages. */
1501         for (i = 0; i < nb_sockets; i++) {
1502                 char *file = socket_files + i * PATH_MAX;
1503                 ret = rte_vhost_driver_register(file, flags);
1504                 if (ret != 0) {
1505                         unregister_drivers(i);
1506                         rte_exit(EXIT_FAILURE,
1507                                 "vhost driver register failure.\n");
1508                 }
1509
1510                 if (builtin_net_driver)
1511                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1512
1513                 if (mergeable == 0) {
1514                         rte_vhost_driver_disable_features(file,
1515                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1516                 }
1517
1518                 if (enable_tx_csum == 0) {
1519                         rte_vhost_driver_disable_features(file,
1520                                 1ULL << VIRTIO_NET_F_CSUM);
1521                 }
1522
1523                 if (enable_tso == 0) {
1524                         rte_vhost_driver_disable_features(file,
1525                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1526                         rte_vhost_driver_disable_features(file,
1527                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1528                         rte_vhost_driver_disable_features(file,
1529                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1530                         rte_vhost_driver_disable_features(file,
1531                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1532                 }
1533
1534                 if (promiscuous) {
1535                         rte_vhost_driver_enable_features(file,
1536                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1537                 }
1538
1539                 ret = rte_vhost_driver_callback_register(file,
1540                         &virtio_net_device_ops);
1541                 if (ret != 0) {
1542                         rte_exit(EXIT_FAILURE,
1543                                 "failed to register vhost driver callbacks.\n");
1544                 }
1545
1546                 if (rte_vhost_driver_start(file) < 0) {
1547                         rte_exit(EXIT_FAILURE,
1548                                 "failed to start vhost driver.\n");
1549                 }
1550         }
1551
1552         RTE_LCORE_FOREACH_WORKER(lcore_id)
1553                 rte_eal_wait_lcore(lcore_id);
1554
1555         return 0;
1556
1557 }