examples/vhost: remove unnecessary constant
[dpdk.git] examples/vhost/main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
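/* Note: 0x2600 is 9728 bytes, leaving room for a 9000-byte jumbo MTU plus
 * Ethernet/VLAN and virtio-net headers; the mergeable option sets
 * max_rx_pkt_len to this value.
 */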
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60
61 /* Maximum long option length for option parsing. */
62 #define MAX_LONG_OPT_SZ 64
63
64 /* mask of enabled ports */
65 static uint32_t enabled_port_mask = 0;
66
67 /* Promiscuous mode */
68 static uint32_t promiscuous;
69
70 /* number of devices/queues to support */
71 static uint32_t num_queues = 0;
72 static uint32_t num_devices;
73
74 static struct rte_mempool *mbuf_pool;
75 static int mergeable;
76
77 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
78 typedef enum {
79         VM2VM_DISABLED = 0,
80         VM2VM_SOFTWARE = 1,
81         VM2VM_HARDWARE = 2,
82         VM2VM_LAST
83 } vm2vm_type;
84 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
85
86 /* Enable stats. */
87 static uint32_t enable_stats = 0;
88 /* Enable retries on RX. */
89 static uint32_t enable_retry = 1;
90
91 /* Disable TX checksum offload */
92 static uint32_t enable_tx_csum;
93
94 /* Disable TSO offload */
95 static uint32_t enable_tso;
96
97 static int client_mode;
98 static int dequeue_zero_copy;
99
100 static int builtin_net_driver;
101
102 /* Specify timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
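/*
 * With the defaults above, a full virtio Rx ring is retried up to 4 times
 * with a 15 us delay each, i.e. at most ~60 us of extra latency per burst.
 */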
106
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110
111 /* Empty VMDQ configuration structure. Filled in programmatically. */
112 static struct rte_eth_conf vmdq_conf_default = {
113         .rxmode = {
114                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115                 .split_hdr_size = 0,
116                 /*
117                  * VLAN strip is necessary for 1G NICs such as the I350;
118                  * it fixes a bug where IPv4 forwarding in the guest could
119                  * not forward packets from one virtio dev to another.
120                  */
121                 .offloads = (DEV_RX_OFFLOAD_CRC_STRIP |
122                              DEV_RX_OFFLOAD_VLAN_STRIP),
123         },
124
125         .txmode = {
126                 .mq_mode = ETH_MQ_TX_NONE,
127                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
128                              DEV_TX_OFFLOAD_TCP_CKSUM |
129                              DEV_TX_OFFLOAD_VLAN_INSERT |
130                              DEV_TX_OFFLOAD_MULTI_SEGS |
131                              DEV_TX_OFFLOAD_TCP_TSO),
132         },
133         .rx_adv_conf = {
134                 /*
135                  * should be overridden separately in code with
136                  * appropriate values
137                  */
138                 .vmdq_rx_conf = {
139                         .nb_queue_pools = ETH_8_POOLS,
140                         .enable_default_pool = 0,
141                         .default_pool = 0,
142                         .nb_pool_maps = 0,
143                         .pool_map = {{0, 0},},
144                 },
145         },
146 };
147
148
149 static unsigned lcore_ids[RTE_MAX_LCORE];
150 static uint16_t ports[RTE_MAX_ETHPORTS];
151 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
152 static uint16_t num_pf_queues, num_vmdq_queues;
153 static uint16_t vmdq_pool_base, vmdq_queue_base;
154 static uint16_t queues_per_pool;
155
156 const uint16_t vlan_tags[] = {
157         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
158         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
159         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
160         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
161         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
162         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
163         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
164         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
165 };
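/*
 * One tag per possible vhost device: index vid maps to tag 1000 + vid, and
 * the 64 entries match MAX_DEVICES above (see link_vmdq()).
 */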
166
167 /* ethernet addresses of ports */
168 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
169
170 static struct vhost_dev_tailq_list vhost_dev_list =
171         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
172
173 static struct lcore_info lcore_info[RTE_MAX_LCORE];
174
175 /* Used for queueing bursts of TX packets. */
176 struct mbuf_table {
177         unsigned len;
178         unsigned txq_id;
179         struct rte_mbuf *m_table[MAX_PKT_BURST];
180 };
181
182 /* TX queue for each data core. */
183 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
184
185 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
186                                  / US_PER_S * BURST_TX_DRAIN_US)
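/*
 * Illustrative arithmetic: with a 2 GHz TSC, MBUF_TABLE_DRAIN_TSC =
 * ceil(2e9 / 1e6) * 100 = 200000 cycles, i.e. a queued Tx burst waits at
 * most ~100 us before being flushed to the NIC (see drain_mbuf_table()).
 */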
187 #define VLAN_HLEN       4
188
189 /*
190  * Builds up the correct configuration for VMDQ VLAN pool map
191  * according to the pool & queue limits.
192  */
193 static inline int
194 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
195 {
196         struct rte_eth_vmdq_rx_conf conf;
197         struct rte_eth_vmdq_rx_conf *def_conf =
198                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
199         unsigned i;
200
201         memset(&conf, 0, sizeof(conf));
202         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
203         conf.nb_pool_maps = num_devices;
204         conf.enable_loop_back = def_conf->enable_loop_back;
205         conf.rx_mode = def_conf->rx_mode;
206
207         for (i = 0; i < conf.nb_pool_maps; i++) {
208                 conf.pool_map[i].vlan_id = vlan_tags[i];
209                 conf.pool_map[i].pools = (1UL << i);
210         }
211
212         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
213         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
214                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
215         return 0;
216 }
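/*
 * Sketch of the resulting mapping, assuming num_devices = 8: pool_map[0] =
 * { vlan_id 1000, pools 0x01 } ... pool_map[7] = { vlan_id 1007, pools 0x80 },
 * i.e. traffic tagged 1000 + i is steered to VMDQ pool i, the pool later
 * bound to vhost device i.
 */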
217
218 /*
219  * Validate the device number against the max pool number obtained from
220  * dev_info. If the device number is invalid, print an error message and
221  * return -1. Each device must have its own pool.
222  */
223 static inline int
224 validate_num_devices(uint32_t max_nb_devices)
225 {
226         if (num_devices > max_nb_devices) {
227                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
228                 return -1;
229         }
230         return 0;
231 }
232
233 /*
234  * Initialises a given port using global settings, with the Rx buffers
235  * coming from the global mbuf_pool.
236  */
237 static inline int
238 port_init(uint16_t port)
239 {
240         struct rte_eth_dev_info dev_info;
241         struct rte_eth_conf port_conf;
242         struct rte_eth_rxconf *rxconf;
243         struct rte_eth_txconf *txconf;
244         int16_t rx_rings, tx_rings;
245         uint16_t rx_ring_size, tx_ring_size;
246         int retval;
247         uint16_t q;
248
249         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
250         rte_eth_dev_info_get(port, &dev_info);
251
252         rxconf = &dev_info.default_rxconf;
253         txconf = &dev_info.default_txconf;
254         rxconf->rx_drop_en = 1;
255
256         /* Configure the number of supported virtio devices based on VMDQ limits. */
257         num_devices = dev_info.max_vmdq_pools;
258
259         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
260         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
261
262         /*
263          * When dequeue zero copy is enabled, guest Tx used vring will be
264          * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
265          * (tx_ring_size here) must be small enough so that the driver will
266          * hit the free threshold easily and free mbufs timely. Otherwise,
267          * guest Tx vring would be starved.
268          */
269         if (dequeue_zero_copy)
270                 tx_ring_size = 64;
271
272         tx_rings = (uint16_t)rte_lcore_count();
273
274         retval = validate_num_devices(MAX_DEVICES);
275         if (retval < 0)
276                 return retval;
277
278         /* Get port configuration. */
279         retval = get_eth_conf(&port_conf, num_devices);
280         if (retval < 0)
281                 return retval;
282         /* NIC queues are divided into pf queues and vmdq queues.  */
283         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
284         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
285         num_vmdq_queues = num_devices * queues_per_pool;
286         num_queues = num_pf_queues + num_vmdq_queues;
287         vmdq_queue_base = dev_info.vmdq_queue_base;
288         vmdq_pool_base  = dev_info.vmdq_pool_base;
289         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
290                 num_pf_queues, num_devices, queues_per_pool);
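        /*
         * Illustrative numbers, assuming an ixgbe-class NIC in VMDQ mode
         * (max_rx_queues = 128, vmdq_queue_num = 128, max_vmdq_pools = 64):
         * num_pf_queues = 0, queues_per_pool = 2, num_vmdq_queues = 128, and
         * vhost device N is later tied to Rx queue
         * vmdq_queue_base + N * queues_per_pool (see new_device()).
         */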
291
292         if (!rte_eth_dev_is_valid_port(port))
293                 return -1;
294
295         rx_rings = (uint16_t)dev_info.max_rx_queues;
296         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
297                 port_conf.txmode.offloads |=
298                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
299         /* Configure ethernet device. */
300         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
301         if (retval != 0) {
302                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
303                         port, strerror(-retval));
304                 return retval;
305         }
306
307         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
308                 &tx_ring_size);
309         if (retval != 0) {
310                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
311                         "for port %u: %s.\n", port, strerror(-retval));
312                 return retval;
313         }
314         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
315                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
316                         "for Rx queues on port %u.\n", port);
317                 return -1;
318         }
319
320         /* Setup the queues. */
321         rxconf->offloads = port_conf.rxmode.offloads;
322         for (q = 0; q < rx_rings; q ++) {
323                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
324                                                 rte_eth_dev_socket_id(port),
325                                                 rxconf,
326                                                 mbuf_pool);
327                 if (retval < 0) {
328                         RTE_LOG(ERR, VHOST_PORT,
329                                 "Failed to setup rx queue %u of port %u: %s.\n",
330                                 q, port, strerror(-retval));
331                         return retval;
332                 }
333         }
334         txconf->offloads = port_conf.txmode.offloads;
335         for (q = 0; q < tx_rings; q ++) {
336                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
337                                                 rte_eth_dev_socket_id(port),
338                                                 txconf);
339                 if (retval < 0) {
340                         RTE_LOG(ERR, VHOST_PORT,
341                                 "Failed to setup tx queue %u of port %u: %s.\n",
342                                 q, port, strerror(-retval));
343                         return retval;
344                 }
345         }
346
347         /* Start the device. */
348         retval  = rte_eth_dev_start(port);
349         if (retval < 0) {
350                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
351                         port, strerror(-retval));
352                 return retval;
353         }
354
355         if (promiscuous)
356                 rte_eth_promiscuous_enable(port);
357
358         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
359         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
360         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
361                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
362                         port,
363                         vmdq_ports_eth_addr[port].addr_bytes[0],
364                         vmdq_ports_eth_addr[port].addr_bytes[1],
365                         vmdq_ports_eth_addr[port].addr_bytes[2],
366                         vmdq_ports_eth_addr[port].addr_bytes[3],
367                         vmdq_ports_eth_addr[port].addr_bytes[4],
368                         vmdq_ports_eth_addr[port].addr_bytes[5]);
369
370         return 0;
371 }
372
373 /*
374  * Set socket file path.
375  */
376 static int
377 us_vhost_parse_socket_path(const char *q_arg)
378 {
379         /* parse number string */
380         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
381                 return -1;
382
383         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
384         snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
385         nb_sockets++;
386
387         return 0;
388 }
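/*
 * Note: socket_files is a flat array of fixed PATH_MAX-sized slots; entry i
 * lives at socket_files + i * PATH_MAX (see unregister_drivers() and main()).
 */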
389
390 /*
391  * Parse the portmask provided at run time. Returns 0 on any parse error.
392  */
393 static int
394 parse_portmask(const char *portmask)
395 {
396         char *end = NULL;
397         unsigned long pm;
398
399         errno = 0;
400
401         /* parse hexadecimal string */
402         pm = strtoul(portmask, &end, 16);
403         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
404                 return 0;
405
406         if (pm == 0)
407                 return 0;
408
409         return pm;
410
411 }
412
413 /*
414  * Parse num options at run time.
415  */
416 static int
417 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
418 {
419         char *end = NULL;
420         unsigned long num;
421
422         errno = 0;
423
424         /* parse unsigned int string */
425         num = strtoul(q_arg, &end, 10);
426         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
427                 return -1;
428
429         if (num > max_valid_value)
430                 return -1;
431
432         return num;
433
434 }
435
436 /*
437  * Display usage
438  */
439 static void
440 us_vhost_usage(const char *prgname)
441 {
442         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
443         "               --vm2vm [0|1|2]\n"
444         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
445         "               --socket-file <path>\n"
447         "               -p PORTMASK: Set mask for ports to be used by application\n"
448         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
449         "               --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
450         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Only takes effect if Rx retries are enabled\n"
451         "               --rx-retry-num [0-N]: the number of retries on Rx. Only takes effect if Rx retries are enabled\n"
452         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
453         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
454         "               --socket-file: The path of the socket file.\n"
455         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
456         "               --tso [0|1] disable/enable TCP segmentation offload.\n"
457         "               --client register a vhost-user socket as client mode.\n"
458         "               --dequeue-zero-copy enables dequeue zero copy\n",
459                prgname);
460 }
461
462 /*
463  * Parse the arguments given in the command line of the application.
464  */
465 static int
466 us_vhost_parse_args(int argc, char **argv)
467 {
468         int opt, ret;
469         int option_index;
470         unsigned i;
471         const char *prgname = argv[0];
472         static struct option long_option[] = {
473                 {"vm2vm", required_argument, NULL, 0},
474                 {"rx-retry", required_argument, NULL, 0},
475                 {"rx-retry-delay", required_argument, NULL, 0},
476                 {"rx-retry-num", required_argument, NULL, 0},
477                 {"mergeable", required_argument, NULL, 0},
478                 {"stats", required_argument, NULL, 0},
479                 {"socket-file", required_argument, NULL, 0},
480                 {"tx-csum", required_argument, NULL, 0},
481                 {"tso", required_argument, NULL, 0},
482                 {"client", no_argument, &client_mode, 1},
483                 {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
484                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
485                 {NULL, 0, 0, 0},
486         };
487
488         /* Parse command line */
489         while ((opt = getopt_long(argc, argv, "p:P",
490                         long_option, &option_index)) != EOF) {
491                 switch (opt) {
492                 /* Portmask */
493                 case 'p':
494                         enabled_port_mask = parse_portmask(optarg);
495                         if (enabled_port_mask == 0) {
496                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
497                                 us_vhost_usage(prgname);
498                                 return -1;
499                         }
500                         break;
501
502                 case 'P':
503                         promiscuous = 1;
504                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
505                                 ETH_VMDQ_ACCEPT_BROADCAST |
506                                 ETH_VMDQ_ACCEPT_MULTICAST;
507
508                         break;
509
510                 case 0:
511                         /* Enable/disable vm2vm comms. */
512                         if (!strncmp(long_option[option_index].name, "vm2vm",
513                                 MAX_LONG_OPT_SZ)) {
514                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
515                                 if (ret == -1) {
516                                         RTE_LOG(INFO, VHOST_CONFIG,
517                                                 "Invalid argument for "
518                                                 "vm2vm [0|1|2]\n");
519                                         us_vhost_usage(prgname);
520                                         return -1;
521                                 } else {
522                                         vm2vm_mode = (vm2vm_type)ret;
523                                 }
524                         }
525
526                         /* Enable/disable retries on RX. */
527                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
528                                 ret = parse_num_opt(optarg, 1);
529                                 if (ret == -1) {
530                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
531                                         us_vhost_usage(prgname);
532                                         return -1;
533                                 } else {
534                                         enable_retry = ret;
535                                 }
536                         }
537
538                         /* Enable/disable TX checksum offload. */
539                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
540                                 ret = parse_num_opt(optarg, 1);
541                                 if (ret == -1) {
542                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
543                                         us_vhost_usage(prgname);
544                                         return -1;
545                                 } else
546                                         enable_tx_csum = ret;
547                         }
548
549                         /* Enable/disable TSO offload. */
550                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
551                                 ret = parse_num_opt(optarg, 1);
552                                 if (ret == -1) {
553                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
554                                         us_vhost_usage(prgname);
555                                         return -1;
556                                 } else
557                                         enable_tso = ret;
558                         }
559
560                         /* Specify the retries delay time (in useconds) on RX. */
561                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
562                                 ret = parse_num_opt(optarg, INT32_MAX);
563                                 if (ret == -1) {
564                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
565                                         us_vhost_usage(prgname);
566                                         return -1;
567                                 } else {
568                                         burst_rx_delay_time = ret;
569                                 }
570                         }
571
572                         /* Specify the retries number on RX. */
573                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
574                                 ret = parse_num_opt(optarg, INT32_MAX);
575                                 if (ret == -1) {
576                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
577                                         us_vhost_usage(prgname);
578                                         return -1;
579                                 } else {
580                                         burst_rx_retry_num = ret;
581                                 }
582                         }
583
584                         /* Enable/disable RX mergeable buffers. */
585                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
586                                 ret = parse_num_opt(optarg, 1);
587                                 if (ret == -1) {
588                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
589                                         us_vhost_usage(prgname);
590                                         return -1;
591                                 } else {
592                                         mergeable = !!ret;
593                                         if (ret) {
594                                                 vmdq_conf_default.rxmode.offloads |=
595                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
596                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
597                                                         = JUMBO_FRAME_MAX_SIZE;
598                                         }
599                                 }
600                         }
601
602                         /* Enable/disable stats. */
603                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
604                                 ret = parse_num_opt(optarg, INT32_MAX);
605                                 if (ret == -1) {
606                                         RTE_LOG(INFO, VHOST_CONFIG,
607                                                 "Invalid argument for stats [0..N]\n");
608                                         us_vhost_usage(prgname);
609                                         return -1;
610                                 } else {
611                                         enable_stats = ret;
612                                 }
613                         }
614
615                         /* Set socket file path. */
616                         if (!strncmp(long_option[option_index].name,
617                                                 "socket-file", MAX_LONG_OPT_SZ)) {
618                                 if (us_vhost_parse_socket_path(optarg) == -1) {
619                                         RTE_LOG(INFO, VHOST_CONFIG,
620                                         "Invalid argument for socket name (Max %d characters)\n",
621                                         PATH_MAX);
622                                         us_vhost_usage(prgname);
623                                         return -1;
624                                 }
625                         }
626
627                         break;
628
629                         /* Invalid option - print options. */
630                 default:
631                         us_vhost_usage(prgname);
632                         return -1;
633                 }
634         }
635
636         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
637                 if (enabled_port_mask & (1 << i))
638                         ports[num_ports++] = i;
639         }
640
641         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
642                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
643                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
644                 return -1;
645         }
646
647         return 0;
648 }
649
650 /*
651  * Update the global var num_ports and the array ports according to the number
652  * of system ports, and return the number of valid ports.
653  */
654 static unsigned check_ports_num(unsigned nb_ports)
655 {
656         unsigned valid_num_ports = num_ports;
657         unsigned portid;
658
659         if (num_ports > nb_ports) {
660                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
661                         num_ports, nb_ports);
662                 num_ports = nb_ports;
663         }
664
665         for (portid = 0; portid < num_ports; portid ++) {
666                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
667                         RTE_LOG(INFO, VHOST_PORT,
668                                 "\nSpecified port ID(%u) is not valid\n",
669                                 ports[portid]);
670                         ports[portid] = INVALID_PORT_ID;
671                         valid_num_ports--;
672                 }
673         }
674         return valid_num_ports;
675 }
676
677 static __rte_always_inline struct vhost_dev *
678 find_vhost_dev(struct ether_addr *mac)
679 {
680         struct vhost_dev *vdev;
681
682         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
683                 if (vdev->ready == DEVICE_RX &&
684                     is_same_ether_addr(mac, &vdev->mac_address))
685                         return vdev;
686         }
687
688         return NULL;
689 }
690
691 /*
692  * This function learns the MAC address of the device and registers it along with a
693  * VLAN tag in a VMDQ pool.
694  */
695 static int
696 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
697 {
698         struct ether_hdr *pkt_hdr;
699         int i, ret;
700
701         /* Learn MAC address of guest device from packet */
702         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
703
704         if (find_vhost_dev(&pkt_hdr->s_addr)) {
705                 RTE_LOG(ERR, VHOST_DATA,
706                         "(%d) device is using a registered MAC!\n",
707                         vdev->vid);
708                 return -1;
709         }
710
711         for (i = 0; i < ETHER_ADDR_LEN; i++)
712                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
713
714         /* vlan_tag currently uses the device_id. */
715         vdev->vlan_tag = vlan_tags[vdev->vid];
716
717         /* Print out VMDQ registration info. */
718         RTE_LOG(INFO, VHOST_DATA,
719                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
720                 vdev->vid,
721                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
722                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
723                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
724                 vdev->vlan_tag);
725
726         /* Register the MAC address. */
727         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
728                                 (uint32_t)vdev->vid + vmdq_pool_base);
729         if (ret)
730                 RTE_LOG(ERR, VHOST_DATA,
731                         "(%d) failed to add device MAC address to VMDQ\n",
732                         vdev->vid);
733
734         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
735
736         /* Set device as ready for RX. */
737         vdev->ready = DEVICE_RX;
738
739         return 0;
740 }
741
742 /*
743  * Removes the MAC address and VLAN tag from the VMDQ pool. Ensures that nothing
744  * is adding buffers to the Rx queue before disabling Rx on the device.
745  */
746 static inline void
747 unlink_vmdq(struct vhost_dev *vdev)
748 {
749         unsigned i = 0;
750         unsigned rx_count;
751         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
752
753         if (vdev->ready == DEVICE_RX) {
754                 /*clear MAC and VLAN settings*/
755                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
756                 for (i = 0; i < 6; i++)
757                         vdev->mac_address.addr_bytes[i] = 0;
758
759                 vdev->vlan_tag = 0;
760
761                 /*Clear out the receive buffers*/
762                 rx_count = rte_eth_rx_burst(ports[0],
763                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
764
765                 while (rx_count) {
766                         for (i = 0; i < rx_count; i++)
767                                 rte_pktmbuf_free(pkts_burst[i]);
768
769                         rx_count = rte_eth_rx_burst(ports[0],
770                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
771                 }
772
773                 vdev->ready = DEVICE_MAC_LEARNING;
774         }
775 }
776
777 static __rte_always_inline void
778 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
779             struct rte_mbuf *m)
780 {
781         uint16_t ret;
782
783         if (builtin_net_driver) {
784                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
785         } else {
786                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
787         }
788
789         if (enable_stats) {
790                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
791                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
792                 src_vdev->stats.tx_total++;
793                 src_vdev->stats.tx += ret;
794         }
795 }
796
797 /*
798  * Check if the packet destination MAC address is for a local device. If so then put
799  * the packet on that device's Rx queue. If not then return.
800  */
801 static __rte_always_inline int
802 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
803 {
804         struct ether_hdr *pkt_hdr;
805         struct vhost_dev *dst_vdev;
806
807         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
808
809         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
810         if (!dst_vdev)
811                 return -1;
812
813         if (vdev->vid == dst_vdev->vid) {
814                 RTE_LOG_DP(DEBUG, VHOST_DATA,
815                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
816                         vdev->vid);
817                 return 0;
818         }
819
820         RTE_LOG_DP(DEBUG, VHOST_DATA,
821                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
822
823         if (unlikely(dst_vdev->remove)) {
824                 RTE_LOG_DP(DEBUG, VHOST_DATA,
825                         "(%d) device is marked for removal\n", dst_vdev->vid);
826                 return 0;
827         }
828
829         virtio_xmit(dst_vdev, vdev, m);
830         return 0;
831 }
832
833 /*
834  * Check if the destination MAC of a packet belongs to a local VM,
835  * and if so get its VLAN tag and offset.
836  */
837 static __rte_always_inline int
838 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
839         uint32_t *offset, uint16_t *vlan_tag)
840 {
841         struct vhost_dev *dst_vdev;
842         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
843
844         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
845         if (!dst_vdev)
846                 return 0;
847
848         if (vdev->vid == dst_vdev->vid) {
849                 RTE_LOG_DP(DEBUG, VHOST_DATA,
850                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
851                         vdev->vid);
852                 return -1;
853         }
854
855         /*
856          * HW VLAN strip reduces the packet length by the
857          * length of the VLAN tag, so we need to restore the
858          * packet length by adding it back.
859          */
860         *offset  = VLAN_HLEN;
861         *vlan_tag = vlan_tags[vdev->vid];
862
863         RTE_LOG_DP(DEBUG, VHOST_DATA,
864                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
865                 vdev->vid, dst_vdev->vid, *vlan_tag);
866
867         return 0;
868 }
869
870 static uint16_t
871 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
872 {
873         if (ol_flags & PKT_TX_IPV4)
874                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
875         else /* assume ethertype == ETHER_TYPE_IPv6 */
876                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
877 }
878
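/*
 * Per the DPDK Tx offload contract for PKT_TX_TCP_SEG mbufs, the application
 * zeroes the IPv4 header checksum and seeds the TCP checksum with the
 * pseudo-header checksum; the NIC then fills in the final checksums while
 * segmenting. The two helpers below prepare the headers accordingly.
 */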
879 static void virtio_tx_offload(struct rte_mbuf *m)
880 {
881         void *l3_hdr;
882         struct ipv4_hdr *ipv4_hdr = NULL;
883         struct tcp_hdr *tcp_hdr = NULL;
884         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
885
886         l3_hdr = (char *)eth_hdr + m->l2_len;
887
888         if (m->ol_flags & PKT_TX_IPV4) {
889                 ipv4_hdr = l3_hdr;
890                 ipv4_hdr->hdr_checksum = 0;
891                 m->ol_flags |= PKT_TX_IP_CKSUM;
892         }
893
894         tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
895         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
896 }
897
898 static inline void
899 free_pkts(struct rte_mbuf **pkts, uint16_t n)
900 {
901         while (n--)
902                 rte_pktmbuf_free(pkts[n]);
903 }
904
905 static __rte_always_inline void
906 do_drain_mbuf_table(struct mbuf_table *tx_q)
907 {
908         uint16_t count;
909
910         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
911                                  tx_q->m_table, tx_q->len);
912         if (unlikely(count < tx_q->len))
913                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
914
915         tx_q->len = 0;
916 }
917
918 /*
919  * This function routes the TX packet to the correct interface. This
920  * may be a local device or the physical port.
921  */
922 static __rte_always_inline void
923 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
924 {
925         struct mbuf_table *tx_q;
926         unsigned offset = 0;
927         const uint16_t lcore_id = rte_lcore_id();
928         struct ether_hdr *nh;
929
930
931         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
932         if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
933                 struct vhost_dev *vdev2;
934
935                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
936                         if (vdev2 != vdev)
937                                 virtio_xmit(vdev2, vdev, m);
938                 }
939                 goto queue2nic;
940         }
941
942         /* Check if destination is a local VM. */
943         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
944                 rte_pktmbuf_free(m);
945                 return;
946         }
947
948         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
949                 if (unlikely(find_local_dest(vdev, m, &offset,
950                                              &vlan_tag) != 0)) {
951                         rte_pktmbuf_free(m);
952                         return;
953                 }
954         }
955
956         RTE_LOG_DP(DEBUG, VHOST_DATA,
957                 "(%d) TX: MAC address is external\n", vdev->vid);
958
959 queue2nic:
960
961         /* Add packet to the port Tx queue. */
962         tx_q = &lcore_tx_queue[lcore_id];
963
964         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
965         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
966                 /* Guest has inserted the vlan tag. */
967                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
968                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
969                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
970                         (vh->vlan_tci != vlan_tag_be))
971                         vh->vlan_tci = vlan_tag_be;
972         } else {
973                 m->ol_flags |= PKT_TX_VLAN_PKT;
974
975                 /*
976                  * Find the right seg to adjust the data len when offset is
977                  * bigger than tail room size.
978                  */
979                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
980                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
981                                 m->data_len += offset;
982                         else {
983                                 struct rte_mbuf *seg = m;
984
985                                 while ((seg->next != NULL) &&
986                                         (offset > rte_pktmbuf_tailroom(seg)))
987                                         seg = seg->next;
988
989                                 seg->data_len += offset;
990                         }
991                         m->pkt_len += offset;
992                 }
993
994                 m->vlan_tci = vlan_tag;
995         }
996
997         if (m->ol_flags & PKT_TX_TCP_SEG)
998                 virtio_tx_offload(m);
999
1000         tx_q->m_table[tx_q->len++] = m;
1001         if (enable_stats) {
1002                 vdev->stats.tx_total++;
1003                 vdev->stats.tx++;
1004         }
1005
1006         if (unlikely(tx_q->len == MAX_PKT_BURST))
1007                 do_drain_mbuf_table(tx_q);
1008 }
1009
1010
1011 static __rte_always_inline void
1012 drain_mbuf_table(struct mbuf_table *tx_q)
1013 {
1014         static uint64_t prev_tsc;
1015         uint64_t cur_tsc;
1016
1017         if (tx_q->len == 0)
1018                 return;
1019
1020         cur_tsc = rte_rdtsc();
1021         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1022                 prev_tsc = cur_tsc;
1023
1024                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1025                         "TX queue drained after timeout with burst size %u\n",
1026                         tx_q->len);
1027                 do_drain_mbuf_table(tx_q);
1028         }
1029 }
1030
1031 static __rte_always_inline void
1032 drain_eth_rx(struct vhost_dev *vdev)
1033 {
1034         uint16_t rx_count, enqueue_count;
1035         struct rte_mbuf *pkts[MAX_PKT_BURST];
1036
1037         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1038                                     pkts, MAX_PKT_BURST);
1039         if (!rx_count)
1040                 return;
1041
1042         /*
1043          * When "enable_retry" is set, wait and retry when there are
1044          * not enough free slots in the queue to hold @rx_count packets,
1045          * to diminish packet loss.
1046          */
1047         if (enable_retry &&
1048             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1049                         VIRTIO_RXQ))) {
1050                 uint32_t retry;
1051
1052                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1053                         rte_delay_us(burst_rx_delay_time);
1054                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1055                                         VIRTIO_RXQ))
1056                                 break;
1057                 }
1058         }
1059
1060         if (builtin_net_driver) {
1061                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1062                                                 pkts, rx_count);
1063         } else {
1064                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1065                                                 pkts, rx_count);
1066         }
1067         if (enable_stats) {
1068                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1069                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1070         }
1071
1072         free_pkts(pkts, rx_count);
1073 }
1074
1075 static __rte_always_inline void
1076 drain_virtio_tx(struct vhost_dev *vdev)
1077 {
1078         struct rte_mbuf *pkts[MAX_PKT_BURST];
1079         uint16_t count;
1080         uint16_t i;
1081
1082         if (builtin_net_driver) {
1083                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1084                                         pkts, MAX_PKT_BURST);
1085         } else {
1086                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1087                                         mbuf_pool, pkts, MAX_PKT_BURST);
1088         }
1089
1090         /* Set up VMDQ for the first packet; drop the whole burst on failure. */
1091         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count &&
1092             (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)) {
1093                 free_pkts(pkts, count);
1094                 return;
1095         }
1096         for (i = 0; i < count; ++i)
1097                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1098 }
1099
1100 /*
1101  * Main function of vhost-switch. It basically does:
1102  *
1103  * for each vhost device {
1104  *    - drain_eth_rx()
1105  *
1106  *      which drains the host eth Rx queue linked to the vhost device
1107  *      and delivers all packets to the guest virtio Rx ring associated
1108  *      with this vhost device.
1109  *
1110  *    - drain_virtio_tx()
1111  *
1112  *      which drains the guest virtio Tx queue and delivers all packets
1113  *      to the target, which could be another vhost device or the
1114  *      physical eth dev. The routing is done in virtio_tx_route().
1115  * }
1116  */
1117 static int
1118 switch_worker(void *arg __rte_unused)
1119 {
1120         unsigned i;
1121         unsigned lcore_id = rte_lcore_id();
1122         struct vhost_dev *vdev;
1123         struct mbuf_table *tx_q;
1124
1125         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1126
1127         tx_q = &lcore_tx_queue[lcore_id];
1128         for (i = 0; i < rte_lcore_count(); i++) {
1129                 if (lcore_ids[i] == lcore_id) {
1130                         tx_q->txq_id = i;
1131                         break;
1132                 }
1133         }
1134
1135         while(1) {
1136                 drain_mbuf_table(tx_q);
1137
1138                 /*
1139                  * Inform the configuration core that we have exited the
1140                  * linked list and that no devices are in use if requested.
1141                  */
1142                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1143                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1144
1145                 /*
1146                  * Process vhost devices
1147                  */
1148                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1149                               lcore_vdev_entry) {
1150                         if (unlikely(vdev->remove)) {
1151                                 unlink_vmdq(vdev);
1152                                 vdev->ready = DEVICE_SAFE_REMOVE;
1153                                 continue;
1154                         }
1155
1156                         if (likely(vdev->ready == DEVICE_RX))
1157                                 drain_eth_rx(vdev);
1158
1159                         if (likely(!vdev->remove))
1160                                 drain_virtio_tx(vdev);
1161                 }
1162         }
1163
1164         return 0;
1165 }
1166
1167 /*
1168  * Remove a device from the specific data core linked list and from the
1169  * main linked list. Synchronization occurs through the use of the
1170  * lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
1171  * of dev->remove=1, which could otherwise cause an infinite loop in the rte_pause loop.
1172  */
1173 static void
1174 destroy_device(int vid)
1175 {
1176         struct vhost_dev *vdev = NULL;
1177         int lcore;
1178
1179         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1180                 if (vdev->vid == vid)
1181                         break;
1182         }
1183         if (!vdev)
1184                 return;
1185         /*set the remove flag. */
1186         vdev->remove = 1;
1187         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1188                 rte_pause();
1189         }
1190
1191         if (builtin_net_driver)
1192                 vs_vhost_net_remove(vdev);
1193
1194         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1195                      lcore_vdev_entry);
1196         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1197
1198
1199         /* Set the dev_removal_flag on each lcore. */
1200         RTE_LCORE_FOREACH_SLAVE(lcore)
1201                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1202
1203         /*
1204          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1205          * we can be sure that they can no longer access the device removed
1206          * from the linked lists and that the devices are no longer in use.
1207          */
1208         RTE_LCORE_FOREACH_SLAVE(lcore) {
1209                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1210                         rte_pause();
1211         }
1212
1213         lcore_info[vdev->coreid].device_num--;
1214
1215         RTE_LOG(INFO, VHOST_DATA,
1216                 "(%d) device has been removed from data core\n",
1217                 vdev->vid);
1218
1219         rte_free(vdev);
1220 }
1221
1222 /*
1223  * A new device is added to a data core. First the device is added to the main linked list
1224  * and then allocated to a specific data core.
1225  */
1226 static int
1227 new_device(int vid)
1228 {
1229         int lcore, core_add = 0;
1230         uint32_t device_num_min = num_devices;
1231         struct vhost_dev *vdev;
1232
1233         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1234         if (vdev == NULL) {
1235                 RTE_LOG(INFO, VHOST_DATA,
1236                         "(%d) couldn't allocate memory for vhost dev\n",
1237                         vid);
1238                 return -1;
1239         }
1240         vdev->vid = vid;
1241
1242         if (builtin_net_driver)
1243                 vs_vhost_net_setup(vdev);
1244
1245         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1246         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1247
1248         /*reset ready flag*/
1249         vdev->ready = DEVICE_MAC_LEARNING;
1250         vdev->remove = 0;
1251
1252         /* Find a suitable lcore to add the device. */
1253         RTE_LCORE_FOREACH_SLAVE(lcore) {
1254                 if (lcore_info[lcore].device_num < device_num_min) {
1255                         device_num_min = lcore_info[lcore].device_num;
1256                         core_add = lcore;
1257                 }
1258         }
1259         vdev->coreid = core_add;
1260
1261         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1262                           lcore_vdev_entry);
1263         lcore_info[vdev->coreid].device_num++;
1264
1265         /* Disable notifications. */
1266         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1267         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1268
1269         RTE_LOG(INFO, VHOST_DATA,
1270                 "(%d) device has been added to data core %d\n",
1271                 vid, vdev->coreid);
1272
1273         return 0;
1274 }
1275
1276 /*
1277  * These callbacks allow devices to be added to the data core when configuration
1278  * has been fully completed.
1279  */
1280 static const struct vhost_device_ops virtio_net_device_ops =
1281 {
1282         .new_device =  new_device,
1283         .destroy_device = destroy_device,
1284 };
1285
1286 /*
1287  * This thread wakes up periodically to print stats if the user has
1288  * enabled them.
1289  */
1290 static void *
1291 print_stats(__rte_unused void *arg)
1292 {
1293         struct vhost_dev *vdev;
1294         uint64_t tx_dropped, rx_dropped;
1295         uint64_t tx, tx_total, rx, rx_total;
1296         const char clr[] = { 27, '[', '2', 'J', '\0' };
1297         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1298
1299         while(1) {
1300                 sleep(enable_stats);
1301
1302                 /* Clear screen and move to top left */
1303                 printf("%s%s\n", clr, top_left);
1304                 printf("Device statistics =================================\n");
1305
1306                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1307                         tx_total   = vdev->stats.tx_total;
1308                         tx         = vdev->stats.tx;
1309                         tx_dropped = tx_total - tx;
1310
1311                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1312                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1313                         rx_dropped = rx_total - rx;
1314
1315                         printf("Statistics for device %d\n"
1316                                 "-----------------------\n"
1317                                 "TX total:              %" PRIu64 "\n"
1318                                 "TX dropped:            %" PRIu64 "\n"
1319                                 "TX successful:         %" PRIu64 "\n"
1320                                 "RX total:              %" PRIu64 "\n"
1321                                 "RX dropped:            %" PRIu64 "\n"
1322                                 "RX successful:         %" PRIu64 "\n",
1323                                 vdev->vid,
1324                                 tx_total, tx_dropped, tx,
1325                                 rx_total, rx_dropped, rx);
1326                 }
1327
1328                 printf("===================================================\n");
1329         }
1330
1331         return NULL;
1332 }
1333
1334 static void
1335 unregister_drivers(int socket_num)
1336 {
1337         int i, ret;
1338
1339         for (i = 0; i < socket_num; i++) {
1340                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1341                 if (ret != 0)
1342                         RTE_LOG(ERR, VHOST_CONFIG,
1343                                 "Fail to unregister vhost driver for %s.\n",
1344                                 socket_files + i * PATH_MAX);
1345         }
1346 }
1347
1348 /* When we receive an INT signal, unregister the vhost driver. */
1349 static void
1350 sigint_handler(__rte_unused int signum)
1351 {
1352         /* Unregister vhost driver. */
1353         unregister_drivers(nb_sockets);
1354
1355         exit(0);
1356 }
1357
1358 /*
1359  * While creating an mbuf pool, one key thing is to figure out how
1360  * many mbuf entries are enough for our use. Here are some
1361  * guidelines:
1362  *
1363  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1364  *
1365  * - For each switch core (a CPU core that does the packet switching),
1366  *   we also need to reserve some mbufs for receiving the packets from
1367  *   the virtio Tx queue. How many are enough depends on the usage. It's
1368  *   normally a simple calculation like the following:
1369  *
1370  *       MAX_PKT_BURST * max packet size / mbuf size
1371  *
1372  *   So we definitely need to allocate more mbufs when TSO is enabled.
1373  *
1374  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1375  *   mbufs for receiving the packets from the physical NIC device.
1376  *
1377  * - We also need to make sure, for each switch core, that we have
1378  *   allocated enough mbufs to fill up the mbuf cache.
1379  */
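/*
 * Worked example (illustrative, assuming the defaults used by main():
 * mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE = 2176, RTE_PKTMBUF_HEADROOM = 128,
 * MAX_PKT_BURST = 32, nr_rx_desc = 1024, nr_mbuf_cache = 128, mtu = 1500):
 *
 *   nr_mbufs_per_core  = (1500 + 2176) * 32 / (2176 - 128) = 57
 *   nr_mbufs_per_core += 1024  ->  1081
 *   nr_mbufs = 128 queues * 1024 desc + 1081 * nr_switch_core, per port
 *
 * With TSO enabled (mtu = 64K) the per-core term alone grows to about
 * (65536 + 2176) * 32 / 2048 = 1058 mbufs, before adding nr_rx_desc.
 */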
1380 static void
1381 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1382         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1383 {
1384         uint32_t nr_mbufs;
1385         uint32_t nr_mbufs_per_core;
1386         uint32_t mtu = 1500;
1387
1388         if (mergeable)
1389                 mtu = 9000;
1390         if (enable_tso)
1391                 mtu = 64 * 1024;
1392
1393         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1394                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1395         nr_mbufs_per_core += nr_rx_desc;
1396         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1397
1398         nr_mbufs  = nr_queues * nr_rx_desc;
1399         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1400         nr_mbufs *= nr_port;
1401
1402         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1403                                             nr_mbuf_cache, 0, mbuf_size,
1404                                             rte_socket_id());
1405         if (mbuf_pool == NULL)
1406                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1407 }
1408
1409 /*
1410  * Main function, does initialisation and calls the per-lcore functions.
1411  */
1412 int
1413 main(int argc, char *argv[])
1414 {
1415         unsigned lcore_id, core_id = 0;
1416         unsigned nb_ports, valid_num_ports;
1417         int ret, i;
1418         uint16_t portid;
1419         static pthread_t tid;
1420         uint64_t flags = 0;
1421
1422         signal(SIGINT, sigint_handler);
1423
1424         /* init EAL */
1425         ret = rte_eal_init(argc, argv);
1426         if (ret < 0)
1427                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1428         argc -= ret;
1429         argv += ret;
1430
1431         /* parse app arguments */
1432         ret = us_vhost_parse_args(argc, argv);
1433         if (ret < 0)
1434                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1435
1436         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1437                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1438
1439                 if (rte_lcore_is_enabled(lcore_id))
1440                         lcore_ids[core_id++] = lcore_id;
1441         }
1442
1443         if (rte_lcore_count() > RTE_MAX_LCORE)
1444                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1445
1446         /* Get the number of physical ports. */
1447         nb_ports = rte_eth_dev_count_avail();
1448
1449         /*
1450          * Update the global var num_ports and the global array ports,
1451          * and get the value of valid_num_ports according to the number of system ports.
1452          */
1453         valid_num_ports = check_ports_num(nb_ports);
1454
1455         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1456                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1457                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1458                 return -1;
1459         }
1460
1461         /*
1462          * FIXME: here we are trying to allocate mbufs big enough for
1463          * @MAX_QUEUES, but the truth is we're never going to use that
1464          * many queues here. We probably should only do allocation for
1465          * those queues we are going to use.
1466          */
1467         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1468                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1469
1470         if (vm2vm_mode == VM2VM_HARDWARE) {
1471                 /* Enable VT loop back to let L2 switch to do it. */
1472                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1473                 RTE_LOG(DEBUG, VHOST_CONFIG,
1474                         "Enable loop back for L2 switch in vmdq.\n");
1475         }
1476
1477         /* initialize all ports */
1478         RTE_ETH_FOREACH_DEV(portid) {
1479                 /* skip ports that are not enabled */
1480                 if ((enabled_port_mask & (1 << portid)) == 0) {
1481                         RTE_LOG(INFO, VHOST_PORT,
1482                                 "Skipping disabled port %d\n", portid);
1483                         continue;
1484                 }
1485                 if (port_init(portid) != 0)
1486                         rte_exit(EXIT_FAILURE,
1487                                 "Cannot initialize network ports\n");
1488         }
1489
1490         /* Enable stats if the user option is set. */
1491         if (enable_stats) {
1492                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1493                                         print_stats, NULL);
1494                 if (ret < 0)
1495                         rte_exit(EXIT_FAILURE,
1496                                 "Cannot create print-stats thread\n");
1497         }
1498
1499         /* Launch all data cores. */
1500         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1501                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1502
1503         if (client_mode)
1504                 flags |= RTE_VHOST_USER_CLIENT;
1505
1506         if (dequeue_zero_copy)
1507                 flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1508
1509         /* Register vhost user driver to handle vhost messages. */
1510         for (i = 0; i < nb_sockets; i++) {
1511                 char *file = socket_files + i * PATH_MAX;
1512                 ret = rte_vhost_driver_register(file, flags);
1513                 if (ret != 0) {
1514                         unregister_drivers(i);
1515                         rte_exit(EXIT_FAILURE,
1516                                 "vhost driver register failure.\n");
1517                 }
1518
1519                 if (builtin_net_driver)
1520                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1521
1522                 if (mergeable == 0) {
1523                         rte_vhost_driver_disable_features(file,
1524                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1525                 }
1526
1527                 if (enable_tx_csum == 0) {
1528                         rte_vhost_driver_disable_features(file,
1529                                 1ULL << VIRTIO_NET_F_CSUM);
1530                 }
1531
1532                 if (enable_tso == 0) {
1533                         rte_vhost_driver_disable_features(file,
1534                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1535                         rte_vhost_driver_disable_features(file,
1536                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1537                         rte_vhost_driver_disable_features(file,
1538                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1539                         rte_vhost_driver_disable_features(file,
1540                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1541                 }
1542
1543                 if (promiscuous) {
1544                         rte_vhost_driver_enable_features(file,
1545                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1546                 }
1547
1548                 ret = rte_vhost_driver_callback_register(file,
1549                         &virtio_net_device_ops);
1550                 if (ret != 0) {
1551                         rte_exit(EXIT_FAILURE,
1552                                 "failed to register vhost driver callbacks.\n");
1553                 }
1554
1555                 if (rte_vhost_driver_start(file) < 0) {
1556                         rte_exit(EXIT_FAILURE,
1557                                 "failed to start vhost driver.\n");
1558                 }
1559         }
1560
1561         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1562                 rte_eal_wait_lcore(lcore_id);
1563
1564         return 0;
1565
1566 }