examples/vhost: fix sending ARP packet to self
examples/vhost/main.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
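/*
 * The modes above, as used in the rest of this file (see virtio_tx_route()
 * and main()): VM2VM_SOFTWARE looks the destination MAC up in the host's
 * own device list and enqueues directly to the peer's virtio RX ring,
 * roughly:
 *
 *     dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
 *     virtio_xmit(dst_vdev, vdev, m);
 *
 * VM2VM_HARDWARE instead sends the packet to the NIC with VMDQ loopback
 * enabled, so the NIC's embedded L2 switch forwards it back to the
 * destination pool.
 */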
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;
static int dequeue_zero_copy;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * This is needed for 1G NICs such as the I350; it fixes a
                 * bug where IPv4 forwarding in the guest could not forward
                 * packets from one virtio device to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 1, /**< CRC stripped by hardware */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * should be overridden separately in code with
                 * appropriate values
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
        TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
                                 / US_PER_S * BURST_TX_DRAIN_US)
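/*
 * Worked example of the macro above (numbers are illustrative, assuming a
 * 2 GHz TSC): cycles per microsecond are computed with a round-up division,
 *
 *     (2000000000 + 1000000 - 1) / 1000000 = 2000 cycles/us
 *
 * so MBUF_TABLE_DRAIN_TSC = 2000 * 100 = 200000 cycles, i.e. the TX queue
 * in drain_mbuf_table() is flushed roughly every 100 us.
 */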
#define VLAN_HLEN       4

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        struct rte_eth_vmdq_rx_conf *def_conf =
                &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back = def_conf->enable_loop_back;
        conf.rx_mode = def_conf->rx_mode;

        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}
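/*
 * For example, with num_devices = 2 the loop above produces the map
 *
 *     pool_map[0] = { .vlan_id = 1000, .pools = 0x1 }
 *     pool_map[1] = { .vlan_id = 1001, .pools = 0x2 }
 *
 * i.e. VLAN 1000 is steered to pool 0 and VLAN 1001 to pool 1, one pool
 * per virtio device.
 */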

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        struct rte_eth_rxconf *rxconf;
        struct rte_eth_txconf *txconf;
        int16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
        rte_eth_dev_info_get(port, &dev_info);

        if (dev_info.max_rx_queues > MAX_QUEUES) {
                rte_exit(EXIT_FAILURE,
                        "please define MAX_QUEUES no less than %u in %s\n",
                        dev_info.max_rx_queues, __FILE__);
        }

        rxconf = &dev_info.default_rxconf;
        txconf = &dev_info.default_txconf;
        rxconf->rx_drop_en = 1;

        /* Enable vlan offload */
        txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

        /* Configure the number of supported virtio devices based on VMDQ limits */
        num_devices = dev_info.max_vmdq_pools;

        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

        /*
         * When dequeue zero copy is enabled, guest Tx used vring will be
         * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
         * (tx_ring_size here) must be small enough so that the driver will
         * hit the free threshold easily and free mbufs timely. Otherwise,
         * guest Tx vring would be starved.
         */
        if (dequeue_zero_copy)
                tx_ring_size = 64;

        tx_rings = (uint16_t)rte_lcore_count();

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;
        /* NIC queues are divided into pf queues and vmdq queues. */
        num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
        queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
        num_vmdq_queues = num_devices * queues_per_pool;
        num_queues = num_pf_queues + num_vmdq_queues;
        vmdq_queue_base = dev_info.vmdq_queue_base;
        vmdq_pool_base  = dev_info.vmdq_pool_base;
        printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
                num_pf_queues, num_devices, queues_per_pool);
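        /*
         * Illustrative layout (the numbers depend on the NIC; these assume
         * a device reporting max_rx_queues = 128, vmdq_queue_num = 128,
         * max_vmdq_pools = 64 and vmdq_queue_base = 0): num_pf_queues = 0,
         * queues_per_pool = 2, so device N receives on NIC queue
         * vmdq_queue_base + N * queues_per_pool (see new_device()).
         */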

        if (port >= rte_eth_dev_count())
                return -1;

        rx_rings = (uint16_t)dev_info.max_rx_queues;
        /* Configure ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0) {
                RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
                        port, strerror(-retval));
                return retval;
        }

        retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
                &tx_ring_size);
        if (retval != 0) {
                RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
                        "for port %u: %s.\n", port, strerror(-retval));
                return retval;
        }
        if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
                RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
                        "for Rx queues on port %u.\n", port);
                return -1;
        }

        /* Setup the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                rxconf,
                                                mbuf_pool);
                if (retval < 0) {
                        RTE_LOG(ERR, VHOST_PORT,
                                "Failed to setup rx queue %u of port %u: %s.\n",
                                q, port, strerror(-retval));
                        return retval;
                }
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                txconf);
                if (retval < 0) {
                        RTE_LOG(ERR, VHOST_PORT,
                                "Failed to setup tx queue %u of port %u: %s.\n",
                                q, port, strerror(-retval));
                        return retval;
                }
        }

        /* Start the device. */
        retval = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
                        port, strerror(-retval));
                return retval;
        }

        if (promiscuous)
                rte_eth_promiscuous_enable(port);

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
        char *tmp;

        /* Reject paths that do not fit in PATH_MAX. */
        if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
                return -1;

        tmp = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
        if (tmp == NULL)
                return -1;

        socket_files = tmp;
        snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
        nb_sockets++;

        return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (pm == 0)
                return -1;

        return pm;
}
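/*
 * Example: "-p 0x3" yields pm = 3, enabling ports 0 and 1; "-p 0" and
 * non-hex input both return -1.
 */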

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --socket-file <path>\n"
        "               --nb-devices ND\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if retries on rx are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on rx. This takes effect only if retries on rx are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --socket-file: The path of the socket file.\n"
        "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
        "               --tso [0|1] disable/enable TCP segmentation offload.\n"
        "               --client register a vhost-user socket as client mode.\n"
        "               --dequeue-zero-copy enables dequeue zero copy\n",
               prgname);
}
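/*
 * An illustrative invocation (the core list, port mask and socket path are
 * examples only):
 *
 *     ./vhost-switch -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *             --stats 1 --mergeable 0
 *
 * This enables port 0, registers one vhost-user socket at /tmp/sock0,
 * prints stats every second and disables mergeable RX buffers.
 */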

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"socket-file", required_argument, NULL, 0},
                {"tx-csum", required_argument, NULL, 0},
                {"tso", required_argument, NULL, 0},
                {"client", no_argument, &client_mode, 1},
                {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
                {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:P",
                        long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        ret = parse_portmask(optarg);
                        if (ret == -1) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        enabled_port_mask = ret;
                        break;

                case 'P':
                        promiscuous = 1;
                        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
                                ETH_VMDQ_ACCEPT_BROADCAST |
                                ETH_VMDQ_ACCEPT_MULTICAST;

                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Enable/disable TX checksum offload. */
                        if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tx_csum = ret;
                        }

                        /* Enable/disable TSO offload. */
                        if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tso = ret;
                        }

                        /* Specify the retry delay time (in microseconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        mergeable = !!ret;
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                        }
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for stats [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set socket file path. */
                        if (!strncmp(long_option[option_index].name,
                                                "socket-file", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_socket_path(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for socket name (Max %d characters)\n",
                                        PATH_MAX);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        break;

                        /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        return 0;
}

/*
 * Update the global variable num_ports and the array ports according to the
 * number of system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct ether_addr *mac)
{
        struct vhost_dev *vdev;

        TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
                if (vdev->ready == DEVICE_RX &&
                    is_same_ether_addr(mac, &vdev->mac_address))
                        return vdev;
        }

        return NULL;
}

/*
 * This function learns the MAC address of the device and registers it along
 * with a VLAN tag into the VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        int i, ret;

        /* Learn MAC address of guest device from packet */
        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        if (find_vhost_dev(&pkt_hdr->s_addr)) {
                RTE_LOG(ERR, VHOST_DATA,
                        "(%d) device is using a registered MAC!\n",
                        vdev->vid);
                return -1;
        }

        for (i = 0; i < ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* vlan_tag currently uses the device_id. */
        vdev->vlan_tag = vlan_tags[vdev->vid];

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA,
                "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
                vdev->vid,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
                                (uint32_t)vdev->vid + vmdq_pool_base);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA,
                        "(%d) failed to add device MAC address to VMDQ\n",
                        vdev->vid);

        rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}
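/*
 * MAC learning flow in one picture (a summary of the function above, as
 * driven from drain_virtio_tx()): the first packet a guest transmits is
 * used to learn its source MAC,
 *
 *     guest TX -> link_vmdq(): copy pkt_hdr->s_addr into vdev->mac_address,
 *     program it into the NIC VMDQ filter for pool (vid + vmdq_pool_base),
 *     then mark the device DEVICE_RX so drain_eth_rx() starts polling it.
 */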

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* Clear MAC and VLAN settings. */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < 6; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers. */
                rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}

static __rte_always_inline void
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
            struct rte_mbuf *m)
{
        uint16_t ret;

        if (builtin_net_driver) {
                ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
        } else {
                ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
        }

        if (enable_stats) {
                rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
                rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
                src_vdev->stats.tx_total++;
                src_vdev->stats.tx += ret;
        }
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct vhost_dev *dst_vdev;

        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
        if (!dst_vdev)
                return -1;

        if (vdev->vid == dst_vdev->vid) {
                RTE_LOG_DP(DEBUG, VHOST_DATA,
                        "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
                        vdev->vid);
                return 0;
        }

        RTE_LOG_DP(DEBUG, VHOST_DATA,
                "(%d) TX: MAC address is local\n", dst_vdev->vid);

        if (unlikely(dst_vdev->remove)) {
                RTE_LOG_DP(DEBUG, VHOST_DATA,
                        "(%d) device is marked for removal\n", dst_vdev->vid);
                return 0;
        }

        virtio_xmit(dst_vdev, vdev, m);
        return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM, and if it
 * does, get its VLAN tag and the length offset.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
        uint32_t *offset, uint16_t *vlan_tag)
{
        struct vhost_dev *dst_vdev;
        struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
        if (!dst_vdev)
                return 0;

        if (vdev->vid == dst_vdev->vid) {
                RTE_LOG_DP(DEBUG, VHOST_DATA,
                        "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
                        vdev->vid);
                return -1;
        }

        /*
         * HW vlan strip will reduce the packet length by the length of the
         * vlan tag, so we need to restore the packet length by adding it
         * back.
         */
        *offset  = VLAN_HLEN;
        *vlan_tag = vlan_tags[vdev->vid];

        RTE_LOG_DP(DEBUG, VHOST_DATA,
                "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
                vdev->vid, dst_vdev->vid, *vlan_tag);

        return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
        if (ol_flags & PKT_TX_IPV4)
                return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
        else /* assume ethertype == ETHER_TYPE_IPv6 */
                return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
        void *l3_hdr;
        struct ipv4_hdr *ipv4_hdr = NULL;
        struct tcp_hdr *tcp_hdr = NULL;
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        l3_hdr = (char *)eth_hdr + m->l2_len;

        if (m->ol_flags & PKT_TX_IPV4) {
                ipv4_hdr = l3_hdr;
                ipv4_hdr->hdr_checksum = 0;
                m->ol_flags |= PKT_TX_IP_CKSUM;
        }

        tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
        tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}
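/*
 * Background for the helper above: when PKT_TX_TCP_SEG is set, the DPDK
 * mbuf offload convention is that the IPv4 header checksum is zeroed (the
 * NIC fills it in) and the TCP checksum field holds only the pseudo-header
 * checksum, which the NIC completes while segmenting. That is exactly what
 * virtio_tx_offload() prepares before the packet reaches
 * rte_eth_tx_burst().
 */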

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
        while (n--)
                rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
        uint16_t count;

        count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
                                 tx_q->m_table, tx_q->len);
        if (unlikely(count < tx_q->len))
                free_pkts(&tx_q->m_table[count], tx_q->len - count);

        tx_q->len = 0;
}
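/*
 * Note on the pattern above: rte_eth_tx_burst() may accept fewer packets
 * than requested (e.g. when the TX ring is full), so the canonical handling
 * is to free the unsent tail rather than leak it:
 *
 *     sent = rte_eth_tx_burst(port, queue, pkts, n);
 *     if (unlikely(sent < n))
 *             free_pkts(&pkts[sent], n - sent);
 */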

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
        struct mbuf_table *tx_q;
        unsigned offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct ether_hdr *nh;

        nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
        if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
                struct vhost_dev *vdev2;

                TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
                        if (vdev2 != vdev)
                                virtio_xmit(vdev2, vdev, m);
                }
                goto queue2nic;
        }

        /* Check if destination is a local VM. */
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
                rte_pktmbuf_free(m);
                return;
        }

        if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                if (unlikely(find_local_dest(vdev, m, &offset,
                                             &vlan_tag) != 0)) {
                        rte_pktmbuf_free(m);
                        return;
                }
        }

        RTE_LOG_DP(DEBUG, VHOST_DATA,
                "(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

        /* Add packet to the port tx queue. */
        tx_q = &lcore_tx_queue[lcore_id];

        nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
        if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
                /* Guest has inserted the vlan tag. */
                struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
                uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
                if ((vm2vm_mode == VM2VM_HARDWARE) &&
                        (vh->vlan_tci != vlan_tag_be))
                        vh->vlan_tci = vlan_tag_be;
        } else {
                m->ol_flags |= PKT_TX_VLAN_PKT;

                /*
                 * Find the right seg to adjust the data len when offset is
                 * bigger than tail room size.
                 */
                if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                        if (likely(offset <= rte_pktmbuf_tailroom(m)))
                                m->data_len += offset;
                        else {
                                struct rte_mbuf *seg = m;

                                while ((seg->next != NULL) &&
                                        (offset > rte_pktmbuf_tailroom(seg)))
                                        seg = seg->next;

                                seg->data_len += offset;
                        }
                        m->pkt_len += offset;
                }

                m->vlan_tci = vlan_tag;
        }

        if (m->ol_flags & PKT_TX_TCP_SEG)
                virtio_tx_offload(m);

        tx_q->m_table[tx_q->len++] = m;
        if (enable_stats) {
                vdev->stats.tx_total++;
                vdev->stats.tx++;
        }

        if (unlikely(tx_q->len == MAX_PKT_BURST))
                do_drain_mbuf_table(tx_q);
}

static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
        static uint64_t prev_tsc;
        uint64_t cur_tsc;

        if (tx_q->len == 0)
                return;

        cur_tsc = rte_rdtsc();
        if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
                prev_tsc = cur_tsc;

                RTE_LOG_DP(DEBUG, VHOST_DATA,
                        "TX queue drained after timeout with burst size %u\n",
                        tx_q->len);
                do_drain_mbuf_table(tx_q);
        }
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
        uint16_t rx_count, enqueue_count;
        struct rte_mbuf *pkts[MAX_PKT_BURST];

        rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
                                    pkts, MAX_PKT_BURST);
        if (!rx_count)
                return;

        /*
         * When "enable_retry" is set, we wait and retry here when there are
         * not enough free slots in the queue to hold @rx_count packets,
         * to diminish packet loss.
         */
        if (enable_retry &&
            unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
                        VIRTIO_RXQ))) {
                uint32_t retry;

                for (retry = 0; retry < burst_rx_retry_num; retry++) {
                        rte_delay_us(burst_rx_delay_time);
                        if (rx_count <= rte_vhost_avail_entries(vdev->vid,
                                        VIRTIO_RXQ))
                                break;
                }
        }

        if (builtin_net_driver) {
                enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
                                                pkts, rx_count);
        } else {
                enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
                                                pkts, rx_count);
        }
        if (enable_stats) {
                rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
                rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
        }

        free_pkts(pkts, rx_count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
        struct rte_mbuf *pkts[MAX_PKT_BURST];
        uint16_t count;
        uint16_t i;

        if (builtin_net_driver) {
                count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
                                        pkts, MAX_PKT_BURST);
        } else {
                count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
                                        mbuf_pool, pkts, MAX_PKT_BURST);
        }

        /* setup VMDq for the first packet */
        if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
                if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
                        /*
                         * Free the burst and return; routing the packets
                         * after freeing them would be a use after free.
                         */
                        free_pkts(pkts, count);
                        return;
                }
        }

        for (i = 0; i < count; ++i)
                virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or the
 *      physical eth dev. The routing is done in the function
 *      "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
        unsigned i;
        unsigned lcore_id = rte_lcore_id();
        struct vhost_dev *vdev;
        struct mbuf_table *tx_q;

        RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

        tx_q = &lcore_tx_queue[lcore_id];
        for (i = 0; i < rte_lcore_count(); i++) {
                if (lcore_ids[i] == lcore_id) {
                        tx_q->txq_id = i;
                        break;
                }
        }

        while (1) {
                drain_mbuf_table(tx_q);

                /*
                 * Inform the configuration core that we have exited the
                 * linked list and that no devices are in use if requested.
                 */
                if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
                        lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

                /*
                 * Process vhost devices
                 */
                TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
                              lcore_vdev_entry) {
                        if (unlikely(vdev->remove)) {
                                unlink_vmdq(vdev);
                                vdev->ready = DEVICE_SAFE_REMOVE;
                                continue;
                        }

                        if (likely(vdev->ready == DEVICE_RX))
                                drain_eth_rx(vdev);

                        if (likely(!vdev->remove))
                                drain_virtio_tx(vdev);
                }
        }

        return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. The device is made volatile here to avoid
 * re-ordering of dev->remove=1, which can cause an infinite loop in the
 * rte_pause loop.
 */
static void
destroy_device(int vid)
{
        struct vhost_dev *vdev = NULL;
        int lcore;

        TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
                if (vdev->vid == vid)
                        break;
        }
        if (!vdev)
                return;
        /* Set the remove flag. */
        vdev->remove = 1;
        while (vdev->ready != DEVICE_SAFE_REMOVE) {
                rte_pause();
        }

        if (builtin_net_driver)
                vs_vhost_net_remove(vdev);

        TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
                     lcore_vdev_entry);
        TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

        /* Set the dev_removal_flag on each lcore. */
        RTE_LCORE_FOREACH_SLAVE(lcore)
                lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

        /*
         * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
         * we can be sure that they can no longer access the device removed
         * from the linked lists and that the devices are no longer in use.
         */
        RTE_LCORE_FOREACH_SLAVE(lcore) {
                while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
                        rte_pause();
        }

        lcore_info[vdev->coreid].device_num--;

        RTE_LOG(INFO, VHOST_DATA,
                "(%d) device has been removed from data core\n",
                vdev->vid);

        rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
        int lcore, core_add = 0;
        uint32_t device_num_min = num_devices;
        struct vhost_dev *vdev;

        vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
        if (vdev == NULL) {
                RTE_LOG(INFO, VHOST_DATA,
                        "(%d) couldn't allocate memory for vhost dev\n",
                        vid);
                return -1;
        }
        vdev->vid = vid;

        if (builtin_net_driver)
                vs_vhost_net_setup(vdev);

        TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
        vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

        /* Reset the ready flag. */
        vdev->ready = DEVICE_MAC_LEARNING;
        vdev->remove = 0;

        /* Find a suitable lcore to add the device. */
        RTE_LCORE_FOREACH_SLAVE(lcore) {
                if (lcore_info[lcore].device_num < device_num_min) {
                        device_num_min = lcore_info[lcore].device_num;
                        core_add = lcore;
                }
        }
        vdev->coreid = core_add;

        TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
                          lcore_vdev_entry);
        lcore_info[vdev->coreid].device_num++;

        /* Disable notifications. */
        rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
        rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

        RTE_LOG(INFO, VHOST_DATA,
                "(%d) device has been added to data core %d\n",
                vid, vdev->coreid);

        return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
        .new_device =  new_device,
        .destroy_device = destroy_device,
};

/*
 * This is a thread that wakes up periodically to print stats if the user
 * has enabled them. The signature matches what pthread_create() expects,
 * so no function pointer cast is needed at the call site.
 */
static void *
print_stats(__rte_unused void *arg)
{
        struct vhost_dev *vdev;
        uint64_t tx_dropped, rx_dropped;
        uint64_t tx, tx_total, rx, rx_total;
        const char clr[] = { 27, '[', '2', 'J', '\0' };
        const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

        while (1) {
                sleep(enable_stats);

                /* Clear screen and move to top left */
                printf("%s%s\n", clr, top_left);
                printf("Device statistics =================================\n");

                TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
                        tx_total   = vdev->stats.tx_total;
                        tx         = vdev->stats.tx;
                        tx_dropped = tx_total - tx;

                        rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
                        rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
                        rx_dropped = rx_total - rx;

                        printf("Statistics for device %d\n"
                                "-----------------------\n"
                                "TX total:              %" PRIu64 "\n"
                                "TX dropped:            %" PRIu64 "\n"
                                "TX successful:         %" PRIu64 "\n"
                                "RX total:              %" PRIu64 "\n"
                                "RX dropped:            %" PRIu64 "\n"
                                "RX successful:         %" PRIu64 "\n",
                                vdev->vid,
                                tx_total, tx_dropped, tx,
                                rx_total, rx_dropped, rx);
                }

                printf("===================================================\n");
        }

        return NULL;
}

static void
unregister_drivers(int socket_num)
{
        int i, ret;

        for (i = 0; i < socket_num; i++) {
                ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
                if (ret != 0)
                        RTE_LOG(ERR, VHOST_CONFIG,
                                "Fail to unregister vhost driver for %s.\n",
                                socket_files + i * PATH_MAX);
        }
}

/* When we receive an INT signal, unregister the vhost driver. */
static void
sigint_handler(__rte_unused int signum)
{
        /* Unregister vhost driver. */
        unregister_drivers(nb_sockets);

        exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching), we
 *   also need to make some reservation for receiving the packets from the
 *   virtio Tx queue. How many is enough depends on the usage. It's
 *   normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
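/*
 * A worked example of the sizing below (assuming the defaults in this
 * file, MAX_PKT_BURST = 32 from main.h, 1 port and 3 switch cores, with
 * mergeable and TSO off so mtu = 1500 and mbuf_size = 2176):
 *
 *     nr_mbufs_per_core  = (1500 + 2176) * 32 / (2176 - 128) = 57
 *     nr_mbufs_per_core += 1024                ->  1081
 *     nr_mbufs = 128 * 1024 + 1081 * 3         ->  134315 mbufs in total
 */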
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
        uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
        uint32_t nr_mbufs;
        uint32_t nr_mbufs_per_core;
        uint32_t mtu = 1500;

        if (mergeable)
                mtu = 9000;
        if (enable_tso)
                mtu = 64 * 1024;

        nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
                        (mbuf_size - RTE_PKTMBUF_HEADROOM);
        nr_mbufs_per_core += nr_rx_desc;
        nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

        nr_mbufs  = nr_queues * nr_rx_desc;
        nr_mbufs += nr_mbufs_per_core * nr_switch_core;
        nr_mbufs *= nr_port;

        mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
                                            nr_mbuf_cache, 0, mbuf_size,
                                            rte_socket_id());
        if (mbuf_pool == NULL)
                rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
        unsigned lcore_id, core_id = 0;
        unsigned nb_ports, valid_num_ports;
        int ret, i;
        uint16_t portid;
        static pthread_t tid;
        char thread_name[RTE_MAX_THREAD_NAME_LEN];
        uint64_t flags = 0;

        signal(SIGINT, sigint_handler);

        /* init EAL */
        ret = rte_eal_init(argc, argv);
        if (ret < 0)
                rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
        argc -= ret;
        argv += ret;

        /* parse app arguments */
        ret = us_vhost_parse_args(argc, argv);
        if (ret < 0)
                rte_exit(EXIT_FAILURE, "Invalid argument\n");

        for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
                TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

                if (rte_lcore_is_enabled(lcore_id))
                        lcore_ids[core_id++] = lcore_id;
        }

        if (rte_lcore_count() > RTE_MAX_LCORE)
                rte_exit(EXIT_FAILURE, "Not enough cores\n");

        /* Get the number of physical ports. */
        nb_ports = rte_eth_dev_count();

        /*
         * Update the global variable num_ports and the global array ports,
         * and get the value of valid_num_ports according to the number of
         * system ports.
         */
        valid_num_ports = check_ports_num(nb_ports);

        if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        /*
         * FIXME: here we are trying to allocate mbufs big enough for
         * @MAX_QUEUES, but the truth is we're never going to use that
         * many queues here. We probably should only do allocation for
         * those queues we are going to use.
         */
        create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
                         MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

        if (vm2vm_mode == VM2VM_HARDWARE) {
                /* Enable VT loop back to let the L2 switch do it. */
                vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
                RTE_LOG(DEBUG, VHOST_CONFIG,
                        "Enable loop back for L2 switch in vmdq.\n");
        }

        /* initialize all ports */
        for (portid = 0; portid < nb_ports; portid++) {
                /* skip ports that are not enabled */
                if ((enabled_port_mask & (1 << portid)) == 0) {
                        RTE_LOG(INFO, VHOST_PORT,
                                "Skipping disabled port %d\n", portid);
                        continue;
                }
                if (port_init(portid) != 0)
                        rte_exit(EXIT_FAILURE,
                                "Cannot initialize network ports\n");
        }

        /* Enable stats if the user option is set. */
        if (enable_stats) {
                ret = pthread_create(&tid, NULL, print_stats, NULL);
                if (ret != 0)
                        rte_exit(EXIT_FAILURE,
                                "Cannot create print-stats thread\n");

                /* Set thread_name for aid in debugging. */
                snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
                ret = rte_thread_setname(tid, thread_name);
                if (ret != 0)
                        RTE_LOG(DEBUG, VHOST_CONFIG,
                                "Cannot set print-stats name\n");
        }

        /* Launch all data cores. */
        RTE_LCORE_FOREACH_SLAVE(lcore_id)
                rte_eal_remote_launch(switch_worker, NULL, lcore_id);

        if (client_mode)
                flags |= RTE_VHOST_USER_CLIENT;

        if (dequeue_zero_copy)
                flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;

        /* Register vhost user driver to handle vhost messages. */
        for (i = 0; i < nb_sockets; i++) {
                char *file = socket_files + i * PATH_MAX;
                ret = rte_vhost_driver_register(file, flags);
                if (ret != 0) {
                        unregister_drivers(i);
                        rte_exit(EXIT_FAILURE,
                                "vhost driver register failure.\n");
                }

                if (builtin_net_driver)
                        rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

                if (mergeable == 0) {
                        rte_vhost_driver_disable_features(file,
                                1ULL << VIRTIO_NET_F_MRG_RXBUF);
                }

                if (enable_tx_csum == 0) {
                        rte_vhost_driver_disable_features(file,
                                1ULL << VIRTIO_NET_F_CSUM);
                }

                if (enable_tso == 0) {
                        rte_vhost_driver_disable_features(file,
                                1ULL << VIRTIO_NET_F_HOST_TSO4);
                        rte_vhost_driver_disable_features(file,
                                1ULL << VIRTIO_NET_F_HOST_TSO6);
                        rte_vhost_driver_disable_features(file,
                                1ULL << VIRTIO_NET_F_GUEST_TSO4);
                        rte_vhost_driver_disable_features(file,
                                1ULL << VIRTIO_NET_F_GUEST_TSO6);
                }

                if (promiscuous) {
                        rte_vhost_driver_enable_features(file,
                                1ULL << VIRTIO_NET_F_CTRL_RX);
                }

                ret = rte_vhost_driver_callback_register(file,
                        &virtio_net_device_ops);
                if (ret != 0) {
                        rte_exit(EXIT_FAILURE,
                                "failed to register vhost driver callbacks.\n");
                }

                if (rte_vhost_driver_start(file) < 0) {
                        rte_exit(EXIT_FAILURE,
                                "failed to start vhost driver.\n");
                }
        }

        RTE_LCORE_FOREACH_SLAVE(lcore_id)
                rte_eal_wait_lcore(lcore_id);

        return 0;
}