examples/vhost: remove unnecessary method and constant
[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66
67 /* Number of devices/queues to support */
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76         VM2VM_DISABLED = 0,
77         VM2VM_SOFTWARE = 1,
78         VM2VM_HARDWARE = 2,
79         VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93
94 static int client_mode;
95 static int dequeue_zero_copy;
96
97 static int builtin_net_driver;
98
99 /* Specify timeout (in microseconds) between retries on RX. */
100 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
101 /* Specify the number of retries on RX. */
102 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
103
104 /* Socket file paths. Can be set by user */
105 static char *socket_files;
106 static int nb_sockets;
107
108 /* Empty VMDQ configuration structure. Filled in programmatically. */
109 static struct rte_eth_conf vmdq_conf_default = {
110         .rxmode = {
111                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
112                 .split_hdr_size = 0,
113                 /*
114                  * VLAN strip is necessary for 1G NICs such as the I350;
115                  * it fixes a bug where IPv4 forwarding in the guest could not
116                  * forward packets from one virtio dev to another virtio dev.
117                  */
118                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
119         },
120
121         .txmode = {
122                 .mq_mode = ETH_MQ_TX_NONE,
123                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
124                              DEV_TX_OFFLOAD_TCP_CKSUM |
125                              DEV_TX_OFFLOAD_VLAN_INSERT |
126                              DEV_TX_OFFLOAD_MULTI_SEGS |
127                              DEV_TX_OFFLOAD_TCP_TSO),
128         },
129         .rx_adv_conf = {
130                 /*
131                  * should be overridden separately in code with
132                  * appropriate values
133                  */
134                 .vmdq_rx_conf = {
135                         .nb_queue_pools = ETH_8_POOLS,
136                         .enable_default_pool = 0,
137                         .default_pool = 0,
138                         .nb_pool_maps = 0,
139                         .pool_map = {{0, 0},},
140                 },
141         },
142 };
143
144
145 static unsigned lcore_ids[RTE_MAX_LCORE];
146 static uint16_t ports[RTE_MAX_ETHPORTS];
147 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
148 static uint16_t num_pf_queues, num_vmdq_queues;
149 static uint16_t vmdq_pool_base, vmdq_queue_base;
150 static uint16_t queues_per_pool;
151
152 const uint16_t vlan_tags[] = {
153         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
154         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
155         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
156         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
157         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
158         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
159         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
160         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
161 };
162
163 /* ethernet addresses of ports */
164 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
165
166 static struct vhost_dev_tailq_list vhost_dev_list =
167         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
168
169 static struct lcore_info lcore_info[RTE_MAX_LCORE];
170
171 /* Used for queueing bursts of TX packets. */
172 struct mbuf_table {
173         unsigned len;
174         unsigned txq_id;
175         struct rte_mbuf *m_table[MAX_PKT_BURST];
176 };
177
178 /* TX queue for each data core. */
179 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
180
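/* TSC cycles that correspond to BURST_TX_DRAIN_US microseconds. */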
181 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
182                                  / US_PER_S * BURST_TX_DRAIN_US)
183 #define VLAN_HLEN       4
184
185 /*
186  * Builds up the correct configuration for VMDQ VLAN pool map
187  * according to the pool & queue limits.
188  */
189 static inline int
190 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
191 {
192         struct rte_eth_vmdq_rx_conf conf;
193         struct rte_eth_vmdq_rx_conf *def_conf =
194                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
195         unsigned i;
196
197         memset(&conf, 0, sizeof(conf));
198         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
199         conf.nb_pool_maps = num_devices;
200         conf.enable_loop_back = def_conf->enable_loop_back;
201         conf.rx_mode = def_conf->rx_mode;
202
203         for (i = 0; i < conf.nb_pool_maps; i++) {
204                 conf.pool_map[i].vlan_id = vlan_tags[i];
205                 conf.pool_map[i].pools = (1UL << i);
206         }
207
208         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
209         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
210                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
211         return 0;
212 }
213
214 /*
215  * Initialises a given port using global settings, with the Rx buffers
216  * coming from the mbuf_pool passed as a parameter.
217  */
218 static inline int
219 port_init(uint16_t port)
220 {
221         struct rte_eth_dev_info dev_info;
222         struct rte_eth_conf port_conf;
223         struct rte_eth_rxconf *rxconf;
224         struct rte_eth_txconf *txconf;
225         int16_t rx_rings, tx_rings;
226         uint16_t rx_ring_size, tx_ring_size;
227         int retval;
228         uint16_t q;
229
230         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
231         rte_eth_dev_info_get(port, &dev_info);
232
233         rxconf = &dev_info.default_rxconf;
234         txconf = &dev_info.default_txconf;
235         rxconf->rx_drop_en = 1;
236
237         /* Configure the number of supported virtio devices based on VMDQ limits. */
238         num_devices = dev_info.max_vmdq_pools;
239
240         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
241         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
242
243         /*
244          * When dequeue zero copy is enabled, guest Tx used vring will be
245          * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
246          * (tx_ring_size here) must be small enough so that the driver will
247          * hit the free threshold easily and free mbufs timely. Otherwise,
248          * guest Tx vring would be starved.
249          */
250         if (dequeue_zero_copy)
251                 tx_ring_size = 64;
252
253         tx_rings = (uint16_t)rte_lcore_count();
254
255         /* Get port configuration. */
256         retval = get_eth_conf(&port_conf, num_devices);
257         if (retval < 0)
258                 return retval;
259         /* NIC queues are divided into pf queues and vmdq queues.  */
260         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
261         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
262         num_vmdq_queues = num_devices * queues_per_pool;
263         num_queues = num_pf_queues + num_vmdq_queues;
264         vmdq_queue_base = dev_info.vmdq_queue_base;
265         vmdq_pool_base  = dev_info.vmdq_pool_base;
266         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
267                 num_pf_queues, num_devices, queues_per_pool);
268
269         if (!rte_eth_dev_is_valid_port(port))
270                 return -1;
271
272         rx_rings = (uint16_t)dev_info.max_rx_queues;
273         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
274                 port_conf.txmode.offloads |=
275                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
276         /* Configure ethernet device. */
277         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
278         if (retval != 0) {
279                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
280                         port, strerror(-retval));
281                 return retval;
282         }
283
284         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
285                 &tx_ring_size);
286         if (retval != 0) {
287                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
288                         "for port %u: %s.\n", port, strerror(-retval));
289                 return retval;
290         }
291         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
292                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
293                         "for Rx queues on port %u.\n", port);
294                 return -1;
295         }
296
297         /* Setup the queues. */
298         rxconf->offloads = port_conf.rxmode.offloads;
299         for (q = 0; q < rx_rings; q ++) {
300                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
301                                                 rte_eth_dev_socket_id(port),
302                                                 rxconf,
303                                                 mbuf_pool);
304                 if (retval < 0) {
305                         RTE_LOG(ERR, VHOST_PORT,
306                                 "Failed to setup rx queue %u of port %u: %s.\n",
307                                 q, port, strerror(-retval));
308                         return retval;
309                 }
310         }
311         txconf->offloads = port_conf.txmode.offloads;
312         for (q = 0; q < tx_rings; q ++) {
313                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
314                                                 rte_eth_dev_socket_id(port),
315                                                 txconf);
316                 if (retval < 0) {
317                         RTE_LOG(ERR, VHOST_PORT,
318                                 "Failed to setup tx queue %u of port %u: %s.\n",
319                                 q, port, strerror(-retval));
320                         return retval;
321                 }
322         }
323
324         /* Start the device. */
325         retval  = rte_eth_dev_start(port);
326         if (retval < 0) {
327                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
328                         port, strerror(-retval));
329                 return retval;
330         }
331
332         if (promiscuous)
333                 rte_eth_promiscuous_enable(port);
334
335         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
336         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
337         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
338                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
339                         port,
340                         vmdq_ports_eth_addr[port].addr_bytes[0],
341                         vmdq_ports_eth_addr[port].addr_bytes[1],
342                         vmdq_ports_eth_addr[port].addr_bytes[2],
343                         vmdq_ports_eth_addr[port].addr_bytes[3],
344                         vmdq_ports_eth_addr[port].addr_bytes[4],
345                         vmdq_ports_eth_addr[port].addr_bytes[5]);
346
347         return 0;
348 }
349
350 /*
351  * Set socket file path.
352  */
353 static int
354 us_vhost_parse_socket_path(const char *q_arg)
355 {
356         /* reject socket paths that are too long to store */
357         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
358                 return -1;
359
360         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
361         snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
362         nb_sockets++;
363
364         return 0;
365 }
366
367 /*
368  * Parse the portmask provided at run time.
369  */
370 static int
371 parse_portmask(const char *portmask)
372 {
373         char *end = NULL;
374         unsigned long pm;
375
376         errno = 0;
377
378         /* parse hexadecimal string */
379         pm = strtoul(portmask, &end, 16);
380         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
381                 return -1;
382
383         if (pm == 0)
384                 return -1;
385
386         return pm;
387
388 }
389
390 /*
391  * Parse num options at run time.
392  */
393 static int
394 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
395 {
396         char *end = NULL;
397         unsigned long num;
398
399         errno = 0;
400
401         /* parse unsigned int string */
402         num = strtoul(q_arg, &end, 10);
403         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
404                 return -1;
405
406         if (num > max_valid_value)
407                 return -1;
408
409         return num;
410
411 }
412
413 /*
414  * Display usage
415  */
416 static void
417 us_vhost_usage(const char *prgname)
418 {
419         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
420         "               --vm2vm [0|1|2]\n"
421         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
422         "               --socket-file <path>\n"
423         "               --nb-devices ND\n"
424         "               -p PORTMASK: Set mask for ports to be used by application\n"
425         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
426         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
427         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
428         "               --rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
429         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
430         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
431         "               --socket-file: The path of the socket file.\n"
432         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
433         "               --tso [0|1] disable/enable TCP segmentation offload.\n"
434         "               --client register a vhost-user socket as client mode.\n"
435         "               --dequeue-zero-copy enables dequeue zero copy\n",
436                prgname);
437 }
438
439 /*
440  * Parse the arguments given in the command line of the application.
441  */
442 static int
443 us_vhost_parse_args(int argc, char **argv)
444 {
445         int opt, ret;
446         int option_index;
447         unsigned i;
448         const char *prgname = argv[0];
449         static struct option long_option[] = {
450                 {"vm2vm", required_argument, NULL, 0},
451                 {"rx-retry", required_argument, NULL, 0},
452                 {"rx-retry-delay", required_argument, NULL, 0},
453                 {"rx-retry-num", required_argument, NULL, 0},
454                 {"mergeable", required_argument, NULL, 0},
455                 {"stats", required_argument, NULL, 0},
456                 {"socket-file", required_argument, NULL, 0},
457                 {"tx-csum", required_argument, NULL, 0},
458                 {"tso", required_argument, NULL, 0},
459                 {"client", no_argument, &client_mode, 1},
460                 {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
461                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
462                 {NULL, 0, 0, 0},
463         };
464
465         /* Parse command line */
466         while ((opt = getopt_long(argc, argv, "p:P",
467                         long_option, &option_index)) != EOF) {
468                 switch (opt) {
469                 /* Portmask */
470                 case 'p':
471                         enabled_port_mask = parse_portmask(optarg);
472                         if (enabled_port_mask == 0) {
473                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
474                                 us_vhost_usage(prgname);
475                                 return -1;
476                         }
477                         break;
478
479                 case 'P':
480                         promiscuous = 1;
481                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
482                                 ETH_VMDQ_ACCEPT_BROADCAST |
483                                 ETH_VMDQ_ACCEPT_MULTICAST;
484
485                         break;
486
487                 case 0:
488                         /* Enable/disable vm2vm comms. */
489                         if (!strncmp(long_option[option_index].name, "vm2vm",
490                                 MAX_LONG_OPT_SZ)) {
491                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
492                                 if (ret == -1) {
493                                         RTE_LOG(INFO, VHOST_CONFIG,
494                                                 "Invalid argument for "
495                                                 "vm2vm [0|1|2]\n");
496                                         us_vhost_usage(prgname);
497                                         return -1;
498                                 } else {
499                                         vm2vm_mode = (vm2vm_type)ret;
500                                 }
501                         }
502
503                         /* Enable/disable retries on RX. */
504                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
505                                 ret = parse_num_opt(optarg, 1);
506                                 if (ret == -1) {
507                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
508                                         us_vhost_usage(prgname);
509                                         return -1;
510                                 } else {
511                                         enable_retry = ret;
512                                 }
513                         }
514
515                         /* Enable/disable TX checksum offload. */
516                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
517                                 ret = parse_num_opt(optarg, 1);
518                                 if (ret == -1) {
519                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
520                                         us_vhost_usage(prgname);
521                                         return -1;
522                                 } else
523                                         enable_tx_csum = ret;
524                         }
525
526                         /* Enable/disable TSO offload. */
527                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
528                                 ret = parse_num_opt(optarg, 1);
529                                 if (ret == -1) {
530                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
531                                         us_vhost_usage(prgname);
532                                         return -1;
533                                 } else
534                                         enable_tso = ret;
535                         }
536
537                         /* Specify the retry delay time (in microseconds) on RX. */
538                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
539                                 ret = parse_num_opt(optarg, INT32_MAX);
540                                 if (ret == -1) {
541                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
542                                         us_vhost_usage(prgname);
543                                         return -1;
544                                 } else {
545                                         burst_rx_delay_time = ret;
546                                 }
547                         }
548
549                         /* Specify the number of retries on RX. */
550                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
551                                 ret = parse_num_opt(optarg, INT32_MAX);
552                                 if (ret == -1) {
553                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
554                                         us_vhost_usage(prgname);
555                                         return -1;
556                                 } else {
557                                         burst_rx_retry_num = ret;
558                                 }
559                         }
560
561                         /* Enable/disable RX mergeable buffers. */
562                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
563                                 ret = parse_num_opt(optarg, 1);
564                                 if (ret == -1) {
565                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
566                                         us_vhost_usage(prgname);
567                                         return -1;
568                                 } else {
569                                         mergeable = !!ret;
570                                         if (ret) {
571                                                 vmdq_conf_default.rxmode.offloads |=
572                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
573                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
574                                                         = JUMBO_FRAME_MAX_SIZE;
575                                         }
576                                 }
577                         }
578
579                         /* Enable/disable stats. */
580                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
581                                 ret = parse_num_opt(optarg, INT32_MAX);
582                                 if (ret == -1) {
583                                         RTE_LOG(INFO, VHOST_CONFIG,
584                                                 "Invalid argument for stats [0..N]\n");
585                                         us_vhost_usage(prgname);
586                                         return -1;
587                                 } else {
588                                         enable_stats = ret;
589                                 }
590                         }
591
592                         /* Set socket file path. */
593                         if (!strncmp(long_option[option_index].name,
594                                                 "socket-file", MAX_LONG_OPT_SZ)) {
595                                 if (us_vhost_parse_socket_path(optarg) == -1) {
596                                         RTE_LOG(INFO, VHOST_CONFIG,
597                                         "Invalid argument for socket name (Max %d characters)\n",
598                                         PATH_MAX);
599                                         us_vhost_usage(prgname);
600                                         return -1;
601                                 }
602                         }
603
604                         break;
605
606                         /* Invalid option - print options. */
607                 default:
608                         us_vhost_usage(prgname);
609                         return -1;
610                 }
611         }
612
613         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
614                 if (enabled_port_mask & (1 << i))
615                         ports[num_ports++] = i;
616         }
617
618         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
619                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
620                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
621                 return -1;
622         }
623
624         return 0;
625 }
626
627 /*
628  * Update the global variable num_ports and the array ports according to the
629  * number of ports in the system, and return the number of valid ports.
630  */
631 static unsigned check_ports_num(unsigned nb_ports)
632 {
633         unsigned valid_num_ports = num_ports;
634         unsigned portid;
635
636         if (num_ports > nb_ports) {
637                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
638                         num_ports, nb_ports);
639                 num_ports = nb_ports;
640         }
641
642         for (portid = 0; portid < num_ports; portid ++) {
643                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
644                         RTE_LOG(INFO, VHOST_PORT,
645                                 "\nSpecified port ID(%u) is not valid\n",
646                                 ports[portid]);
647                         ports[portid] = INVALID_PORT_ID;
648                         valid_num_ports--;
649                 }
650         }
651         return valid_num_ports;
652 }
653
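/* Find the vhost device that owns the given MAC address and is ready for RX. */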
654 static __rte_always_inline struct vhost_dev *
655 find_vhost_dev(struct ether_addr *mac)
656 {
657         struct vhost_dev *vdev;
658
659         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
660                 if (vdev->ready == DEVICE_RX &&
661                     is_same_ether_addr(mac, &vdev->mac_address))
662                         return vdev;
663         }
664
665         return NULL;
666 }
667
668 /*
669  * This function learns the MAC address of the device and registers it, along
670  * with a VLAN tag, in the VMDQ pool.
671  */
672 static int
673 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
674 {
675         struct ether_hdr *pkt_hdr;
676         int i, ret;
677
678         /* Learn MAC address of guest device from packet */
679         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
680
681         if (find_vhost_dev(&pkt_hdr->s_addr)) {
682                 RTE_LOG(ERR, VHOST_DATA,
683                         "(%d) device is using a registered MAC!\n",
684                         vdev->vid);
685                 return -1;
686         }
687
688         for (i = 0; i < ETHER_ADDR_LEN; i++)
689                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
690
691         /* vlan_tag currently uses the device_id. */
692         vdev->vlan_tag = vlan_tags[vdev->vid];
693
694         /* Print out VMDQ registration info. */
695         RTE_LOG(INFO, VHOST_DATA,
696                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
697                 vdev->vid,
698                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
699                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
700                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
701                 vdev->vlan_tag);
702
703         /* Register the MAC address. */
704         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
705                                 (uint32_t)vdev->vid + vmdq_pool_base);
706         if (ret)
707                 RTE_LOG(ERR, VHOST_DATA,
708                         "(%d) failed to add device MAC address to VMDQ\n",
709                         vdev->vid);
710
711         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
712
713         /* Set device as ready for RX. */
714         vdev->ready = DEVICE_RX;
715
716         return 0;
717 }
718
719 /*
720  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
721  * queue before disabling RX on the device.
722  */
723 static inline void
724 unlink_vmdq(struct vhost_dev *vdev)
725 {
726         unsigned i = 0;
727         unsigned rx_count;
728         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
729
730         if (vdev->ready == DEVICE_RX) {
731                 /*clear MAC and VLAN settings*/
732                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
733                 for (i = 0; i < 6; i++)
734                         vdev->mac_address.addr_bytes[i] = 0;
735
736                 vdev->vlan_tag = 0;
737
738                 /*Clear out the receive buffers*/
739                 rx_count = rte_eth_rx_burst(ports[0],
740                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
741
742                 while (rx_count) {
743                         for (i = 0; i < rx_count; i++)
744                                 rte_pktmbuf_free(pkts_burst[i]);
745
746                         rx_count = rte_eth_rx_burst(ports[0],
747                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
748                 }
749
750                 vdev->ready = DEVICE_MAC_LEARNING;
751         }
752 }
753
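/* Enqueue a single packet to the RX queue of the destination vhost device and update the TX/RX statistics. */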
754 static __rte_always_inline void
755 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
756             struct rte_mbuf *m)
757 {
758         uint16_t ret;
759
760         if (builtin_net_driver) {
761                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
762         } else {
763                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
764         }
765
766         if (enable_stats) {
767                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
768                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
769                 src_vdev->stats.tx_total++;
770                 src_vdev->stats.tx += ret;
771         }
772 }
773
774 /*
775  * Check if the packet destination MAC address is for a local device. If so,
776  * put the packet on that device's RX queue. If not, then return.
777  */
778 static __rte_always_inline int
779 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
780 {
781         struct ether_hdr *pkt_hdr;
782         struct vhost_dev *dst_vdev;
783
784         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
785
786         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
787         if (!dst_vdev)
788                 return -1;
789
790         if (vdev->vid == dst_vdev->vid) {
791                 RTE_LOG_DP(DEBUG, VHOST_DATA,
792                         "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
793                         vdev->vid);
794                 return 0;
795         }
796
797         RTE_LOG_DP(DEBUG, VHOST_DATA,
798                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
799
800         if (unlikely(dst_vdev->remove)) {
801                 RTE_LOG_DP(DEBUG, VHOST_DATA,
802                         "(%d) device is marked for removal\n", dst_vdev->vid);
803                 return 0;
804         }
805
806         virtio_xmit(dst_vdev, vdev, m);
807         return 0;
808 }
809
810 /*
811  * Check if the destination MAC of a packet belongs to a local VM, and if so,
812  * get its VLAN tag and the length offset.
813  */
814 static __rte_always_inline int
815 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
816         uint32_t *offset, uint16_t *vlan_tag)
817 {
818         struct vhost_dev *dst_vdev;
819         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
820
821         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
822         if (!dst_vdev)
823                 return 0;
824
825         if (vdev->vid == dst_vdev->vid) {
826                 RTE_LOG_DP(DEBUG, VHOST_DATA,
827                         "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
828                         vdev->vid);
829                 return -1;
830         }
831
832         /*
833          * HW VLAN strip reduces the packet length by the
834          * length of the VLAN tag, so restore the packet
835          * length by adding it back.
836          */
837         *offset  = VLAN_HLEN;
838         *vlan_tag = vlan_tags[vdev->vid];
839
840         RTE_LOG_DP(DEBUG, VHOST_DATA,
841                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
842                 vdev->vid, dst_vdev->vid, *vlan_tag);
843
844         return 0;
845 }
846
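/* Return the pseudo-header checksum of the L3 header, used to seed the TCP checksum for hardware offload. */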
847 static uint16_t
848 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
849 {
850         if (ol_flags & PKT_TX_IPV4)
851                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
852         else /* assume ethertype == ETHER_TYPE_IPv6 */
853                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
854 }
855
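/* Prepare a TSO packet for transmission: request IP checksum offload for IPv4 and seed the TCP checksum with the pseudo-header checksum. */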
856 static void virtio_tx_offload(struct rte_mbuf *m)
857 {
858         void *l3_hdr;
859         struct ipv4_hdr *ipv4_hdr = NULL;
860         struct tcp_hdr *tcp_hdr = NULL;
861         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
862
863         l3_hdr = (char *)eth_hdr + m->l2_len;
864
865         if (m->ol_flags & PKT_TX_IPV4) {
866                 ipv4_hdr = l3_hdr;
867                 ipv4_hdr->hdr_checksum = 0;
868                 m->ol_flags |= PKT_TX_IP_CKSUM;
869         }
870
871         tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
872         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
873 }
874
875 static inline void
876 free_pkts(struct rte_mbuf **pkts, uint16_t n)
877 {
878         while (n--)
879                 rte_pktmbuf_free(pkts[n]);
880 }
881
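/* Transmit the buffered packets on the physical port and free any packets the NIC did not accept. */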
882 static __rte_always_inline void
883 do_drain_mbuf_table(struct mbuf_table *tx_q)
884 {
885         uint16_t count;
886
887         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
888                                  tx_q->m_table, tx_q->len);
889         if (unlikely(count < tx_q->len))
890                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
891
892         tx_q->len = 0;
893 }
894
895 /*
896  * This function routes the TX packet to the correct interface. This
897  * may be a local device or the physical port.
898  */
899 static __rte_always_inline void
900 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
901 {
902         struct mbuf_table *tx_q;
903         unsigned offset = 0;
904         const uint16_t lcore_id = rte_lcore_id();
905         struct ether_hdr *nh;
906
907
908         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
909         if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
910                 struct vhost_dev *vdev2;
911
912                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
913                         if (vdev2 != vdev)
914                                 virtio_xmit(vdev2, vdev, m);
915                 }
916                 goto queue2nic;
917         }
918
919         /* Check if destination is a local VM. */
920         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
921                 rte_pktmbuf_free(m);
922                 return;
923         }
924
925         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
926                 if (unlikely(find_local_dest(vdev, m, &offset,
927                                              &vlan_tag) != 0)) {
928                         rte_pktmbuf_free(m);
929                         return;
930                 }
931         }
932
933         RTE_LOG_DP(DEBUG, VHOST_DATA,
934                 "(%d) TX: MAC address is external\n", vdev->vid);
935
936 queue2nic:
937
938         /* Add packet to the port TX queue. */
939         tx_q = &lcore_tx_queue[lcore_id];
940
941         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
942         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
943                 /* Guest has inserted the vlan tag. */
944                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
945                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
946                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
947                         (vh->vlan_tci != vlan_tag_be))
948                         vh->vlan_tci = vlan_tag_be;
949         } else {
950                 m->ol_flags |= PKT_TX_VLAN_PKT;
951
952                 /*
953                  * Find the right seg to adjust the data len when offset is
954                  * bigger than tail room size.
955                  */
956                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
957                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
958                                 m->data_len += offset;
959                         else {
960                                 struct rte_mbuf *seg = m;
961
962                                 while ((seg->next != NULL) &&
963                                         (offset > rte_pktmbuf_tailroom(seg)))
964                                         seg = seg->next;
965
966                                 seg->data_len += offset;
967                         }
968                         m->pkt_len += offset;
969                 }
970
971                 m->vlan_tci = vlan_tag;
972         }
973
974         if (m->ol_flags & PKT_TX_TCP_SEG)
975                 virtio_tx_offload(m);
976
977         tx_q->m_table[tx_q->len++] = m;
978         if (enable_stats) {
979                 vdev->stats.tx_total++;
980                 vdev->stats.tx++;
981         }
982
983         if (unlikely(tx_q->len == MAX_PKT_BURST))
984                 do_drain_mbuf_table(tx_q);
985 }
986
987
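/* Flush the TX mbuf table if it has not been drained within the drain timeout. */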
988 static __rte_always_inline void
989 drain_mbuf_table(struct mbuf_table *tx_q)
990 {
991         static uint64_t prev_tsc;
992         uint64_t cur_tsc;
993
994         if (tx_q->len == 0)
995                 return;
996
997         cur_tsc = rte_rdtsc();
998         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
999                 prev_tsc = cur_tsc;
1000
1001                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1002                         "TX queue drained after timeout with burst size %u\n",
1003                         tx_q->len);
1004                 do_drain_mbuf_table(tx_q);
1005         }
1006 }
1007
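/* Receive a burst from the physical port and enqueue it to the guest RX ring, optionally waiting and retrying when the vring lacks free entries. */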
1008 static __rte_always_inline void
1009 drain_eth_rx(struct vhost_dev *vdev)
1010 {
1011         uint16_t rx_count, enqueue_count;
1012         struct rte_mbuf *pkts[MAX_PKT_BURST];
1013
1014         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1015                                     pkts, MAX_PKT_BURST);
1016         if (!rx_count)
1017                 return;
1018
1019         /*
1020          * When "enable_retry" is set, wait and retry when there are not
1021          * enough free slots in the queue to hold @rx_count packets,
1022          * to reduce packet loss.
1023          */
1024         if (enable_retry &&
1025             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1026                         VIRTIO_RXQ))) {
1027                 uint32_t retry;
1028
1029                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1030                         rte_delay_us(burst_rx_delay_time);
1031                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1032                                         VIRTIO_RXQ))
1033                                 break;
1034                 }
1035         }
1036
1037         if (builtin_net_driver) {
1038                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1039                                                 pkts, rx_count);
1040         } else {
1041                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1042                                                 pkts, rx_count);
1043         }
1044         if (enable_stats) {
1045                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1046                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1047         }
1048
1049         free_pkts(pkts, rx_count);
1050 }
1051
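/* Dequeue a burst from the guest TX ring and route each packet; the first packet triggers VMDQ MAC/VLAN learning while the device is not yet ready. */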
1052 static __rte_always_inline void
1053 drain_virtio_tx(struct vhost_dev *vdev)
1054 {
1055         struct rte_mbuf *pkts[MAX_PKT_BURST];
1056         uint16_t count;
1057         uint16_t i;
1058
1059         if (builtin_net_driver) {
1060                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1061                                         pkts, MAX_PKT_BURST);
1062         } else {
1063                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1064                                         mbuf_pool, pkts, MAX_PKT_BURST);
1065         }
1066
1067         /* setup VMDq for the first packet */
1068         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1069                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1070                         free_pkts(pkts, count);
1071         }
1072
1073         for (i = 0; i < count; ++i)
1074                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1075 }
1076
1077 /*
1078  * Main function of vhost-switch. It basically does:
1079  *
1080  * for each vhost device {
1081  *    - drain_eth_rx()
1082  *
1083  *      Which drains the host eth Rx queue linked to the vhost device,
1084  *      and delivers all of them to the guest virtio Rx ring associated
1085  *      with this vhost device.
1086  *
1087  *    - drain_virtio_tx()
1088  *
1089  *      Which drains the guest virtio Tx queue and delivers all of them
1090  *      to the target, which could be another vhost device, or the
1091  *      physical eth dev. The route is done in function "virtio_tx_route".
1092  * }
1093  */
1094 static int
1095 switch_worker(void *arg __rte_unused)
1096 {
1097         unsigned i;
1098         unsigned lcore_id = rte_lcore_id();
1099         struct vhost_dev *vdev;
1100         struct mbuf_table *tx_q;
1101
1102         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1103
1104         tx_q = &lcore_tx_queue[lcore_id];
1105         for (i = 0; i < rte_lcore_count(); i++) {
1106                 if (lcore_ids[i] == lcore_id) {
1107                         tx_q->txq_id = i;
1108                         break;
1109                 }
1110         }
1111
1112         while(1) {
1113                 drain_mbuf_table(tx_q);
1114
1115                 /*
1116                  * Inform the configuration core that we have exited the
1117                  * linked list and that no devices are in use if requested.
1118                  */
1119                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1120                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1121
1122                 /*
1123                  * Process vhost devices
1124                  */
1125                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1126                               lcore_vdev_entry) {
1127                         if (unlikely(vdev->remove)) {
1128                                 unlink_vmdq(vdev);
1129                                 vdev->ready = DEVICE_SAFE_REMOVE;
1130                                 continue;
1131                         }
1132
1133                         if (likely(vdev->ready == DEVICE_RX))
1134                                 drain_eth_rx(vdev);
1135
1136                         if (likely(!vdev->remove))
1137                                 drain_virtio_tx(vdev);
1138                 }
1139         }
1140
1141         return 0;
1142 }
1143
1144 /*
1145  * Remove a device from the specific data core linked list and from the
1146  * main linked list. Synchronization occurs through the use of the
1147  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1148  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
1149  */
1150 static void
1151 destroy_device(int vid)
1152 {
1153         struct vhost_dev *vdev = NULL;
1154         int lcore;
1155
1156         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1157                 if (vdev->vid == vid)
1158                         break;
1159         }
1160         if (!vdev)
1161                 return;
1162         /*set the remove flag. */
1163         vdev->remove = 1;
1164         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1165                 rte_pause();
1166         }
1167
1168         if (builtin_net_driver)
1169                 vs_vhost_net_remove(vdev);
1170
1171         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1172                      lcore_vdev_entry);
1173         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1174
1175
1176         /* Set the dev_removal_flag on each lcore. */
1177         RTE_LCORE_FOREACH_SLAVE(lcore)
1178                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1179
1180         /*
1181          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1182          * we can be sure that they can no longer access the device removed
1183          * from the linked lists and that the devices are no longer in use.
1184          */
1185         RTE_LCORE_FOREACH_SLAVE(lcore) {
1186                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1187                         rte_pause();
1188         }
1189
1190         lcore_info[vdev->coreid].device_num--;
1191
1192         RTE_LOG(INFO, VHOST_DATA,
1193                 "(%d) device has been removed from data core\n",
1194                 vdev->vid);
1195
1196         rte_free(vdev);
1197 }
1198
1199 /*
1200  * A new device is added to a data core. First the device is added to the main
1201  * linked list and then allocated to a specific data core.
1202  */
1203 static int
1204 new_device(int vid)
1205 {
1206         int lcore, core_add = 0;
1207         uint32_t device_num_min = num_devices;
1208         struct vhost_dev *vdev;
1209
1210         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1211         if (vdev == NULL) {
1212                 RTE_LOG(INFO, VHOST_DATA,
1213                         "(%d) couldn't allocate memory for vhost dev\n",
1214                         vid);
1215                 return -1;
1216         }
1217         vdev->vid = vid;
1218
1219         if (builtin_net_driver)
1220                 vs_vhost_net_setup(vdev);
1221
1222         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1223         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1224
1225         /*reset ready flag*/
1226         vdev->ready = DEVICE_MAC_LEARNING;
1227         vdev->remove = 0;
1228
1229         /* Find a suitable lcore to add the device. */
1230         RTE_LCORE_FOREACH_SLAVE(lcore) {
1231                 if (lcore_info[lcore].device_num < device_num_min) {
1232                         device_num_min = lcore_info[lcore].device_num;
1233                         core_add = lcore;
1234                 }
1235         }
1236         vdev->coreid = core_add;
1237
1238         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1239                           lcore_vdev_entry);
1240         lcore_info[vdev->coreid].device_num++;
1241
1242         /* Disable notifications. */
1243         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1244         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1245
1246         RTE_LOG(INFO, VHOST_DATA,
1247                 "(%d) device has been added to data core %d\n",
1248                 vid, vdev->coreid);
1249
1250         return 0;
1251 }
1252
1253 /*
1254  * These callbacks allow devices to be added to the data core when
1255  * configuration has been fully completed.
1256  */
1257 static const struct vhost_device_ops virtio_net_device_ops =
1258 {
1259         .new_device =  new_device,
1260         .destroy_device = destroy_device,
1261 };
1262
1263 /*
1264  * This thread wakes up periodically to print stats if the user has
1265  * enabled them.
1266  */
1267 static void *
1268 print_stats(__rte_unused void *arg)
1269 {
1270         struct vhost_dev *vdev;
1271         uint64_t tx_dropped, rx_dropped;
1272         uint64_t tx, tx_total, rx, rx_total;
1273         const char clr[] = { 27, '[', '2', 'J', '\0' };
1274         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1275
1276         while(1) {
1277                 sleep(enable_stats);
1278
1279                 /* Clear screen and move to top left */
1280                 printf("%s%s\n", clr, top_left);
1281                 printf("Device statistics =================================\n");
1282
1283                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1284                         tx_total   = vdev->stats.tx_total;
1285                         tx         = vdev->stats.tx;
1286                         tx_dropped = tx_total - tx;
1287
1288                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1289                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1290                         rx_dropped = rx_total - rx;
1291
1292                         printf("Statistics for device %d\n"
1293                                 "-----------------------\n"
1294                                 "TX total:              %" PRIu64 "\n"
1295                                 "TX dropped:            %" PRIu64 "\n"
1296                                 "TX successful:         %" PRIu64 "\n"
1297                                 "RX total:              %" PRIu64 "\n"
1298                                 "RX dropped:            %" PRIu64 "\n"
1299                                 "RX successful:         %" PRIu64 "\n",
1300                                 vdev->vid,
1301                                 tx_total, tx_dropped, tx,
1302                                 rx_total, rx_dropped, rx);
1303                 }
1304
1305                 printf("===================================================\n");
1306         }
1307
1308         return NULL;
1309 }
1310
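/* Unregister the vhost drivers for the first socket_num socket files. */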
1311 static void
1312 unregister_drivers(int socket_num)
1313 {
1314         int i, ret;
1315
1316         for (i = 0; i < socket_num; i++) {
1317                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1318                 if (ret != 0)
1319                         RTE_LOG(ERR, VHOST_CONFIG,
1320                                 "Failed to unregister vhost driver for %s.\n",
1321                                 socket_files + i * PATH_MAX);
1322         }
1323 }
1324
1325 /* When we receive an INT signal, unregister the vhost driver */
1326 static void
1327 sigint_handler(__rte_unused int signum)
1328 {
1329         /* Unregister vhost driver. */
1330         unregister_drivers(nb_sockets);
1331
1332         exit(0);
1333 }
1334
1335 /*
1336  * While creating an mbuf pool, one key thing is to figure out how
1337  * many mbuf entries are enough for our use. FYI, here are some
1338  * guidelines:
1339  *
1340  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1341  *
1342  * - For each switch core (a CPU core that does the packet switching), we
1343  *   also need to reserve some mbufs for receiving the packets from the
1344  *   virtio Tx queue. How many are enough depends on the usage. It's
1345  *   normally a simple calculation like the following:
1346  *
1347  *       MAX_PKT_BURST * max packet size / mbuf size
1348  *
1349  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1350  *
1351  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1352  *   mbufs for receiving the packets from the physical NIC device.
1353  *
1354  * - We also need to make sure that, for each switch core, we have
1355  *   allocated enough mbufs to fill up the mbuf cache.
1356  */
1357 static void
1358 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1359         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1360 {
1361         uint32_t nr_mbufs;
1362         uint32_t nr_mbufs_per_core;
1363         uint32_t mtu = 1500;
1364
1365         if (mergeable)
1366                 mtu = 9000;
1367         if (enable_tso)
1368                 mtu = 64 * 1024;
1369
1370         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1371                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1372         nr_mbufs_per_core += nr_rx_desc;
1373         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1374
1375         nr_mbufs  = nr_queues * nr_rx_desc;
1376         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1377         nr_mbufs *= nr_port;
1378
1379         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1380                                             nr_mbuf_cache, 0, mbuf_size,
1381                                             rte_socket_id());
1382         if (mbuf_pool == NULL)
1383                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1384 }
1385
1386 /*
1387  * Main function, does initialisation and calls the per-lcore functions.
1388  */
1389 int
1390 main(int argc, char *argv[])
1391 {
1392         unsigned lcore_id, core_id = 0;
1393         unsigned nb_ports, valid_num_ports;
1394         int ret, i;
1395         uint16_t portid;
1396         static pthread_t tid;
1397         uint64_t flags = 0;
1398
1399         signal(SIGINT, sigint_handler);
1400
1401         /* init EAL */
1402         ret = rte_eal_init(argc, argv);
1403         if (ret < 0)
1404                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1405         argc -= ret;
1406         argv += ret;
1407
1408         /* parse app arguments */
1409         ret = us_vhost_parse_args(argc, argv);
1410         if (ret < 0)
1411                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1412
1413         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1414                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1415
1416                 if (rte_lcore_is_enabled(lcore_id))
1417                         lcore_ids[core_id++] = lcore_id;
1418         }
1419
1420         if (rte_lcore_count() > RTE_MAX_LCORE)
1421                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1422
1423         /* Get the number of physical ports. */
1424         nb_ports = rte_eth_dev_count_avail();
1425
1426         /*
1427          * Update the global variable num_ports and the global array ports,
1428          * and get the number of valid ports according to the number of system ports.
1429          */
1430         valid_num_ports = check_ports_num(nb_ports);
1431
1432         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1433                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1434                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1435                 return -1;
1436         }
1437
1438         /*
1439          * FIXME: here we are trying to allocate mbufs big enough for
1440          * @MAX_QUEUES, but the truth is we're never going to use that
1441          * many queues here. We probably should only do allocation for
1442          * those queues we are going to use.
1443          */
1444         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1445                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1446
1447         if (vm2vm_mode == VM2VM_HARDWARE) {
1448                 /* Enable VT loop back to let L2 switch to do it. */
1449                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1450                 RTE_LOG(DEBUG, VHOST_CONFIG,
1451                         "Enable loop back for L2 switch in vmdq.\n");
1452         }
1453
1454         /* initialize all ports */
1455         RTE_ETH_FOREACH_DEV(portid) {
1456                 /* skip ports that are not enabled */
1457                 if ((enabled_port_mask & (1 << portid)) == 0) {
1458                         RTE_LOG(INFO, VHOST_PORT,
1459                                 "Skipping disabled port %d\n", portid);
1460                         continue;
1461                 }
1462                 if (port_init(portid) != 0)
1463                         rte_exit(EXIT_FAILURE,
1464                                 "Cannot initialize network ports\n");
1465         }
1466
1467         /* Enable stats if the user option is set. */
1468         if (enable_stats) {
1469                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1470                                         print_stats, NULL);
1471                 if (ret < 0)
1472                         rte_exit(EXIT_FAILURE,
1473                                 "Cannot create print-stats thread\n");
1474         }
1475
1476         /* Launch all data cores. */
1477         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1478                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1479
1480         if (client_mode)
1481                 flags |= RTE_VHOST_USER_CLIENT;
1482
1483         if (dequeue_zero_copy)
1484                 flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1485
1486         /* Register vhost user driver to handle vhost messages. */
1487         for (i = 0; i < nb_sockets; i++) {
1488                 char *file = socket_files + i * PATH_MAX;
1489                 ret = rte_vhost_driver_register(file, flags);
1490                 if (ret != 0) {
1491                         unregister_drivers(i);
1492                         rte_exit(EXIT_FAILURE,
1493                                 "vhost driver register failure.\n");
1494                 }
1495
1496                 if (builtin_net_driver)
1497                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1498
1499                 if (mergeable == 0) {
1500                         rte_vhost_driver_disable_features(file,
1501                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1502                 }
1503
1504                 if (enable_tx_csum == 0) {
1505                         rte_vhost_driver_disable_features(file,
1506                                 1ULL << VIRTIO_NET_F_CSUM);
1507                 }
1508
1509                 if (enable_tso == 0) {
1510                         rte_vhost_driver_disable_features(file,
1511                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1512                         rte_vhost_driver_disable_features(file,
1513                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1514                         rte_vhost_driver_disable_features(file,
1515                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1516                         rte_vhost_driver_disable_features(file,
1517                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1518                 }
1519
1520                 if (promiscuous) {
1521                         rte_vhost_driver_enable_features(file,
1522                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1523                 }
1524
1525                 ret = rte_vhost_driver_callback_register(file,
1526                         &virtio_net_device_ops);
1527                 if (ret != 0) {
1528                         rte_exit(EXIT_FAILURE,
1529                                 "failed to register vhost driver callbacks.\n");
1530                 }
1531
1532                 if (rte_vhost_driver_start(file) < 0) {
1533                         rte_exit(EXIT_FAILURE,
1534                                 "failed to start vhost driver.\n");
1535                 }
1536         }
1537
1538         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1539                 rte_eal_wait_lcore(lcore_id);
1540
1541         return 0;
1542
1543 }