examples: take promiscuous mode switch result into account
[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66
67 /* number of devices/queues to support*/
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76         VM2VM_DISABLED = 0,
77         VM2VM_SOFTWARE = 1,
78         VM2VM_HARDWARE = 2,
79         VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93
94 static int client_mode;
95 static int dequeue_zero_copy;
96
97 static int builtin_net_driver;
98
99 /* Specify timeout (in microseconds) between retries on RX. */
100 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
101 /* Specify the number of retries on RX. */
102 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
103
104 /* Socket file paths. Can be set by user */
105 static char *socket_files;
106 static int nb_sockets;
107
108 /* empty vmdq configuration structure. Filled in programmatically */
109 static struct rte_eth_conf vmdq_conf_default = {
110         .rxmode = {
111                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
112                 .split_hdr_size = 0,
113                 /*
114                  * VLAN strip is necessary for 1G NICs such as the I350;
115                  * it fixes a bug where IPv4 forwarding in the guest could not
116                  * forward packets from one virtio dev to another virtio dev.
117                  */
118                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
119         },
120
121         .txmode = {
122                 .mq_mode = ETH_MQ_TX_NONE,
123                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
124                              DEV_TX_OFFLOAD_TCP_CKSUM |
125                              DEV_TX_OFFLOAD_VLAN_INSERT |
126                              DEV_TX_OFFLOAD_MULTI_SEGS |
127                              DEV_TX_OFFLOAD_TCP_TSO),
128         },
129         .rx_adv_conf = {
130                 /*
131                  * should be overridden separately in code with
132                  * appropriate values
133                  */
134                 .vmdq_rx_conf = {
135                         .nb_queue_pools = ETH_8_POOLS,
136                         .enable_default_pool = 0,
137                         .default_pool = 0,
138                         .nb_pool_maps = 0,
139                         .pool_map = {{0, 0},},
140                 },
141         },
142 };
143
144
145 static unsigned lcore_ids[RTE_MAX_LCORE];
146 static uint16_t ports[RTE_MAX_ETHPORTS];
147 static unsigned num_ports = 0; /**< The number of ports specified in command line */
148 static uint16_t num_pf_queues, num_vmdq_queues;
149 static uint16_t vmdq_pool_base, vmdq_queue_base;
150 static uint16_t queues_per_pool;
151
152 const uint16_t vlan_tags[] = {
153         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
154         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
155         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
156         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
157         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
158         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
159         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
160         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
161 };
162
163 /* ethernet addresses of ports */
164 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
165
166 static struct vhost_dev_tailq_list vhost_dev_list =
167         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
168
169 static struct lcore_info lcore_info[RTE_MAX_LCORE];
170
171 /* Used for queueing bursts of TX packets. */
172 struct mbuf_table {
173         unsigned len;
174         unsigned txq_id;
175         struct rte_mbuf *m_table[MAX_PKT_BURST];
176 };
177
178 /* TX queue for each data core. */
179 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
180
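/* Convert the BURST_TX_DRAIN_US interval into TSC cycles (cycles per us rounded up). */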
181 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
182                                  / US_PER_S * BURST_TX_DRAIN_US)
183 #define VLAN_HLEN       4
184
185 /*
186  * Builds up the correct configuration for VMDQ VLAN pool map
187  * according to the pool & queue limits.
188  */
189 static inline int
190 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
191 {
192         struct rte_eth_vmdq_rx_conf conf;
193         struct rte_eth_vmdq_rx_conf *def_conf =
194                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
195         unsigned i;
196
197         memset(&conf, 0, sizeof(conf));
198         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
199         conf.nb_pool_maps = num_devices;
200         conf.enable_loop_back = def_conf->enable_loop_back;
201         conf.rx_mode = def_conf->rx_mode;
202
203         for (i = 0; i < conf.nb_pool_maps; i++) {
204                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
205                 conf.pool_map[i].pools = (1UL << i);
206         }
207
208         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
209         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
210                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
211         return 0;
212 }
213
214 /*
215  * Initialises a given port using global settings and with the rx buffers
216  * coming from the global mbuf_pool
217  */
218 static inline int
219 port_init(uint16_t port)
220 {
221         struct rte_eth_dev_info dev_info;
222         struct rte_eth_conf port_conf;
223         struct rte_eth_rxconf *rxconf;
224         struct rte_eth_txconf *txconf;
225         int16_t rx_rings, tx_rings;
226         uint16_t rx_ring_size, tx_ring_size;
227         int retval;
228         uint16_t q;
229
230         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
231         retval = rte_eth_dev_info_get(port, &dev_info);
232         if (retval != 0) {
233                 RTE_LOG(ERR, VHOST_PORT,
234                         "Error during getting device (port %u) info: %s\n",
235                         port, strerror(-retval));
236
237                 return retval;
238         }
239
240         rxconf = &dev_info.default_rxconf;
241         txconf = &dev_info.default_txconf;
242         rxconf->rx_drop_en = 1;
243
244         /*configure the number of supported virtio devices based on VMDQ limits */
245         num_devices = dev_info.max_vmdq_pools;
246
247         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
248         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
249
250         /*
251          * When dequeue zero copy is enabled, guest Tx used vring will be
252          * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
253          * (tx_ring_size here) must be small enough so that the driver will
254          * hit the free threshold easily and free mbufs timely. Otherwise,
255          * guest Tx vring would be starved.
256          */
257         if (dequeue_zero_copy)
258                 tx_ring_size = 64;
259
260         tx_rings = (uint16_t)rte_lcore_count();
261
262         /* Get port configuration. */
263         retval = get_eth_conf(&port_conf, num_devices);
264         if (retval < 0)
265                 return retval;
266         /* NIC queues are divided into pf queues and vmdq queues.  */
267         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
268         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
269         num_vmdq_queues = num_devices * queues_per_pool;
270         num_queues = num_pf_queues + num_vmdq_queues;
271         vmdq_queue_base = dev_info.vmdq_queue_base;
272         vmdq_pool_base  = dev_info.vmdq_pool_base;
273         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
274                 num_pf_queues, num_devices, queues_per_pool);
275
276         if (!rte_eth_dev_is_valid_port(port))
277                 return -1;
278
279         rx_rings = (uint16_t)dev_info.max_rx_queues;
280         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
281                 port_conf.txmode.offloads |=
282                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
283         /* Configure ethernet device. */
284         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
285         if (retval != 0) {
286                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
287                         port, strerror(-retval));
288                 return retval;
289         }
290
291         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
292                 &tx_ring_size);
293         if (retval != 0) {
294                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
295                         "for port %u: %s.\n", port, strerror(-retval));
296                 return retval;
297         }
298         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
299                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
300                         "for Rx queues on port %u.\n", port);
301                 return -1;
302         }
303
304         /* Setup the queues. */
305         rxconf->offloads = port_conf.rxmode.offloads;
306         for (q = 0; q < rx_rings; q ++) {
307                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
308                                                 rte_eth_dev_socket_id(port),
309                                                 rxconf,
310                                                 mbuf_pool);
311                 if (retval < 0) {
312                         RTE_LOG(ERR, VHOST_PORT,
313                                 "Failed to setup rx queue %u of port %u: %s.\n",
314                                 q, port, strerror(-retval));
315                         return retval;
316                 }
317         }
318         txconf->offloads = port_conf.txmode.offloads;
319         for (q = 0; q < tx_rings; q ++) {
320                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
321                                                 rte_eth_dev_socket_id(port),
322                                                 txconf);
323                 if (retval < 0) {
324                         RTE_LOG(ERR, VHOST_PORT,
325                                 "Failed to setup tx queue %u of port %u: %s.\n",
326                                 q, port, strerror(-retval));
327                         return retval;
328                 }
329         }
330
331         /* Start the device. */
332         retval  = rte_eth_dev_start(port);
333         if (retval < 0) {
334                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
335                         port, strerror(-retval));
336                 return retval;
337         }
338
339         if (promiscuous) {
340                 retval = rte_eth_promiscuous_enable(port);
341                 if (retval != 0) {
342                         RTE_LOG(ERR, VHOST_PORT,
343                                 "Failed to enable promiscuous mode on port %u: %s\n",
344                                 port, rte_strerror(-retval));
345                         return retval;
346                 }
347         }
348
349         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
350         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
351         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
352                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
353                         port,
354                         vmdq_ports_eth_addr[port].addr_bytes[0],
355                         vmdq_ports_eth_addr[port].addr_bytes[1],
356                         vmdq_ports_eth_addr[port].addr_bytes[2],
357                         vmdq_ports_eth_addr[port].addr_bytes[3],
358                         vmdq_ports_eth_addr[port].addr_bytes[4],
359                         vmdq_ports_eth_addr[port].addr_bytes[5]);
360
361         return 0;
362 }
363
364 /*
365  * Set socket file path.
366  */
367 static int
368 us_vhost_parse_socket_path(const char *q_arg)
369 {
370         char *old;
371
372         /* reject paths that do not fit within PATH_MAX */
373         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
374                 return -1;
375
376         old = socket_files;
377         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
378         if (socket_files == NULL) {
379                 free(old);
380                 return -1;
381         }
382
383         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
384         nb_sockets++;
385
386         return 0;
387 }
388
389 /*
390  * Parse the portmask provided at run time.
391  */
392 static int
393 parse_portmask(const char *portmask)
394 {
395         char *end = NULL;
396         unsigned long pm;
397
398         errno = 0;
399
400         /* parse hexadecimal string */
401         pm = strtoul(portmask, &end, 16);
402         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
403                 return -1;
404
405         if (pm == 0)
406                 return -1;
407
408         return pm;
409
410 }
411
412 /*
413  * Parse num options at run time.
414  */
415 static int
416 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
417 {
418         char *end = NULL;
419         unsigned long num;
420
421         errno = 0;
422
423         /* parse unsigned int string */
424         num = strtoul(q_arg, &end, 10);
425         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
426                 return -1;
427
428         if (num > max_valid_value)
429                 return -1;
430
431         return num;
432
433 }
434
435 /*
436  * Display usage
437  */
438 static void
439 us_vhost_usage(const char *prgname)
440 {
441         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
442         "               --vm2vm [0|1|2]\n"
443         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
444         "               --socket-file <path>\n"
445         "               --nb-devices ND\n"
446         "               -p PORTMASK: Set mask for ports to be used by application\n"
447         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
448         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
449         "               --rx-retry-delay [0-N]: timeout (in usec) between retries on RX. Takes effect only if retries on rx are enabled\n"
450         "               --rx-retry-num [0-N]: the number of retries on rx. Takes effect only if retries on rx are enabled\n"
451         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
452         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
453         "               --socket-file: The path of the socket file.\n"
454         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
455         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
456         "               --client register a vhost-user socket as client mode.\n"
457         "               --dequeue-zero-copy enables dequeue zero copy\n",
458                prgname);
459 }
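/*
 * Illustrative invocation (binary name, core list, portmask and socket path
 * below are examples only, not defaults):
 *   ./vhost-switch -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/vhost.sock --stats 1
 */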
460
461 /*
462  * Parse the arguments given in the command line of the application.
463  */
464 static int
465 us_vhost_parse_args(int argc, char **argv)
466 {
467         int opt, ret;
468         int option_index;
469         unsigned i;
470         const char *prgname = argv[0];
471         static struct option long_option[] = {
472                 {"vm2vm", required_argument, NULL, 0},
473                 {"rx-retry", required_argument, NULL, 0},
474                 {"rx-retry-delay", required_argument, NULL, 0},
475                 {"rx-retry-num", required_argument, NULL, 0},
476                 {"mergeable", required_argument, NULL, 0},
477                 {"stats", required_argument, NULL, 0},
478                 {"socket-file", required_argument, NULL, 0},
479                 {"tx-csum", required_argument, NULL, 0},
480                 {"tso", required_argument, NULL, 0},
481                 {"client", no_argument, &client_mode, 1},
482                 {"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
483                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
484                 {NULL, 0, 0, 0},
485         };
486
487         /* Parse command line */
488         while ((opt = getopt_long(argc, argv, "p:P",
489                         long_option, &option_index)) != EOF) {
490                 switch (opt) {
491                 /* Portmask */
492                 case 'p':
493                         enabled_port_mask = parse_portmask(optarg);
494                         if (enabled_port_mask == 0) {
495                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
496                                 us_vhost_usage(prgname);
497                                 return -1;
498                         }
499                         break;
500
501                 case 'P':
502                         promiscuous = 1;
503                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
504                                 ETH_VMDQ_ACCEPT_BROADCAST |
505                                 ETH_VMDQ_ACCEPT_MULTICAST;
506
507                         break;
508
509                 case 0:
510                         /* Enable/disable vm2vm comms. */
511                         if (!strncmp(long_option[option_index].name, "vm2vm",
512                                 MAX_LONG_OPT_SZ)) {
513                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
514                                 if (ret == -1) {
515                                         RTE_LOG(INFO, VHOST_CONFIG,
516                                                 "Invalid argument for "
517                                                 "vm2vm [0|1|2]\n");
518                                         us_vhost_usage(prgname);
519                                         return -1;
520                                 } else {
521                                         vm2vm_mode = (vm2vm_type)ret;
522                                 }
523                         }
524
525                         /* Enable/disable retries on RX. */
526                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
527                                 ret = parse_num_opt(optarg, 1);
528                                 if (ret == -1) {
529                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
530                                         us_vhost_usage(prgname);
531                                         return -1;
532                                 } else {
533                                         enable_retry = ret;
534                                 }
535                         }
536
537                         /* Enable/disable TX checksum offload. */
538                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
539                                 ret = parse_num_opt(optarg, 1);
540                                 if (ret == -1) {
541                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
542                                         us_vhost_usage(prgname);
543                                         return -1;
544                                 } else
545                                         enable_tx_csum = ret;
546                         }
547
548                         /* Enable/disable TSO offload. */
549                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
550                                 ret = parse_num_opt(optarg, 1);
551                                 if (ret == -1) {
552                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
553                                         us_vhost_usage(prgname);
554                                         return -1;
555                                 } else
556                                         enable_tso = ret;
557                         }
558
559                         /* Specify the retry delay time (in microseconds) on RX. */
560                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
561                                 ret = parse_num_opt(optarg, INT32_MAX);
562                                 if (ret == -1) {
563                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
564                                         us_vhost_usage(prgname);
565                                         return -1;
566                                 } else {
567                                         burst_rx_delay_time = ret;
568                                 }
569                         }
570
571                         /* Specify the retries number on RX. */
572                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
573                                 ret = parse_num_opt(optarg, INT32_MAX);
574                                 if (ret == -1) {
575                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
576                                         us_vhost_usage(prgname);
577                                         return -1;
578                                 } else {
579                                         burst_rx_retry_num = ret;
580                                 }
581                         }
582
583                         /* Enable/disable RX mergeable buffers. */
584                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
585                                 ret = parse_num_opt(optarg, 1);
586                                 if (ret == -1) {
587                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
588                                         us_vhost_usage(prgname);
589                                         return -1;
590                                 } else {
591                                         mergeable = !!ret;
592                                         if (ret) {
593                                                 vmdq_conf_default.rxmode.offloads |=
594                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
595                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
596                                                         = JUMBO_FRAME_MAX_SIZE;
597                                         }
598                                 }
599                         }
600
601                         /* Enable/disable stats. */
602                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
603                                 ret = parse_num_opt(optarg, INT32_MAX);
604                                 if (ret == -1) {
605                                         RTE_LOG(INFO, VHOST_CONFIG,
606                                                 "Invalid argument for stats [0..N]\n");
607                                         us_vhost_usage(prgname);
608                                         return -1;
609                                 } else {
610                                         enable_stats = ret;
611                                 }
612                         }
613
614                         /* Set socket file path. */
615                         if (!strncmp(long_option[option_index].name,
616                                                 "socket-file", MAX_LONG_OPT_SZ)) {
617                                 if (us_vhost_parse_socket_path(optarg) == -1) {
618                                         RTE_LOG(INFO, VHOST_CONFIG,
619                                         "Invalid argument for socket name (Max %d characters)\n",
620                                         PATH_MAX);
621                                         us_vhost_usage(prgname);
622                                         return -1;
623                                 }
624                         }
625
626                         break;
627
628                         /* Invalid option - print options. */
629                 default:
630                         us_vhost_usage(prgname);
631                         return -1;
632                 }
633         }
634
635         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
636                 if (enabled_port_mask & (1 << i))
637                         ports[num_ports++] = i;
638         }
639
640         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
641                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
642                         " but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
643                 return -1;
644         }
645
646         return 0;
647 }
648
649 /*
650  * Update the global variable num_ports and the ports array according to the
651  * number of system ports, and return the number of valid ports
652  */
653 static unsigned check_ports_num(unsigned nb_ports)
654 {
655         unsigned valid_num_ports = num_ports;
656         unsigned portid;
657
658         if (num_ports > nb_ports) {
659                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
660                         num_ports, nb_ports);
661                 num_ports = nb_ports;
662         }
663
664         for (portid = 0; portid < num_ports; portid ++) {
665                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
666                         RTE_LOG(INFO, VHOST_PORT,
667                                 "\nSpecified port ID(%u) is not valid\n",
668                                 ports[portid]);
669                         ports[portid] = INVALID_PORT_ID;
670                         valid_num_ports--;
671                 }
672         }
673         return valid_num_ports;
674 }
675
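/* Look up a vhost device that is ready for RX by its MAC address; NULL if none matches. */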
676 static __rte_always_inline struct vhost_dev *
677 find_vhost_dev(struct rte_ether_addr *mac)
678 {
679         struct vhost_dev *vdev;
680
681         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
682                 if (vdev->ready == DEVICE_RX &&
683                     rte_is_same_ether_addr(mac, &vdev->mac_address))
684                         return vdev;
685         }
686
687         return NULL;
688 }
689
690 /*
691  * This function learns the MAC address of the device and registers it, along
692  * with a VLAN tag, with a VMDQ pool.
693  */
694 static int
695 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
696 {
697         struct rte_ether_hdr *pkt_hdr;
698         int i, ret;
699
700         /* Learn MAC address of guest device from packet */
701         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
702
703         if (find_vhost_dev(&pkt_hdr->s_addr)) {
704                 RTE_LOG(ERR, VHOST_DATA,
705                         "(%d) device is using a registered MAC!\n",
706                         vdev->vid);
707                 return -1;
708         }
709
710         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
711                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
712
713         /* vlan_tag currently uses the device_id. */
714         vdev->vlan_tag = vlan_tags[vdev->vid];
715
716         /* Print out VMDQ registration info. */
717         RTE_LOG(INFO, VHOST_DATA,
718                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
719                 vdev->vid,
720                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
721                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
722                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
723                 vdev->vlan_tag);
724
725         /* Register the MAC address. */
726         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
727                                 (uint32_t)vdev->vid + vmdq_pool_base);
728         if (ret)
729                 RTE_LOG(ERR, VHOST_DATA,
730                         "(%d) failed to add device MAC address to VMDQ\n",
731                         vdev->vid);
732
733         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
734
735         /* Set device as ready for RX. */
736         vdev->ready = DEVICE_RX;
737
738         return 0;
739 }
740
741 /*
742  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
743  * queue before disabling RX on the device.
744  */
745 static inline void
746 unlink_vmdq(struct vhost_dev *vdev)
747 {
748         unsigned i = 0;
749         unsigned rx_count;
750         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
751
752         if (vdev->ready == DEVICE_RX) {
753                 /*clear MAC and VLAN settings*/
754                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
755                 for (i = 0; i < 6; i++)
756                         vdev->mac_address.addr_bytes[i] = 0;
757
758                 vdev->vlan_tag = 0;
759
760                 /*Clear out the receive buffers*/
761                 rx_count = rte_eth_rx_burst(ports[0],
762                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
763
764                 while (rx_count) {
765                         for (i = 0; i < rx_count; i++)
766                                 rte_pktmbuf_free(pkts_burst[i]);
767
768                         rx_count = rte_eth_rx_burst(ports[0],
769                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
770                 }
771
772                 vdev->ready = DEVICE_MAC_LEARNING;
773         }
774 }
775
776 static __rte_always_inline void
777 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
778             struct rte_mbuf *m)
779 {
780         uint16_t ret;
781
782         if (builtin_net_driver) {
783                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
784         } else {
785                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
786         }
787
788         if (enable_stats) {
789                 rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
790                 rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
791                 src_vdev->stats.tx_total++;
792                 src_vdev->stats.tx += ret;
793         }
794 }
795
796 /*
797  * Check if the packet destination MAC address is for a local device. If so, put
798  * the packet on that device's RX queue. If not, return.
799  */
800 static __rte_always_inline int
801 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
802 {
803         struct rte_ether_hdr *pkt_hdr;
804         struct vhost_dev *dst_vdev;
805
806         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
807
808         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
809         if (!dst_vdev)
810                 return -1;
811
812         if (vdev->vid == dst_vdev->vid) {
813                 RTE_LOG_DP(DEBUG, VHOST_DATA,
814                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
815                         vdev->vid);
816                 return 0;
817         }
818
819         RTE_LOG_DP(DEBUG, VHOST_DATA,
820                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
821
822         if (unlikely(dst_vdev->remove)) {
823                 RTE_LOG_DP(DEBUG, VHOST_DATA,
824                         "(%d) device is marked for removal\n", dst_vdev->vid);
825                 return 0;
826         }
827
828         virtio_xmit(dst_vdev, vdev, m);
829         return 0;
830 }
831
832 /*
833  * Check if the destination MAC of a packet belongs to a local VM,
834  * and if so get its VLAN tag and length offset.
835  */
836 static __rte_always_inline int
837 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
838         uint32_t *offset, uint16_t *vlan_tag)
839 {
840         struct vhost_dev *dst_vdev;
841         struct rte_ether_hdr *pkt_hdr =
842                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
843
844         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
845         if (!dst_vdev)
846                 return 0;
847
848         if (vdev->vid == dst_vdev->vid) {
849                 RTE_LOG_DP(DEBUG, VHOST_DATA,
850                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
851                         vdev->vid);
852                 return -1;
853         }
854
855         /*
856          * HW VLAN strip will reduce the packet length by the length
857          * of the VLAN tag, so we need to restore the packet length
858          * by adding it back.
859          */
860         *offset  = VLAN_HLEN;
861         *vlan_tag = vlan_tags[vdev->vid];
862
863         RTE_LOG_DP(DEBUG, VHOST_DATA,
864                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
865                 vdev->vid, dst_vdev->vid, *vlan_tag);
866
867         return 0;
868 }
869
870 static uint16_t
871 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
872 {
873         if (ol_flags & PKT_TX_IPV4)
874                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
875         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
876                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
877 }
878
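/*
 * Prepare a TSO packet for the NIC: request IP checksum offload for IPv4 and
 * seed the TCP checksum field with the pseudo-header checksum.
 */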
879 static void virtio_tx_offload(struct rte_mbuf *m)
880 {
881         void *l3_hdr;
882         struct rte_ipv4_hdr *ipv4_hdr = NULL;
883         struct rte_tcp_hdr *tcp_hdr = NULL;
884         struct rte_ether_hdr *eth_hdr =
885                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
886
887         l3_hdr = (char *)eth_hdr + m->l2_len;
888
889         if (m->ol_flags & PKT_TX_IPV4) {
890                 ipv4_hdr = l3_hdr;
891                 ipv4_hdr->hdr_checksum = 0;
892                 m->ol_flags |= PKT_TX_IP_CKSUM;
893         }
894
895         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
896         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
897 }
898
899 static inline void
900 free_pkts(struct rte_mbuf **pkts, uint16_t n)
901 {
902         while (n--)
903                 rte_pktmbuf_free(pkts[n]);
904 }
905
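/*
 * Flush the lcore's buffered packets to the physical port's TX queue and
 * free any mbufs the NIC did not accept.
 */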
906 static __rte_always_inline void
907 do_drain_mbuf_table(struct mbuf_table *tx_q)
908 {
909         uint16_t count;
910
911         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
912                                  tx_q->m_table, tx_q->len);
913         if (unlikely(count < tx_q->len))
914                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
915
916         tx_q->len = 0;
917 }
918
919 /*
920  * This function routes the TX packet to the correct interface. This
921  * may be a local device or the physical port.
922  */
923 static __rte_always_inline void
924 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
925 {
926         struct mbuf_table *tx_q;
927         unsigned offset = 0;
928         const uint16_t lcore_id = rte_lcore_id();
929         struct rte_ether_hdr *nh;
930
931
932         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
933         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
934                 struct vhost_dev *vdev2;
935
936                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
937                         if (vdev2 != vdev)
938                                 virtio_xmit(vdev2, vdev, m);
939                 }
940                 goto queue2nic;
941         }
942
943         /*check if destination is local VM*/
944         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
945                 rte_pktmbuf_free(m);
946                 return;
947         }
948
949         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
950                 if (unlikely(find_local_dest(vdev, m, &offset,
951                                              &vlan_tag) != 0)) {
952                         rte_pktmbuf_free(m);
953                         return;
954                 }
955         }
956
957         RTE_LOG_DP(DEBUG, VHOST_DATA,
958                 "(%d) TX: MAC address is external\n", vdev->vid);
959
960 queue2nic:
961
962         /*Add packet to the port tx queue*/
963         tx_q = &lcore_tx_queue[lcore_id];
964
965         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
966         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
967                 /* Guest has inserted the vlan tag. */
968                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
969                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
970                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
971                         (vh->vlan_tci != vlan_tag_be))
972                         vh->vlan_tci = vlan_tag_be;
973         } else {
974                 m->ol_flags |= PKT_TX_VLAN_PKT;
975
976                 /*
977                  * Find the right seg to adjust the data len when offset is
978                  * bigger than tail room size.
979                  */
980                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
981                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
982                                 m->data_len += offset;
983                         else {
984                                 struct rte_mbuf *seg = m;
985
986                                 while ((seg->next != NULL) &&
987                                         (offset > rte_pktmbuf_tailroom(seg)))
988                                         seg = seg->next;
989
990                                 seg->data_len += offset;
991                         }
992                         m->pkt_len += offset;
993                 }
994
995                 m->vlan_tci = vlan_tag;
996         }
997
998         if (m->ol_flags & PKT_TX_TCP_SEG)
999                 virtio_tx_offload(m);
1000
1001         tx_q->m_table[tx_q->len++] = m;
1002         if (enable_stats) {
1003                 vdev->stats.tx_total++;
1004                 vdev->stats.tx++;
1005         }
1006
1007         if (unlikely(tx_q->len == MAX_PKT_BURST))
1008                 do_drain_mbuf_table(tx_q);
1009 }
1010
1011
1012 static __rte_always_inline void
1013 drain_mbuf_table(struct mbuf_table *tx_q)
1014 {
1015         static uint64_t prev_tsc;
1016         uint64_t cur_tsc;
1017
1018         if (tx_q->len == 0)
1019                 return;
1020
1021         cur_tsc = rte_rdtsc();
1022         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1023                 prev_tsc = cur_tsc;
1024
1025                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1026                         "TX queue drained after timeout with burst size %u\n",
1027                         tx_q->len);
1028                 do_drain_mbuf_table(tx_q);
1029         }
1030 }
1031
1032 static __rte_always_inline void
1033 drain_eth_rx(struct vhost_dev *vdev)
1034 {
1035         uint16_t rx_count, enqueue_count;
1036         struct rte_mbuf *pkts[MAX_PKT_BURST];
1037
1038         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1039                                     pkts, MAX_PKT_BURST);
1040         if (!rx_count)
1041                 return;
1042
1043         /*
1044          * When "enable_retry" is set, here we wait and retry when there
1045          * are not enough free slots in the queue to hold @rx_count packets,
1046          * to diminish packet loss.
1047          */
1048         if (enable_retry &&
1049             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1050                         VIRTIO_RXQ))) {
1051                 uint32_t retry;
1052
1053                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1054                         rte_delay_us(burst_rx_delay_time);
1055                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1056                                         VIRTIO_RXQ))
1057                                 break;
1058                 }
1059         }
1060
1061         if (builtin_net_driver) {
1062                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1063                                                 pkts, rx_count);
1064         } else {
1065                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1066                                                 pkts, rx_count);
1067         }
1068         if (enable_stats) {
1069                 rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1070                 rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1071         }
1072
1073         free_pkts(pkts, rx_count);
1074 }
1075
1076 static __rte_always_inline void
1077 drain_virtio_tx(struct vhost_dev *vdev)
1078 {
1079         struct rte_mbuf *pkts[MAX_PKT_BURST];
1080         uint16_t count;
1081         uint16_t i;
1082
1083         if (builtin_net_driver) {
1084                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1085                                         pkts, MAX_PKT_BURST);
1086         } else {
1087                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1088                                         mbuf_pool, pkts, MAX_PKT_BURST);
1089         }
1090
1091         /* setup VMDq for the first packet */
1092         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1093                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1094                         free_pkts(pkts, count);
1095         }
1096
1097         for (i = 0; i < count; ++i)
1098                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1099 }
1100
1101 /*
1102  * Main function of vhost-switch. It basically does:
1103  *
1104  * for each vhost device {
1105  *    - drain_eth_rx()
1106  *
1107  *      Which drains the host eth Rx queue linked to the vhost device,
1108  *      and delivers all of them to the guest virtio Rx ring associated with
1109  *      this vhost device.
1110  *
1111  *    - drain_virtio_tx()
1112  *
1113  *      Which drains the guest virtio Tx queue and delivers all of them
1114  *      to the target, which could be another vhost device, or the
1115  *      physical eth dev. The routing is done in function "virtio_tx_route".
1116  * }
1117  */
1118 static int
1119 switch_worker(void *arg __rte_unused)
1120 {
1121         unsigned i;
1122         unsigned lcore_id = rte_lcore_id();
1123         struct vhost_dev *vdev;
1124         struct mbuf_table *tx_q;
1125
1126         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1127
1128         tx_q = &lcore_tx_queue[lcore_id];
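        /* Map this lcore to a TX queue index: the i-th enabled lcore uses queue i. */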
1129         for (i = 0; i < rte_lcore_count(); i++) {
1130                 if (lcore_ids[i] == lcore_id) {
1131                         tx_q->txq_id = i;
1132                         break;
1133                 }
1134         }
1135
1136         while(1) {
1137                 drain_mbuf_table(tx_q);
1138
1139                 /*
1140                  * If requested, inform the configuration core that we have
1141                  * exited the linked list and that no devices are in use.
1142                  */
1143                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1144                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1145
1146                 /*
1147                  * Process vhost devices
1148                  */
1149                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1150                               lcore_vdev_entry) {
1151                         if (unlikely(vdev->remove)) {
1152                                 unlink_vmdq(vdev);
1153                                 vdev->ready = DEVICE_SAFE_REMOVE;
1154                                 continue;
1155                         }
1156
1157                         if (likely(vdev->ready == DEVICE_RX))
1158                                 drain_eth_rx(vdev);
1159
1160                         if (likely(!vdev->remove))
1161                                 drain_virtio_tx(vdev);
1162                 }
1163         }
1164
1165         return 0;
1166 }
1167
1168 /*
1169  * Remove a device from the specific data core linked list and from the
1170  * main linked list. Synchronization occurs through the use of the
1171  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1172  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1173  */
1174 static void
1175 destroy_device(int vid)
1176 {
1177         struct vhost_dev *vdev = NULL;
1178         int lcore;
1179
1180         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1181                 if (vdev->vid == vid)
1182                         break;
1183         }
1184         if (!vdev)
1185                 return;
1186         /*set the remove flag. */
1187         vdev->remove = 1;
1188         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1189                 rte_pause();
1190         }
1191
1192         if (builtin_net_driver)
1193                 vs_vhost_net_remove(vdev);
1194
1195         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1196                      lcore_vdev_entry);
1197         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1198
1199
1200         /* Set the dev_removal_flag on each lcore. */
1201         RTE_LCORE_FOREACH_SLAVE(lcore)
1202                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1203
1204         /*
1205          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1206          * we can be sure that they can no longer access the device removed
1207          * from the linked lists and that the devices are no longer in use.
1208          */
1209         RTE_LCORE_FOREACH_SLAVE(lcore) {
1210                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1211                         rte_pause();
1212         }
1213
1214         lcore_info[vdev->coreid].device_num--;
1215
1216         RTE_LOG(INFO, VHOST_DATA,
1217                 "(%d) device has been removed from data core\n",
1218                 vdev->vid);
1219
1220         rte_free(vdev);
1221 }
1222
1223 /*
1224  * A new device is added to a data core. First the device is added to the main linked list
1225  * and then allocated to a specific data core.
1226  */
1227 static int
1228 new_device(int vid)
1229 {
1230         int lcore, core_add = 0;
1231         uint32_t device_num_min = num_devices;
1232         struct vhost_dev *vdev;
1233
1234         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1235         if (vdev == NULL) {
1236                 RTE_LOG(INFO, VHOST_DATA,
1237                         "(%d) couldn't allocate memory for vhost dev\n",
1238                         vid);
1239                 return -1;
1240         }
1241         vdev->vid = vid;
1242
1243         if (builtin_net_driver)
1244                 vs_vhost_net_setup(vdev);
1245
1246         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
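        /* Each device gets its own VMDQ pool; use the first RX queue of pool "vid". */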
1247         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1248
1249         /*reset ready flag*/
1250         vdev->ready = DEVICE_MAC_LEARNING;
1251         vdev->remove = 0;
1252
1253         /* Find a suitable lcore to add the device. */
1254         RTE_LCORE_FOREACH_SLAVE(lcore) {
1255                 if (lcore_info[lcore].device_num < device_num_min) {
1256                         device_num_min = lcore_info[lcore].device_num;
1257                         core_add = lcore;
1258                 }
1259         }
1260         vdev->coreid = core_add;
1261
1262         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1263                           lcore_vdev_entry);
1264         lcore_info[vdev->coreid].device_num++;
1265
1266         /* Disable notifications. */
1267         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1268         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1269
1270         RTE_LOG(INFO, VHOST_DATA,
1271                 "(%d) device has been added to data core %d\n",
1272                 vid, vdev->coreid);
1273
1274         return 0;
1275 }
1276
1277 /*
1278  * These callbacks allow devices to be added to the data core when configuration
1279  * has been fully completed.
1280  */
1281 static const struct vhost_device_ops virtio_net_device_ops =
1282 {
1283         .new_device =  new_device,
1284         .destroy_device = destroy_device,
1285 };
1286
1287 /*
1288  * This is a thread that will wake up after a period to print stats if the user has
1289  * enabled them.
1290  */
1291 static void *
1292 print_stats(__rte_unused void *arg)
1293 {
1294         struct vhost_dev *vdev;
1295         uint64_t tx_dropped, rx_dropped;
1296         uint64_t tx, tx_total, rx, rx_total;
1297         const char clr[] = { 27, '[', '2', 'J', '\0' };
1298         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1299
1300         while(1) {
1301                 sleep(enable_stats);
1302
1303                 /* Clear screen and move to top left */
1304                 printf("%s%s\n", clr, top_left);
1305                 printf("Device statistics =================================\n");
1306
1307                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1308                         tx_total   = vdev->stats.tx_total;
1309                         tx         = vdev->stats.tx;
1310                         tx_dropped = tx_total - tx;
1311
1312                         rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1313                         rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1314                         rx_dropped = rx_total - rx;
1315
1316                         printf("Statistics for device %d\n"
1317                                 "-----------------------\n"
1318                                 "TX total:              %" PRIu64 "\n"
1319                                 "TX dropped:            %" PRIu64 "\n"
1320                                 "TX successful:         %" PRIu64 "\n"
1321                                 "RX total:              %" PRIu64 "\n"
1322                                 "RX dropped:            %" PRIu64 "\n"
1323                                 "RX successful:         %" PRIu64 "\n",
1324                                 vdev->vid,
1325                                 tx_total, tx_dropped, tx,
1326                                 rx_total, rx_dropped, rx);
1327                 }
1328
1329                 printf("===================================================\n");
1330         }
1331
1332         return NULL;
1333 }
1334
1335 static void
1336 unregister_drivers(int socket_num)
1337 {
1338         int i, ret;
1339
1340         for (i = 0; i < socket_num; i++) {
1341                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1342                 if (ret != 0)
1343                         RTE_LOG(ERR, VHOST_CONFIG,
1344                                 "Failed to unregister vhost driver for %s.\n",
1345                                 socket_files + i * PATH_MAX);
1346         }
1347 }
1348
1349 /* When we receive an INT signal, unregister the vhost driver */
1350 static void
1351 sigint_handler(__rte_unused int signum)
1352 {
1353         /* Unregister vhost driver. */
1354         unregister_drivers(nb_sockets);
1355
1356         exit(0);
1357 }
1358
1359 /*
1360  * While creating an mbuf pool, one key thing is to figure out how
1361  * many mbuf entries are enough for our use. FYI, here are some
1362  * guidelines:
1363  *
1364  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1365  *
1366  * - For each switch core (a CPU core that does the packet switching), we
1367  *   also need to reserve some mbufs for receiving the packets from the
1368  *   virtio Tx queue. How many are enough depends on the usage. It's
1369  *   normally a simple calculation like the following:
1370  *
1371  *       MAX_PKT_BURST * max packet size / mbuf size
1372  *
1373  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1374  *
1375  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1376  *   mbufs for receiving the packets from the physical NIC device.
1377  *
1378  * - We also need to make sure, for each switch core, we have allocated
1379  *   enough mbufs to fill up the mbuf cache.
1380  */
1381 static void
1382 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1383         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1384 {
1385         uint32_t nr_mbufs;
1386         uint32_t nr_mbufs_per_core;
1387         uint32_t mtu = 1500;
1388
1389         if (mergeable)
1390                 mtu = 9000;
1391         if (enable_tso)
1392                 mtu = 64 * 1024;
1393
1394         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1395                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1396         nr_mbufs_per_core += nr_rx_desc;
1397         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1398
1399         nr_mbufs  = nr_queues * nr_rx_desc;
1400         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1401         nr_mbufs *= nr_port;
1402
1403         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1404                                             nr_mbuf_cache, 0, mbuf_size,
1405                                             rte_socket_id());
1406         if (mbuf_pool == NULL)
1407                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1408 }
1409
1410 /*
1411  * Main function, does initialisation and calls the per-lcore functions.
1412  */
1413 int
1414 main(int argc, char *argv[])
1415 {
1416         unsigned lcore_id, core_id = 0;
1417         unsigned nb_ports, valid_num_ports;
1418         int ret, i;
1419         uint16_t portid;
1420         static pthread_t tid;
1421         uint64_t flags = 0;
1422
1423         signal(SIGINT, sigint_handler);
1424
1425         /* init EAL */
1426         ret = rte_eal_init(argc, argv);
1427         if (ret < 0)
1428                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1429         argc -= ret;
1430         argv += ret;
1431
1432         /* parse app arguments */
1433         ret = us_vhost_parse_args(argc, argv);
1434         if (ret < 0)
1435                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1436
1437         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1438                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1439
1440                 if (rte_lcore_is_enabled(lcore_id))
1441                         lcore_ids[core_id++] = lcore_id;
1442         }
1443
1444         if (rte_lcore_count() > RTE_MAX_LCORE)
1445                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1446
1447         /* Get the number of physical ports. */
1448         nb_ports = rte_eth_dev_count_avail();
1449
1450         /*
1451          * Update the global variable num_ports and the global ports array,
1452          * and get the number of valid ports according to the system port count
1453          */
1454         valid_num_ports = check_ports_num(nb_ports);
1455
1456         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1457                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1458                         " but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1459                 return -1;
1460         }
1461
1462         /*
1463          * FIXME: here we are trying to allocate mbufs big enough for
1464          * @MAX_QUEUES, but the truth is we're never going to use that
1465          * many queues here. We probably should only do allocation for
1466          * those queues we are going to use.
1467          */
1468         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1469                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1470
1471         if (vm2vm_mode == VM2VM_HARDWARE) {
1472                 /* Enable VT loop back to let L2 switch to do it. */
1473                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1474                 RTE_LOG(DEBUG, VHOST_CONFIG,
1475                         "Enable loop back for L2 switch in vmdq.\n");
1476         }
1477
1478         /* initialize all ports */
1479         RTE_ETH_FOREACH_DEV(portid) {
1480                 /* skip ports that are not enabled */
1481                 if ((enabled_port_mask & (1 << portid)) == 0) {
1482                         RTE_LOG(INFO, VHOST_PORT,
1483                                 "Skipping disabled port %d\n", portid);
1484                         continue;
1485                 }
1486                 if (port_init(portid) != 0)
1487                         rte_exit(EXIT_FAILURE,
1488                                 "Cannot initialize network ports\n");
1489         }
1490
1491         /* Enable stats if the user option is set. */
1492         if (enable_stats) {
1493                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1494                                         print_stats, NULL);
1495                 if (ret < 0)
1496                         rte_exit(EXIT_FAILURE,
1497                                 "Cannot create print-stats thread\n");
1498         }
1499
1500         /* Launch all data cores. */
1501         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1502                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1503
1504         if (client_mode)
1505                 flags |= RTE_VHOST_USER_CLIENT;
1506
1507         if (dequeue_zero_copy)
1508                 flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1509
1510         /* Register vhost user driver to handle vhost messages. */
1511         for (i = 0; i < nb_sockets; i++) {
1512                 char *file = socket_files + i * PATH_MAX;
1513                 ret = rte_vhost_driver_register(file, flags);
1514                 if (ret != 0) {
1515                         unregister_drivers(i);
1516                         rte_exit(EXIT_FAILURE,
1517                                 "vhost driver register failure.\n");
1518                 }
1519
1520                 if (builtin_net_driver)
1521                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1522
1523                 if (mergeable == 0) {
1524                         rte_vhost_driver_disable_features(file,
1525                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1526                 }
1527
1528                 if (enable_tx_csum == 0) {
1529                         rte_vhost_driver_disable_features(file,
1530                                 1ULL << VIRTIO_NET_F_CSUM);
1531                 }
1532
1533                 if (enable_tso == 0) {
1534                         rte_vhost_driver_disable_features(file,
1535                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1536                         rte_vhost_driver_disable_features(file,
1537                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1538                         rte_vhost_driver_disable_features(file,
1539                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1540                         rte_vhost_driver_disable_features(file,
1541                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1542                 }
1543
1544                 if (promiscuous) {
1545                         rte_vhost_driver_enable_features(file,
1546                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1547                 }
1548
1549                 ret = rte_vhost_driver_callback_register(file,
1550                         &virtio_net_device_ops);
1551                 if (ret != 0) {
1552                         rte_exit(EXIT_FAILURE,
1553                                 "failed to register vhost driver callbacks.\n");
1554                 }
1555
1556                 if (rte_vhost_driver_start(file) < 0) {
1557                         rte_exit(EXIT_FAILURE,
1558                                 "failed to start vhost driver.\n");
1559                 }
1560         }
1561
1562         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1563                 rte_eal_wait_lcore(lcore_id);
1564
1565         return 0;
1566
1567 }