examples/vhost: fix overflow in argument parsing
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_vhost.h>
23 #include <rte_ip.h>
24 #include <rte_tcp.h>
25 #include <rte_pause.h>
26
27 #include "ioat.h"
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* mask of enabled ports */
59 static uint32_t enabled_port_mask = 0;
60
61 /* Promiscuous mode */
62 static uint32_t promiscuous;
63
64 /* Number of devices/queues to support */
65 static uint32_t num_queues = 0;
66 static uint32_t num_devices;
67
68 static struct rte_mempool *mbuf_pool;
69 static int mergeable;
70
71 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
72 typedef enum {
73         VM2VM_DISABLED = 0,
74         VM2VM_SOFTWARE = 1,
75         VM2VM_HARDWARE = 2,
76         VM2VM_LAST
77 } vm2vm_type;
78 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
79
80 /* Enable stats. */
81 static uint32_t enable_stats = 0;
82 /* Enable retries on RX. */
83 static uint32_t enable_retry = 1;
84
85 /* Enable TX checksum offload (set via --tx-csum) */
86 static uint32_t enable_tx_csum;
87
88 /* Enable TSO offload (set via --tso) */
89 static uint32_t enable_tso;
90
91 static int client_mode;
92
93 static int builtin_net_driver;
94
95 static int async_vhost_driver;
96
97 static char *dma_type;
98
99 /* Specify the timeout (in microseconds) between retries on RX. */
100 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
101 /* Specify the number of retries on RX. */
102 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
103
104 /* Socket file paths. Can be set by user */
105 static char *socket_files;
106 static int nb_sockets;
107
108 /* Empty VMDQ configuration structure. Filled in programmatically. */
109 static struct rte_eth_conf vmdq_conf_default = {
110         .rxmode = {
111                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
112                 .split_hdr_size = 0,
113                 /*
114                  * VLAN strip is necessary for 1G NICs such as the I350;
115                  * without it, IPv4 forwarding in the guest cannot forward
116                  * packets from one virtio device to another.
117                  */
118                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
119         },
120
121         .txmode = {
122                 .mq_mode = ETH_MQ_TX_NONE,
123                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
124                              DEV_TX_OFFLOAD_TCP_CKSUM |
125                              DEV_TX_OFFLOAD_VLAN_INSERT |
126                              DEV_TX_OFFLOAD_MULTI_SEGS |
127                              DEV_TX_OFFLOAD_TCP_TSO),
128         },
129         .rx_adv_conf = {
130                 /*
131                  * should be overridden separately in code with
132                  * appropriate values
133                  */
134                 .vmdq_rx_conf = {
135                         .nb_queue_pools = ETH_8_POOLS,
136                         .enable_default_pool = 0,
137                         .default_pool = 0,
138                         .nb_pool_maps = 0,
139                         .pool_map = {{0, 0},},
140                 },
141         },
142 };
143
144
145 static unsigned lcore_ids[RTE_MAX_LCORE];
146 static uint16_t ports[RTE_MAX_ETHPORTS];
147 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
148 static uint16_t num_pf_queues, num_vmdq_queues;
149 static uint16_t vmdq_pool_base, vmdq_queue_base;
150 static uint16_t queues_per_pool;
151
152 const uint16_t vlan_tags[] = {
153         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
154         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
155         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
156         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
157         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
158         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
159         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
160         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
161 };
162
163 /* ethernet addresses of ports */
164 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
165
166 static struct vhost_dev_tailq_list vhost_dev_list =
167         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
168
169 static struct lcore_info lcore_info[RTE_MAX_LCORE];
170
171 /* Used for queueing bursts of TX packets. */
172 struct mbuf_table {
173         unsigned len;
174         unsigned txq_id;
175         struct rte_mbuf *m_table[MAX_PKT_BURST];
176 };
177
178 struct vhost_bufftable {
179         uint32_t len;
180         uint64_t pre_tsc;
181         struct rte_mbuf *m_table[MAX_PKT_BURST];
182 };
183
184 /* TX queue for each data core. */
185 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
186
187 /*
188  * Vhost TX buffer for each data core.
189  * Every data core maintains a TX buffer for every vhost device,
190  * which is used for batch pkts enqueue for higher performance.
191  */
192 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
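/*
 * Indexing note (descriptive, mirrors the expression used later in
 * drain_vhost() and virtio_tx_local()): the TX buffer for vhost device
 * `vid` on lcore `lcore_id` is
 *
 *     vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + vid]
 *
 * i.e. a flat array indexed per (lcore, device) pair.
 */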
193
194 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
195                                  / US_PER_S * BURST_TX_DRAIN_US)
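/*
 * Worked example (illustrative only, assuming a 2.5 GHz TSC):
 *   cycles per us = (2500000000 + 1000000 - 1) / 1000000 = 2500
 *   MBUF_TABLE_DRAIN_TSC = 2500 * BURST_TX_DRAIN_US = 250000 cycles,
 * so the TX mbuf table is drained roughly every 100 us.
 */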
196 #define VLAN_HLEN       4
197
198 static inline int
199 open_dma(const char *value)
200 {
201         if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
202                 return open_ioat(value);
203
204         return -1;
205 }
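/*
 * Note: only the "ioat" DMA type is recognised here, matching the
 * --dma-type help text; any other value makes open_dma() return -1 and
 * argument parsing abort with "Wrong DMA args".
 */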
206
207 /*
208  * Builds up the correct configuration for VMDQ VLAN pool map
209  * according to the pool & queue limits.
210  */
211 static inline int
212 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
213 {
214         struct rte_eth_vmdq_rx_conf conf;
215         struct rte_eth_vmdq_rx_conf *def_conf =
216                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
217         unsigned i;
218
219         memset(&conf, 0, sizeof(conf));
220         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
221         conf.nb_pool_maps = num_devices;
222         conf.enable_loop_back = def_conf->enable_loop_back;
223         conf.rx_mode = def_conf->rx_mode;
224
225         for (i = 0; i < conf.nb_pool_maps; i++) {
226                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
227                 conf.pool_map[i].pools = (1UL << i);
228         }
229
230         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
231         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
232                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
233         return 0;
234 }
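/*
 * Illustrative result for num_devices == 2: pool_map[0] = { .vlan_id = 1000,
 * .pools = 0x1 } and pool_map[1] = { .vlan_id = 1001, .pools = 0x2 }, i.e.
 * each VMDQ pool accepts exactly one VLAN tag from the vlan_tags[] table.
 */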
235
236 /*
237  * Initialises a given port using global settings and with the rx buffers
238  * coming from the global mbuf_pool.
239  */
240 static inline int
241 port_init(uint16_t port)
242 {
243         struct rte_eth_dev_info dev_info;
244         struct rte_eth_conf port_conf;
245         struct rte_eth_rxconf *rxconf;
246         struct rte_eth_txconf *txconf;
247         int16_t rx_rings, tx_rings;
248         uint16_t rx_ring_size, tx_ring_size;
249         int retval;
250         uint16_t q;
251
252         /* The max pool number from dev_info is used to validate the pool number specified on the command line */
253         retval = rte_eth_dev_info_get(port, &dev_info);
254         if (retval != 0) {
255                 RTE_LOG(ERR, VHOST_PORT,
256                         "Error during getting device (port %u) info: %s\n",
257                         port, strerror(-retval));
258
259                 return retval;
260         }
261
262         rxconf = &dev_info.default_rxconf;
263         txconf = &dev_info.default_txconf;
264         rxconf->rx_drop_en = 1;
265
266         /* Configure the number of supported virtio devices based on VMDQ limits */
267         num_devices = dev_info.max_vmdq_pools;
268
269         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
270         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
271
272         tx_rings = (uint16_t)rte_lcore_count();
273
274         /* Get port configuration. */
275         retval = get_eth_conf(&port_conf, num_devices);
276         if (retval < 0)
277                 return retval;
278         /* NIC queues are divided into pf queues and vmdq queues.  */
279         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
280         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
281         num_vmdq_queues = num_devices * queues_per_pool;
282         num_queues = num_pf_queues + num_vmdq_queues;
283         vmdq_queue_base = dev_info.vmdq_queue_base;
284         vmdq_pool_base  = dev_info.vmdq_pool_base;
285         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
286                 num_pf_queues, num_devices, queues_per_pool);
287
288         if (!rte_eth_dev_is_valid_port(port))
289                 return -1;
290
291         rx_rings = (uint16_t)dev_info.max_rx_queues;
292         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
293                 port_conf.txmode.offloads |=
294                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
295         /* Configure ethernet device. */
296         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
297         if (retval != 0) {
298                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
299                         port, strerror(-retval));
300                 return retval;
301         }
302
303         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
304                 &tx_ring_size);
305         if (retval != 0) {
306                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
307                         "for port %u: %s.\n", port, strerror(-retval));
308                 return retval;
309         }
310         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
311                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
312                         "for Rx queues on port %u.\n", port);
313                 return -1;
314         }
315
316         /* Setup the queues. */
317         rxconf->offloads = port_conf.rxmode.offloads;
318         for (q = 0; q < rx_rings; q ++) {
319                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
320                                                 rte_eth_dev_socket_id(port),
321                                                 rxconf,
322                                                 mbuf_pool);
323                 if (retval < 0) {
324                         RTE_LOG(ERR, VHOST_PORT,
325                                 "Failed to setup rx queue %u of port %u: %s.\n",
326                                 q, port, strerror(-retval));
327                         return retval;
328                 }
329         }
330         txconf->offloads = port_conf.txmode.offloads;
331         for (q = 0; q < tx_rings; q ++) {
332                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
333                                                 rte_eth_dev_socket_id(port),
334                                                 txconf);
335                 if (retval < 0) {
336                         RTE_LOG(ERR, VHOST_PORT,
337                                 "Failed to setup tx queue %u of port %u: %s.\n",
338                                 q, port, strerror(-retval));
339                         return retval;
340                 }
341         }
342
343         /* Start the device. */
344         retval  = rte_eth_dev_start(port);
345         if (retval < 0) {
346                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
347                         port, strerror(-retval));
348                 return retval;
349         }
350
351         if (promiscuous) {
352                 retval = rte_eth_promiscuous_enable(port);
353                 if (retval != 0) {
354                         RTE_LOG(ERR, VHOST_PORT,
355                                 "Failed to enable promiscuous mode on port %u: %s\n",
356                                 port, rte_strerror(-retval));
357                         return retval;
358                 }
359         }
360
361         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
362         if (retval < 0) {
363                 RTE_LOG(ERR, VHOST_PORT,
364                         "Failed to get MAC address on port %u: %s\n",
365                         port, rte_strerror(-retval));
366                 return retval;
367         }
368
369         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
370         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
371                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
372                         port,
373                         vmdq_ports_eth_addr[port].addr_bytes[0],
374                         vmdq_ports_eth_addr[port].addr_bytes[1],
375                         vmdq_ports_eth_addr[port].addr_bytes[2],
376                         vmdq_ports_eth_addr[port].addr_bytes[3],
377                         vmdq_ports_eth_addr[port].addr_bytes[4],
378                         vmdq_ports_eth_addr[port].addr_bytes[5]);
379
380         return 0;
381 }
382
383 /*
384  * Set socket file path.
385  */
386 static int
387 us_vhost_parse_socket_path(const char *q_arg)
388 {
389         char *old;
390
391         /* Reject paths that are too long to fit in PATH_MAX */
392         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
393                 return -1;
394
395         old = socket_files;
396         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
397         if (socket_files == NULL) {
398                 free(old);
399                 return -1;
400         }
401
402         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
403         nb_sockets++;
404
405         return 0;
406 }
407
408 /*
409  * Parse the portmask provided at run time.
410  */
411 static int
412 parse_portmask(const char *portmask)
413 {
414         char *end = NULL;
415         unsigned long pm;
416
417         errno = 0;
418
419         /* parse hexadecimal string */
420         pm = strtoul(portmask, &end, 16);
421         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
422                 return 0;
423
424         return pm;
425
426 }
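/*
 * Example (illustrative): parse_portmask("1") and parse_portmask("0x1")
 * both yield 0x1 (enable port 0 only), while a non-hex string such as
 * "zz" or a value that overflows strtoul() makes the function return 0,
 * which the caller treats as an invalid portmask.
 */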
427
428 /*
429  * Parse num options at run time.
430  */
431 static int
432 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
433 {
434         char *end = NULL;
435         unsigned long num;
436
437         errno = 0;
438
439         /* parse unsigned int string */
440         num = strtoul(q_arg, &end, 10);
441         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
442                 return -1;
443
444         if (num > max_valid_value)
445                 return -1;
446
447         return num;
448
449 }
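/*
 * Example (illustrative): parse_num_opt("2", 2) returns 2, while
 * parse_num_opt("3", 2) and parse_num_opt("abc", 2) both return -1, so
 * callers can distinguish a valid value from "reject and print usage".
 */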
450
451 /*
452  * Display usage
453  */
454 static void
455 us_vhost_usage(const char *prgname)
456 {
457         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
458         "               --vm2vm [0|1|2]\n"
459         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
460         "               --socket-file <path>\n"
461         "               --nb-devices ND\n"
462         "               -p PORTMASK: Set mask for ports to be used by application\n"
463         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
464         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
465         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
466         "               --rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
467         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
468         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
469         "               --socket-file: The path of the socket file.\n"
470         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
471         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
472         "               --client register a vhost-user socket as client mode.\n"
473         "               --dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
474         "               --dmas register dma channel for specific vhost device.\n",
475                prgname);
476 }
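/*
 * Example invocation (illustrative only; the EAL options, binary name and
 * socket path are placeholders):
 *
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 \
 *       --socket-file /tmp/vhost-user.sock --mergeable 1 --stats 1
 *
 * which enables port 0, registers one vhost-user socket, turns on
 * mergeable RX buffers and prints statistics every second.
 */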
477
478 enum {
479 #define OPT_VM2VM               "vm2vm"
480         OPT_VM2VM_NUM = 256,
481 #define OPT_RX_RETRY            "rx-retry"
482         OPT_RX_RETRY_NUM,
483 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
484         OPT_RX_RETRY_DELAY_NUM,
485 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
486         OPT_RX_RETRY_NUMB_NUM,
487 #define OPT_MERGEABLE           "mergeable"
488         OPT_MERGEABLE_NUM,
489 #define OPT_STATS               "stats"
490         OPT_STATS_NUM,
491 #define OPT_SOCKET_FILE         "socket-file"
492         OPT_SOCKET_FILE_NUM,
493 #define OPT_TX_CSUM             "tx-csum"
494         OPT_TX_CSUM_NUM,
495 #define OPT_TSO                 "tso"
496         OPT_TSO_NUM,
497 #define OPT_CLIENT              "client"
498         OPT_CLIENT_NUM,
499 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
500         OPT_BUILTIN_NET_DRIVER_NUM,
501 #define OPT_DMA_TYPE            "dma-type"
502         OPT_DMA_TYPE_NUM,
503 #define OPT_DMAS                "dmas"
504         OPT_DMAS_NUM,
505 };
506
507 /*
508  * Parse the arguments given in the command line of the application.
509  */
510 static int
511 us_vhost_parse_args(int argc, char **argv)
512 {
513         int opt, ret;
514         int option_index;
515         unsigned i;
516         const char *prgname = argv[0];
517         static struct option long_option[] = {
518                 {OPT_VM2VM, required_argument,
519                                 NULL, OPT_VM2VM_NUM},
520                 {OPT_RX_RETRY, required_argument,
521                                 NULL, OPT_RX_RETRY_NUM},
522                 {OPT_RX_RETRY_DELAY, required_argument,
523                                 NULL, OPT_RX_RETRY_DELAY_NUM},
524                 {OPT_RX_RETRY_NUMB, required_argument,
525                                 NULL, OPT_RX_RETRY_NUMB_NUM},
526                 {OPT_MERGEABLE, required_argument,
527                                 NULL, OPT_MERGEABLE_NUM},
528                 {OPT_STATS, required_argument,
529                                 NULL, OPT_STATS_NUM},
530                 {OPT_SOCKET_FILE, required_argument,
531                                 NULL, OPT_SOCKET_FILE_NUM},
532                 {OPT_TX_CSUM, required_argument,
533                                 NULL, OPT_TX_CSUM_NUM},
534                 {OPT_TSO, required_argument,
535                                 NULL, OPT_TSO_NUM},
536                 {OPT_CLIENT, no_argument,
537                                 NULL, OPT_CLIENT_NUM},
538                 {OPT_BUILTIN_NET_DRIVER, no_argument,
539                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
540                 {OPT_DMA_TYPE, required_argument,
541                                 NULL, OPT_DMA_TYPE_NUM},
542                 {OPT_DMAS, required_argument,
543                                 NULL, OPT_DMAS_NUM},
544                 {NULL, 0, 0, 0},
545         };
546
547         /* Parse command line */
548         while ((opt = getopt_long(argc, argv, "p:P",
549                         long_option, &option_index)) != EOF) {
550                 switch (opt) {
551                 /* Portmask */
552                 case 'p':
553                         enabled_port_mask = parse_portmask(optarg);
554                         if (enabled_port_mask == 0) {
555                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
556                                 us_vhost_usage(prgname);
557                                 return -1;
558                         }
559                         break;
560
561                 case 'P':
562                         promiscuous = 1;
563                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
564                                 ETH_VMDQ_ACCEPT_BROADCAST |
565                                 ETH_VMDQ_ACCEPT_MULTICAST;
566                         break;
567
568                 case OPT_VM2VM_NUM:
569                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
570                         if (ret == -1) {
571                                 RTE_LOG(INFO, VHOST_CONFIG,
572                                         "Invalid argument for "
573                                         "vm2vm [0|1|2]\n");
574                                 us_vhost_usage(prgname);
575                                 return -1;
576                         }
577                         vm2vm_mode = (vm2vm_type)ret;
578                         break;
579
580                 case OPT_RX_RETRY_NUM:
581                         ret = parse_num_opt(optarg, 1);
582                         if (ret == -1) {
583                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
584                                 us_vhost_usage(prgname);
585                                 return -1;
586                         }
587                         enable_retry = ret;
588                         break;
589
590                 case OPT_TX_CSUM_NUM:
591                         ret = parse_num_opt(optarg, 1);
592                         if (ret == -1) {
593                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
594                                 us_vhost_usage(prgname);
595                                 return -1;
596                         }
597                         enable_tx_csum = ret;
598                         break;
599
600                 case OPT_TSO_NUM:
601                         ret = parse_num_opt(optarg, 1);
602                         if (ret == -1) {
603                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
604                                 us_vhost_usage(prgname);
605                                 return -1;
606                         }
607                         enable_tso = ret;
608                         break;
609
610                 case OPT_RX_RETRY_DELAY_NUM:
611                         ret = parse_num_opt(optarg, INT32_MAX);
612                         if (ret == -1) {
613                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
614                                 us_vhost_usage(prgname);
615                                 return -1;
616                         }
617                         burst_rx_delay_time = ret;
618                         break;
619
620                 case OPT_RX_RETRY_NUMB_NUM:
621                         ret = parse_num_opt(optarg, INT32_MAX);
622                         if (ret == -1) {
623                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
624                                 us_vhost_usage(prgname);
625                                 return -1;
626                         }
627                         burst_rx_retry_num = ret;
628                         break;
629
630                 case OPT_MERGEABLE_NUM:
631                         ret = parse_num_opt(optarg, 1);
632                         if (ret == -1) {
633                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
634                                 us_vhost_usage(prgname);
635                                 return -1;
636                         }
637                         mergeable = !!ret;
638                         if (ret) {
639                                 vmdq_conf_default.rxmode.offloads |=
640                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
641                                 vmdq_conf_default.rxmode.max_rx_pkt_len
642                                         = JUMBO_FRAME_MAX_SIZE;
643                         }
644                         break;
645
646                 case OPT_STATS_NUM:
647                         ret = parse_num_opt(optarg, INT32_MAX);
648                         if (ret == -1) {
649                                 RTE_LOG(INFO, VHOST_CONFIG,
650                                         "Invalid argument for stats [0..N]\n");
651                                 us_vhost_usage(prgname);
652                                 return -1;
653                         }
654                         enable_stats = ret;
655                         break;
656
657                 /* Set socket file path. */
658                 case OPT_SOCKET_FILE_NUM:
659                         if (us_vhost_parse_socket_path(optarg) == -1) {
660                                 RTE_LOG(INFO, VHOST_CONFIG,
661                                 "Invalid argument for socket name (Max %d characters)\n",
662                                 PATH_MAX);
663                                 us_vhost_usage(prgname);
664                                 return -1;
665                         }
666                         break;
667
668                 case OPT_DMA_TYPE_NUM:
669                         dma_type = optarg;
670                         break;
671
672                 case OPT_DMAS_NUM:
673                         if (open_dma(optarg) == -1) {
674                                 RTE_LOG(INFO, VHOST_CONFIG,
675                                         "Wrong DMA args\n");
676                                 us_vhost_usage(prgname);
677                                 return -1;
678                         }
679                         async_vhost_driver = 1;
680                         break;
681
682                 case OPT_CLIENT_NUM:
683                         client_mode = 1;
684                         break;
685
686                 case OPT_BUILTIN_NET_DRIVER_NUM:
687                         builtin_net_driver = 1;
688                         break;
689
690                 /* Invalid option - print options. */
691                 default:
692                         us_vhost_usage(prgname);
693                         return -1;
694                 }
695         }
696
697         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
698                 if (enabled_port_mask & (1 << i))
699                         ports[num_ports++] = i;
700         }
701
702         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
703                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
704                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
705                 return -1;
706         }
707
708         return 0;
709 }
710
711 /*
712  * Update the global variable num_ports and the ports[] array according to
713  * the number of ports in the system, and return the number of valid ports.
714  */
715 static unsigned check_ports_num(unsigned nb_ports)
716 {
717         unsigned valid_num_ports = num_ports;
718         unsigned portid;
719
720         if (num_ports > nb_ports) {
721                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
722                         num_ports, nb_ports);
723                 num_ports = nb_ports;
724         }
725
726         for (portid = 0; portid < num_ports; portid ++) {
727                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
728                         RTE_LOG(INFO, VHOST_PORT,
729                                 "\nSpecified port ID(%u) is not valid\n",
730                                 ports[portid]);
731                         ports[portid] = INVALID_PORT_ID;
732                         valid_num_ports--;
733                 }
734         }
735         return valid_num_ports;
736 }
737
738 static __rte_always_inline struct vhost_dev *
739 find_vhost_dev(struct rte_ether_addr *mac)
740 {
741         struct vhost_dev *vdev;
742
743         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
744                 if (vdev->ready == DEVICE_RX &&
745                     rte_is_same_ether_addr(mac, &vdev->mac_address))
746                         return vdev;
747         }
748
749         return NULL;
750 }
751
752 /*
753  * This function learns the MAC address of the device and registers this along with a
754  * vlan tag to a VMDQ.
755  */
756 static int
757 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
758 {
759         struct rte_ether_hdr *pkt_hdr;
760         int i, ret;
761
762         /* Learn MAC address of guest device from packet */
763         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
764
765         if (find_vhost_dev(&pkt_hdr->s_addr)) {
766                 RTE_LOG(ERR, VHOST_DATA,
767                         "(%d) device is using a registered MAC!\n",
768                         vdev->vid);
769                 return -1;
770         }
771
772         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
773                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
774
775         /* vlan_tag currently uses the device_id. */
776         vdev->vlan_tag = vlan_tags[vdev->vid];
777
778         /* Print out VMDQ registration info. */
779         RTE_LOG(INFO, VHOST_DATA,
780                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
781                 vdev->vid,
782                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
783                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
784                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
785                 vdev->vlan_tag);
786
787         /* Register the MAC address. */
788         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
789                                 (uint32_t)vdev->vid + vmdq_pool_base);
790         if (ret)
791                 RTE_LOG(ERR, VHOST_DATA,
792                         "(%d) failed to add device MAC address to VMDQ\n",
793                         vdev->vid);
794
795         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
796
797         /* Set device as ready for RX. */
798         vdev->ready = DEVICE_RX;
799
800         return 0;
801 }
802
803 /*
804  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
805  * queue before disabling RX on the device.
806  */
807 static inline void
808 unlink_vmdq(struct vhost_dev *vdev)
809 {
810         unsigned i = 0;
811         unsigned rx_count;
812         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
813
814         if (vdev->ready == DEVICE_RX) {
815                 /*clear MAC and VLAN settings*/
816                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
817                 for (i = 0; i < 6; i++)
818                         vdev->mac_address.addr_bytes[i] = 0;
819
820                 vdev->vlan_tag = 0;
821
822                 /*Clear out the receive buffers*/
823                 rx_count = rte_eth_rx_burst(ports[0],
824                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
825
826                 while (rx_count) {
827                         for (i = 0; i < rx_count; i++)
828                                 rte_pktmbuf_free(pkts_burst[i]);
829
830                         rx_count = rte_eth_rx_burst(ports[0],
831                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
832                 }
833
834                 vdev->ready = DEVICE_MAC_LEARNING;
835         }
836 }
837
838 static inline void
839 free_pkts(struct rte_mbuf **pkts, uint16_t n)
840 {
841         while (n--)
842                 rte_pktmbuf_free(pkts[n]);
843 }
844
845 static __rte_always_inline void
846 complete_async_pkts(struct vhost_dev *vdev)
847 {
848         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
849         uint16_t complete_count;
850
851         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
852                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
853         if (complete_count)
854                 free_pkts(p_cpl, complete_count);
855 }
856
857 static __rte_always_inline void
858 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
859             struct rte_mbuf *m)
860 {
861         uint16_t ret;
862
863         if (builtin_net_driver) {
864                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
865         } else {
866                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
867         }
868
869         if (enable_stats) {
870                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
871                                 __ATOMIC_SEQ_CST);
872                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
873                                 __ATOMIC_SEQ_CST);
874                 src_vdev->stats.tx_total++;
875                 src_vdev->stats.tx += ret;
876         }
877 }
878
879 static __rte_always_inline void
880 drain_vhost(struct vhost_dev *vdev)
881 {
882         uint16_t ret;
883         uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
884         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
885         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
886
887         if (builtin_net_driver) {
888                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
889         } else if (async_vhost_driver) {
890                 uint32_t cpu_cpl_nr = 0;
891                 uint16_t enqueue_fail = 0;
892                 struct rte_mbuf *m_cpu_cpl[nr_xmit];
893
894                 complete_async_pkts(vdev);
895                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
896                                         m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
897
898                 if (cpu_cpl_nr)
899                         free_pkts(m_cpu_cpl, cpu_cpl_nr);
900
901                 enqueue_fail = nr_xmit - ret;
902                 if (enqueue_fail)
903                         free_pkts(&m[ret], nr_xmit - ret);
904         } else {
905                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
906                                                 m, nr_xmit);
907         }
908
909         if (enable_stats) {
910                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
911                                 __ATOMIC_SEQ_CST);
912                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
913                                 __ATOMIC_SEQ_CST);
914         }
915
916         if (!async_vhost_driver)
917                 free_pkts(m, nr_xmit);
918 }
919
920 static __rte_always_inline void
921 drain_vhost_table(void)
922 {
923         uint16_t lcore_id = rte_lcore_id();
924         struct vhost_bufftable *vhost_txq;
925         struct vhost_dev *vdev;
926         uint64_t cur_tsc;
927
928         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
929                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
930                                                 + vdev->vid];
931
932                 cur_tsc = rte_rdtsc();
933                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
934                                 > MBUF_TABLE_DRAIN_TSC)) {
935                         RTE_LOG_DP(DEBUG, VHOST_DATA,
936                                 "Vhost TX queue drained after timeout with burst size %u\n",
937                                 vhost_txq->len);
938                         drain_vhost(vdev);
939                         vhost_txq->len = 0;
940                         vhost_txq->pre_tsc = cur_tsc;
941                 }
942         }
943 }
944
945 /*
946  * Check if the packet's destination MAC address is for a local device. If so,
947  * put the packet on that device's RX queue. If not, return.
948  */
949 static __rte_always_inline int
950 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
951 {
952         struct rte_ether_hdr *pkt_hdr;
953         struct vhost_dev *dst_vdev;
954         struct vhost_bufftable *vhost_txq;
955         uint16_t lcore_id = rte_lcore_id();
956         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
957
958         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
959         if (!dst_vdev)
960                 return -1;
961
962         if (vdev->vid == dst_vdev->vid) {
963                 RTE_LOG_DP(DEBUG, VHOST_DATA,
964                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
965                         vdev->vid);
966                 return 0;
967         }
968
969         RTE_LOG_DP(DEBUG, VHOST_DATA,
970                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
971
972         if (unlikely(dst_vdev->remove)) {
973                 RTE_LOG_DP(DEBUG, VHOST_DATA,
974                         "(%d) device is marked for removal\n", dst_vdev->vid);
975                 return 0;
976         }
977
978         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
979         vhost_txq->m_table[vhost_txq->len++] = m;
980
981         if (enable_stats) {
982                 vdev->stats.tx_total++;
983                 vdev->stats.tx++;
984         }
985
986         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
987                 drain_vhost(dst_vdev);
988                 vhost_txq->len = 0;
989                 vhost_txq->pre_tsc = rte_rdtsc();
990         }
991         return 0;
992 }
993
994 /*
995  * Check if the destination MAC of a packet belongs to a local VM and,
996  * if so, return its vlan tag and the length offset to apply.
997  */
998 static __rte_always_inline int
999 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1000         uint32_t *offset, uint16_t *vlan_tag)
1001 {
1002         struct vhost_dev *dst_vdev;
1003         struct rte_ether_hdr *pkt_hdr =
1004                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1005
1006         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
1007         if (!dst_vdev)
1008                 return 0;
1009
1010         if (vdev->vid == dst_vdev->vid) {
1011                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1012                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1013                         vdev->vid);
1014                 return -1;
1015         }
1016
1017         /*
1018          * HW vlan strip reduces the packet length by the length of the
1019          * vlan tag, so the packet length must be restored by adding the
1020          * tag length back.
1021          */
1022         *offset  = VLAN_HLEN;
1023         *vlan_tag = vlan_tags[vdev->vid];
1024
1025         RTE_LOG_DP(DEBUG, VHOST_DATA,
1026                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1027                 vdev->vid, dst_vdev->vid, *vlan_tag);
1028
1029         return 0;
1030 }
1031
1032 static uint16_t
1033 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1034 {
1035         if (ol_flags & PKT_TX_IPV4)
1036                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1037         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1038                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1039 }
1040
1041 static void virtio_tx_offload(struct rte_mbuf *m)
1042 {
1043         void *l3_hdr;
1044         struct rte_ipv4_hdr *ipv4_hdr = NULL;
1045         struct rte_tcp_hdr *tcp_hdr = NULL;
1046         struct rte_ether_hdr *eth_hdr =
1047                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1048
1049         l3_hdr = (char *)eth_hdr + m->l2_len;
1050
1051         if (m->ol_flags & PKT_TX_IPV4) {
1052                 ipv4_hdr = l3_hdr;
1053                 ipv4_hdr->hdr_checksum = 0;
1054                 m->ol_flags |= PKT_TX_IP_CKSUM;
1055         }
1056
1057         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
1058         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1059 }
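/*
 * Note (descriptive): for TSO the PMD expects the TCP checksum field to be
 * pre-filled with the pseudo-header checksum, which is what get_psd_sum()
 * provides above; the NIC computes the final checksum for each segment.
 */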
1060
1061 static __rte_always_inline void
1062 do_drain_mbuf_table(struct mbuf_table *tx_q)
1063 {
1064         uint16_t count;
1065
1066         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1067                                  tx_q->m_table, tx_q->len);
1068         if (unlikely(count < tx_q->len))
1069                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1070
1071         tx_q->len = 0;
1072 }
1073
1074 /*
1075  * This function routes the TX packet to the correct interface. This
1076  * may be a local device or the physical port.
1077  */
1078 static __rte_always_inline void
1079 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1080 {
1081         struct mbuf_table *tx_q;
1082         unsigned offset = 0;
1083         const uint16_t lcore_id = rte_lcore_id();
1084         struct rte_ether_hdr *nh;
1085
1086
1087         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1088         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1089                 struct vhost_dev *vdev2;
1090
1091                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1092                         if (vdev2 != vdev)
1093                                 sync_virtio_xmit(vdev2, vdev, m);
1094                 }
1095                 goto queue2nic;
1096         }
1097
1098         /*check if destination is local VM*/
1099         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1100                 return;
1101
1102         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1103                 if (unlikely(find_local_dest(vdev, m, &offset,
1104                                              &vlan_tag) != 0)) {
1105                         rte_pktmbuf_free(m);
1106                         return;
1107                 }
1108         }
1109
1110         RTE_LOG_DP(DEBUG, VHOST_DATA,
1111                 "(%d) TX: MAC address is external\n", vdev->vid);
1112
1113 queue2nic:
1114
1115         /*Add packet to the port tx queue*/
1116         tx_q = &lcore_tx_queue[lcore_id];
1117
1118         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1119         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1120                 /* Guest has inserted the vlan tag. */
1121                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1122                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1123                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1124                         (vh->vlan_tci != vlan_tag_be))
1125                         vh->vlan_tci = vlan_tag_be;
1126         } else {
1127                 m->ol_flags |= PKT_TX_VLAN_PKT;
1128
1129                 /*
1130                  * Find the right seg to adjust the data len when offset is
1131                  * bigger than tail room size.
1132                  */
1133                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1134                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1135                                 m->data_len += offset;
1136                         else {
1137                                 struct rte_mbuf *seg = m;
1138
1139                                 while ((seg->next != NULL) &&
1140                                         (offset > rte_pktmbuf_tailroom(seg)))
1141                                         seg = seg->next;
1142
1143                                 seg->data_len += offset;
1144                         }
1145                         m->pkt_len += offset;
1146                 }
1147
1148                 m->vlan_tci = vlan_tag;
1149         }
1150
1151         if (m->ol_flags & PKT_TX_TCP_SEG)
1152                 virtio_tx_offload(m);
1153
1154         tx_q->m_table[tx_q->len++] = m;
1155         if (enable_stats) {
1156                 vdev->stats.tx_total++;
1157                 vdev->stats.tx++;
1158         }
1159
1160         if (unlikely(tx_q->len == MAX_PKT_BURST))
1161                 do_drain_mbuf_table(tx_q);
1162 }
1163
1164
1165 static __rte_always_inline void
1166 drain_mbuf_table(struct mbuf_table *tx_q)
1167 {
1168         static uint64_t prev_tsc;
1169         uint64_t cur_tsc;
1170
1171         if (tx_q->len == 0)
1172                 return;
1173
1174         cur_tsc = rte_rdtsc();
1175         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1176                 prev_tsc = cur_tsc;
1177
1178                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1179                         "TX queue drained after timeout with burst size %u\n",
1180                         tx_q->len);
1181                 do_drain_mbuf_table(tx_q);
1182         }
1183 }
1184
1185 static __rte_always_inline void
1186 drain_eth_rx(struct vhost_dev *vdev)
1187 {
1188         uint16_t rx_count, enqueue_count;
1189         struct rte_mbuf *pkts[MAX_PKT_BURST];
1190
1191         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1192                                     pkts, MAX_PKT_BURST);
1193
1194         if (!rx_count)
1195                 return;
1196
1197         /*
1198          * When "enable_retry" is set, wait and retry when there are not
1199          * enough free slots in the queue to hold @rx_count packets, to
1200          * diminish packet loss.
1201          */
1202         if (enable_retry &&
1203             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1204                         VIRTIO_RXQ))) {
1205                 uint32_t retry;
1206
1207                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1208                         rte_delay_us(burst_rx_delay_time);
1209                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1210                                         VIRTIO_RXQ))
1211                                 break;
1212                 }
1213         }
1214
1215         if (builtin_net_driver) {
1216                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1217                                                 pkts, rx_count);
1218         } else if (async_vhost_driver) {
1219                 uint32_t cpu_cpl_nr = 0;
1220                 uint16_t enqueue_fail = 0;
1221                 struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1222
1223                 complete_async_pkts(vdev);
1224                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1225                                         VIRTIO_RXQ, pkts, rx_count,
1226                                         m_cpu_cpl, &cpu_cpl_nr);
1227                 if (cpu_cpl_nr)
1228                         free_pkts(m_cpu_cpl, cpu_cpl_nr);
1229
1230                 enqueue_fail = rx_count - enqueue_count;
1231                 if (enqueue_fail)
1232                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1233
1234         } else {
1235                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1236                                                 pkts, rx_count);
1237         }
1238
1239         if (enable_stats) {
1240                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1241                                 __ATOMIC_SEQ_CST);
1242                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1243                                 __ATOMIC_SEQ_CST);
1244         }
1245
1246         if (!async_vhost_driver)
1247                 free_pkts(pkts, rx_count);
1248 }
1249
1250 static __rte_always_inline void
1251 drain_virtio_tx(struct vhost_dev *vdev)
1252 {
1253         struct rte_mbuf *pkts[MAX_PKT_BURST];
1254         uint16_t count;
1255         uint16_t i;
1256
1257         if (builtin_net_driver) {
1258                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1259                                         pkts, MAX_PKT_BURST);
1260         } else {
1261                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1262                                         mbuf_pool, pkts, MAX_PKT_BURST);
1263         }
1264
1265         /* setup VMDq for the first packet */
1266         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1267                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1268                         free_pkts(pkts, count);
1269         }
1270
1271         for (i = 0; i < count; ++i)
1272                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1273 }
1274
1275 /*
1276  * Main function of vhost-switch. It basically does:
1277  *
1278  * for each vhost device {
1279  *    - drain_eth_rx()
1280  *
1281  *      Which drains the host eth Rx queue linked to the vhost device
1282  *      and delivers all of the packets to the guest virtio Rx ring
1283  *      associated with this vhost device.
1284  *
1285  *    - drain_virtio_tx()
1286  *
1287  *      Which drains the guest virtio Tx queue and delivers all of the
1288  *      packets to the target, which could be another vhost device or the
1289  *      physical eth dev. The routing is done in virtio_tx_route().
1290  * }
1291  */
1292 static int
1293 switch_worker(void *arg __rte_unused)
1294 {
1295         unsigned i;
1296         unsigned lcore_id = rte_lcore_id();
1297         struct vhost_dev *vdev;
1298         struct mbuf_table *tx_q;
1299
1300         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1301
1302         tx_q = &lcore_tx_queue[lcore_id];
1303         for (i = 0; i < rte_lcore_count(); i++) {
1304                 if (lcore_ids[i] == lcore_id) {
1305                         tx_q->txq_id = i;
1306                         break;
1307                 }
1308         }
1309
1310         while(1) {
1311                 drain_mbuf_table(tx_q);
1312                 drain_vhost_table();
1313                 /*
1314                  * Inform the configuration core that we have exited the
1315                  * linked list and that no devices are in use if requested.
1316                  */
1317                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1318                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1319
1320                 /*
1321                  * Process vhost devices
1322                  */
1323                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1324                               lcore_vdev_entry) {
1325                         if (unlikely(vdev->remove)) {
1326                                 unlink_vmdq(vdev);
1327                                 vdev->ready = DEVICE_SAFE_REMOVE;
1328                                 continue;
1329                         }
1330
1331                         if (likely(vdev->ready == DEVICE_RX))
1332                                 drain_eth_rx(vdev);
1333
1334                         if (likely(!vdev->remove))
1335                                 drain_virtio_tx(vdev);
1336                 }
1337         }
1338
1339         return 0;
1340 }
1341
1342 /*
1343  * Remove a device from the specific data core linked list and from the
1344  * main linked list. Synchronization occurs through the use of the
1345  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1346  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1347  */
1348 static void
1349 destroy_device(int vid)
1350 {
1351         struct vhost_dev *vdev = NULL;
1352         int lcore;
1353         uint16_t i;
1354
1355         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1356                 if (vdev->vid == vid)
1357                         break;
1358         }
1359         if (!vdev)
1360                 return;
1361         /*set the remove flag. */
1362         vdev->remove = 1;
1363         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1364                 rte_pause();
1365         }
1366
1367         for (i = 0; i < RTE_MAX_LCORE; i++)
1368                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1369
1370         if (builtin_net_driver)
1371                 vs_vhost_net_remove(vdev);
1372
1373         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1374                      lcore_vdev_entry);
1375         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1376
1377
1378         /* Set the dev_removal_flag on each lcore. */
1379         RTE_LCORE_FOREACH_WORKER(lcore)
1380                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1381
1382         /*
1383          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1384          * we can be sure that they can no longer access the device removed
1385          * from the linked lists and that the devices are no longer in use.
1386          */
1387         RTE_LCORE_FOREACH_WORKER(lcore) {
1388                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1389                         rte_pause();
1390         }
1391
1392         lcore_info[vdev->coreid].device_num--;
1393
1394         RTE_LOG(INFO, VHOST_DATA,
1395                 "(%d) device has been removed from data core\n",
1396                 vdev->vid);
1397
1398         if (async_vhost_driver)
1399                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1400
1401         rte_free(vdev);
1402 }
1403
1404 /*
1405  * A new device is added to a data core. First the device is added to the main linked list
1406  * and then allocated to a specific data core.
1407  */
1408 static int
1409 new_device(int vid)
1410 {
1411         int lcore, core_add = 0;
1412         uint16_t i;
1413         uint32_t device_num_min = num_devices;
1414         struct vhost_dev *vdev;
1415         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1416         if (vdev == NULL) {
1417                 RTE_LOG(INFO, VHOST_DATA,
1418                         "(%d) couldn't allocate memory for vhost dev\n",
1419                         vid);
1420                 return -1;
1421         }
1422         vdev->vid = vid;
1423
1424         for (i = 0; i < RTE_MAX_LCORE; i++) {
1425                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1426                         = rte_zmalloc("vhost bufftable",
1427                                 sizeof(struct vhost_bufftable),
1428                                 RTE_CACHE_LINE_SIZE);
1429
1430                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1431                         RTE_LOG(INFO, VHOST_DATA,
1432                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1433                         return -1;
1434                 }
1435         }
1436
1437         if (builtin_net_driver)
1438                 vs_vhost_net_setup(vdev);
1439
1440         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1441         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1442
1443         /*reset ready flag*/
1444         vdev->ready = DEVICE_MAC_LEARNING;
1445         vdev->remove = 0;
1446
1447         /* Find a suitable lcore to add the device. */
1448         RTE_LCORE_FOREACH_WORKER(lcore) {
1449                 if (lcore_info[lcore].device_num < device_num_min) {
1450                         device_num_min = lcore_info[lcore].device_num;
1451                         core_add = lcore;
1452                 }
1453         }
1454         vdev->coreid = core_add;
1455
1456         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1457                           lcore_vdev_entry);
1458         lcore_info[vdev->coreid].device_num++;
1459
1460         /* Disable notifications. */
1461         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1462         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1463
1464         RTE_LOG(INFO, VHOST_DATA,
1465                 "(%d) device has been added to data core %d\n",
1466                 vid, vdev->coreid);
1467
1468         if (async_vhost_driver) {
1469                 struct rte_vhost_async_features f;
1470                 struct rte_vhost_async_channel_ops channel_ops;
1471
1472                 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1473                         channel_ops.transfer_data = ioat_transfer_data_cb;
1474                         channel_ops.check_completed_copies =
1475                                 ioat_check_completed_copies_cb;
1476
1477                         f.async_inorder = 1;
1478                         f.async_threshold = 256;
1479
1480                         return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1481                                 f.intval, &channel_ops);
1482                 }
1483         }
1484
1485         return 0;
1486 }
1487
1488 /*
1489  * These callbacks allow a device to be added to a data core once its
1490  * configuration is complete, and to be removed again when it is destroyed.
1491  */
1492 static const struct vhost_device_ops virtio_net_device_ops =
1493 {
1494         .new_device =  new_device,
1495         .destroy_device = destroy_device,
1496 };
1497
1498 /*
1499  * This thread wakes up periodically to print statistics, if the user
1500  * has enabled them.
1501  */
1502 static void *
1503 print_stats(__rte_unused void *arg)
1504 {
1505         struct vhost_dev *vdev;
1506         uint64_t tx_dropped, rx_dropped;
1507         uint64_t tx, tx_total, rx, rx_total;
1508         const char clr[] = { 27, '[', '2', 'J', '\0' };
1509         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1510
1511         while(1) {
1512                 sleep(enable_stats);
1513
1514                 /* Clear screen and move to top left */
1515                 printf("%s%s\n", clr, top_left);
1516                 printf("Device statistics =================================\n");
1517
1518                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1519                         tx_total   = vdev->stats.tx_total;
1520                         tx         = vdev->stats.tx;
1521                         tx_dropped = tx_total - tx;
1522
1523                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1524                                 __ATOMIC_SEQ_CST);
1525                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1526                                 __ATOMIC_SEQ_CST);
1527                         rx_dropped = rx_total - rx;
1528
1529                         printf("Statistics for device %d\n"
1530                                 "-----------------------\n"
1531                                 "TX total:              %" PRIu64 "\n"
1532                                 "TX dropped:            %" PRIu64 "\n"
1533                                 "TX successful:         %" PRIu64 "\n"
1534                                 "RX total:              %" PRIu64 "\n"
1535                                 "RX dropped:            %" PRIu64 "\n"
1536                                 "RX successful:         %" PRIu64 "\n",
1537                                 vdev->vid,
1538                                 tx_total, tx_dropped, tx,
1539                                 rx_total, rx_dropped, rx);
1540                 }
1541
1542                 printf("===================================================\n");
1543
1544                 fflush(stdout);
1545         }
1546
1547         return NULL;
1548 }
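
/*
 * The Rx counters read above with __atomic_load_n() are expected to be
 * updated by the data-path cores with matching atomic operations, e.g.
 * (sketch only; the exact call sites in the Rx path may differ):
 *
 *      __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
 *                      __ATOMIC_SEQ_CST);
 *      __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
 *                      __ATOMIC_SEQ_CST);
 *
 * The Tx counters are only written by the core that owns the device,
 * so they are read here with plain loads.
 */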
1549
1550 static void
1551 unregister_drivers(int socket_num)
1552 {
1553         int i, ret;
1554
1555         for (i = 0; i < socket_num; i++) {
1556                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1557                 if (ret != 0)
1558                         RTE_LOG(ERR, VHOST_CONFIG,
1559                                 "Fail to unregister vhost driver for %s.\n",
1560                                 socket_files + i * PATH_MAX);
1561         }
1562 }
1563
1564 /* When we receive a SIGINT signal, unregister the vhost drivers and exit. */
1565 static void
1566 sigint_handler(__rte_unused int signum)
1567 {
1568         /* Unregister vhost driver. */
1569         unregister_drivers(nb_sockets);
1570
1571         exit(0);
1572 }
1573
1574 /*
1575  * While creating an mbuf pool, one key thing is to figure out how
1576  * many mbuf entries are enough for our use. Here are some
1577  * guidelines:
1578  *
1579  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1580  *
1581  * - For each switch core (a CPU core that does the packet switching),
1582  *   we also need to make some reservation for receiving the packets
1583  *   from the virtio Tx queue. How many are enough depends on the usage.
1584  *   It's normally a simple calculation like the following:
1585  *
1586  *       MAX_PKT_BURST * max packet size / mbuf size
1587  *
1588  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1589  *
1590  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1591  *   mbufs for receiving the packets from the physical NIC device.
1592  *
1593  * - We also need to make sure that, for each switch core, we have
1594  *   allocated enough mbufs to fill up the mbuf cache.
1595  */
1596 static void
1597 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1598         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1599 {
1600         uint32_t nr_mbufs;
1601         uint32_t nr_mbufs_per_core;
1602         uint32_t mtu = 1500;
1603
1604         if (mergeable)
1605                 mtu = 9000;
1606         if (enable_tso)
1607                 mtu = 64 * 1024;
1608
1609         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1610                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1611         nr_mbufs_per_core += nr_rx_desc;
1612         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1613
1614         nr_mbufs  = nr_queues * nr_rx_desc;
1615         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1616         nr_mbufs *= nr_port;
1617
1618         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1619                                             nr_mbuf_cache, 0, mbuf_size,
1620                                             rte_socket_id());
1621         if (mbuf_pool == NULL)
1622                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1623 }
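
/*
 * Worked example of the sizing above, assuming the defaults of this
 * application and of a stock DPDK build (MBUF_DATA_SIZE = 2176 with a
 * 128-byte headroom, MAX_PKT_BURST = 32, nr_rx_desc = 1024,
 * nr_mbuf_cache = 128, non-mergeable so mtu = 1500):
 *
 *      nr_mbufs_per_core  = (1500 + 2176) * 32 / (2176 - 128) =   57
 *      nr_mbufs_per_core += 1024                               = 1081
 *      nr_mbufs = MAX_QUEUES * 1024 + 1081 * nr_switch_core
 *
 * With TSO enabled the assumed packet size grows to 64 KB and the
 * per-core term rises to roughly 1058 mbufs, but the pool size is
 * still dominated by the per-queue descriptor reservation.
 */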
1624
1625 /*
1626  * Main function, does initialisation and calls the per-lcore functions.
1627  */
1628 int
1629 main(int argc, char *argv[])
1630 {
1631         unsigned lcore_id, core_id = 0;
1632         unsigned nb_ports, valid_num_ports;
1633         int ret, i;
1634         uint16_t portid;
1635         static pthread_t tid;
1636         uint64_t flags = 0;
1637
1638         signal(SIGINT, sigint_handler);
1639
1640         /* init EAL */
1641         ret = rte_eal_init(argc, argv);
1642         if (ret < 0)
1643                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1644         argc -= ret;
1645         argv += ret;
1646
1647         /* parse app arguments */
1648         ret = us_vhost_parse_args(argc, argv);
1649         if (ret < 0)
1650                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1651
1652         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1653                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1654
1655                 if (rte_lcore_is_enabled(lcore_id))
1656                         lcore_ids[core_id++] = lcore_id;
1657         }
1658
1659         if (rte_lcore_count() > RTE_MAX_LCORE)
1660                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1661
1662         /* Get the number of physical ports. */
1663         nb_ports = rte_eth_dev_count_avail();
1664
1665         /*
1666          * Update the global variable num_ports and the global array ports, and
1667          * get the value of valid_num_ports according to the system port count.
1668          */
1669         valid_num_ports = check_ports_num(nb_ports);
1670
1671         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1672                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1673                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1674                 return -1;
1675         }
1676
1677         /*
1678          * FIXME: here we are trying to allocate mbufs big enough for
1679          * @MAX_QUEUES, but the truth is we're never going to use that
1680          * many queues here. We probably should only do allocation for
1681          * those queues we are going to use.
1682          */
1683         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1684                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1685
1686         if (vm2vm_mode == VM2VM_HARDWARE) {
1687                 /* Enable VT loop back to let L2 switch to do it. */
1688                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1689                 RTE_LOG(DEBUG, VHOST_CONFIG,
1690                         "Enable loop back for L2 switch in vmdq.\n");
1691         }
1692
1693         /* initialize all ports */
1694         RTE_ETH_FOREACH_DEV(portid) {
1695                 /* skip ports that are not enabled */
1696                 if ((enabled_port_mask & (1 << portid)) == 0) {
1697                         RTE_LOG(INFO, VHOST_PORT,
1698                                 "Skipping disabled port %d\n", portid);
1699                         continue;
1700                 }
1701                 if (port_init(portid) != 0)
1702                         rte_exit(EXIT_FAILURE,
1703                                 "Cannot initialize network ports\n");
1704         }
1705
1706         /* Enable stats if the user option is set. */
1707         if (enable_stats) {
1708                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1709                                         print_stats, NULL);
1710                 if (ret < 0)
1711                         rte_exit(EXIT_FAILURE,
1712                                 "Cannot create print-stats thread\n");
1713         }
1714
1715         /* Launch all data cores. */
1716         RTE_LCORE_FOREACH_WORKER(lcore_id)
1717                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1718
1719         if (client_mode)
1720                 flags |= RTE_VHOST_USER_CLIENT;
1721
1722         /* Register vhost user driver to handle vhost messages. */
1723         for (i = 0; i < nb_sockets; i++) {
1724                 char *file = socket_files + i * PATH_MAX;
1725
1726                 if (async_vhost_driver)
1727                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1728
1729                 ret = rte_vhost_driver_register(file, flags);
1730                 if (ret != 0) {
1731                         unregister_drivers(i);
1732                         rte_exit(EXIT_FAILURE,
1733                                 "vhost driver register failure.\n");
1734                 }
1735
1736                 if (builtin_net_driver)
1737                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1738
1739                 if (mergeable == 0) {
1740                         rte_vhost_driver_disable_features(file,
1741                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1742                 }
1743
1744                 if (enable_tx_csum == 0) {
1745                         rte_vhost_driver_disable_features(file,
1746                                 1ULL << VIRTIO_NET_F_CSUM);
1747                 }
1748
1749                 if (enable_tso == 0) {
1750                         rte_vhost_driver_disable_features(file,
1751                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1752                         rte_vhost_driver_disable_features(file,
1753                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1754                         rte_vhost_driver_disable_features(file,
1755                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1756                         rte_vhost_driver_disable_features(file,
1757                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1758                 }
1759
1760                 if (promiscuous) {
1761                         rte_vhost_driver_enable_features(file,
1762                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1763                 }
1764
1765                 ret = rte_vhost_driver_callback_register(file,
1766                         &virtio_net_device_ops);
1767                 if (ret != 0) {
1768                         rte_exit(EXIT_FAILURE,
1769                                 "failed to register vhost driver callbacks.\n");
1770                 }
1771
1772                 if (rte_vhost_driver_start(file) < 0) {
1773                         rte_exit(EXIT_FAILURE,
1774                                 "failed to start vhost driver.\n");
1775                 }
1776         }
1777
1778         RTE_LCORE_FOREACH_WORKER(lcore_id)
1779                 rte_eal_wait_lcore(lcore_id);
1780
1781         /* clean up the EAL */
1782         rte_eal_cleanup();
1783
1784         return 0;
1785 }
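
/*
 * For reference, a typical invocation of this example (option names as
 * accepted by us_vhost_parse_args(); the binary name depends on how the
 * examples were built) might be:
 *
 *      ./dpdk-vhost -l 0-3 -n 4 -- \
 *              -p 0x1 --socket-file /tmp/sock0 --client --stats 1
 *
 * which enables one physical port, registers a single vhost-user socket
 * in client mode and prints statistics every second.
 */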