examples/vhost: enhance getopt_long usage
[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_vhost.h>
23 #include <rte_ip.h>
24 #include <rte_tcp.h>
25 #include <rte_pause.h>
26
27 #include "ioat.h"
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66
67 /* Number of devices/queues to support */
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76         VM2VM_DISABLED = 0,
77         VM2VM_SOFTWARE = 1,
78         VM2VM_HARDWARE = 2,
79         VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93
94 static int client_mode;
95
96 static int builtin_net_driver;
97
98 static int async_vhost_driver;
99
100 static char dma_type[MAX_LONG_OPT_SZ];
101
102 /* Specify timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110
111 /* Empty VMDQ configuration structure. Filled in programmatically. */
112 static struct rte_eth_conf vmdq_conf_default = {
113         .rxmode = {
114                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115                 .split_hdr_size = 0,
116                 /*
117                  * VLAN strip is necessary for 1G NICs such as the I350;
118                  * it fixes a bug where IPv4 forwarding in the guest cannot
119                  * forward packets from one virtio dev to another virtio dev.
120                  */
121                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122         },
123
124         .txmode = {
125                 .mq_mode = ETH_MQ_TX_NONE,
126                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
127                              DEV_TX_OFFLOAD_TCP_CKSUM |
128                              DEV_TX_OFFLOAD_VLAN_INSERT |
129                              DEV_TX_OFFLOAD_MULTI_SEGS |
130                              DEV_TX_OFFLOAD_TCP_TSO),
131         },
132         .rx_adv_conf = {
133                 /*
134                  * should be overridden separately in code with
135                  * appropriate values
136                  */
137                 .vmdq_rx_conf = {
138                         .nb_queue_pools = ETH_8_POOLS,
139                         .enable_default_pool = 0,
140                         .default_pool = 0,
141                         .nb_pool_maps = 0,
142                         .pool_map = {{0, 0},},
143                 },
144         },
145 };
146
147
148 static unsigned lcore_ids[RTE_MAX_LCORE];
149 static uint16_t ports[RTE_MAX_ETHPORTS];
150 static unsigned num_ports = 0; /**< The number of ports specified in command line */
151 static uint16_t num_pf_queues, num_vmdq_queues;
152 static uint16_t vmdq_pool_base, vmdq_queue_base;
153 static uint16_t queues_per_pool;
154
155 const uint16_t vlan_tags[] = {
156         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
157         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
158         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
159         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
160         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
161         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
162         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
163         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
164 };
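/*
 * For illustration: each vhost device id indexes this table directly
 * (see link_vmdq() below), so with the defaults above device 0 gets
 * VLAN 1000, device 1 gets VLAN 1001, and so on:
 *
 *     vdev->vlan_tag = vlan_tags[vdev->vid];   (vid 2 -> VLAN 1002)
 */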
165
166 /* ethernet addresses of ports */
167 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
168
169 static struct vhost_dev_tailq_list vhost_dev_list =
170         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
171
172 static struct lcore_info lcore_info[RTE_MAX_LCORE];
173
174 /* Used for queueing bursts of TX packets. */
175 struct mbuf_table {
176         unsigned len;
177         unsigned txq_id;
178         struct rte_mbuf *m_table[MAX_PKT_BURST];
179 };
180
181 struct vhost_bufftable {
182         uint32_t len;
183         uint64_t pre_tsc;
184         struct rte_mbuf *m_table[MAX_PKT_BURST];
185 };
186
187 /* TX queue for each data core. */
188 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
189
190 /*
191  * Vhost TX buffer for each data core.
192  * Every data core maintains a TX buffer for every vhost device,
193  * which is used for batch pkts enqueue for higher performance.
194  */
195 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
196
197 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
198                                  / US_PER_S * BURST_TX_DRAIN_US)
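/*
 * Worked example for MBUF_TABLE_DRAIN_TSC above: it converts the drain
 * period from microseconds to TSC cycles, rounding the per-microsecond
 * cycle count up.  Assuming a 2.0 GHz TSC, that is
 * ceil(2000000000 / 1000000) * 100 = 2000 * 100 = 200000 cycles per
 * BURST_TX_DRAIN_US (100 us) drain interval.
 */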
199 #define VLAN_HLEN       4
200
201 static inline int
202 open_dma(const char *value)
203 {
204         if (strncmp(dma_type, "ioat", 4) == 0)
205                 return open_ioat(value);
206
207         return -1;
208 }
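/*
 * Usage sketch (argument format assumed; open_ioat() in ioat.c is the
 * authoritative parser): open_dma() is only reached when --dma-type ioat
 * was given, and the --dmas value maps vhost TX queues to DMA devices,
 * e.g. something along the lines of:
 *
 *     --dma-type ioat --dmas [txd0@00:04.0,txd1@00:04.1]
 */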
209
210 /*
211  * Builds up the correct configuration for VMDQ VLAN pool map
212  * according to the pool & queue limits.
213  */
214 static inline int
215 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
216 {
217         struct rte_eth_vmdq_rx_conf conf;
218         struct rte_eth_vmdq_rx_conf *def_conf =
219                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
220         unsigned i;
221
222         memset(&conf, 0, sizeof(conf));
223         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
224         conf.nb_pool_maps = num_devices;
225         conf.enable_loop_back = def_conf->enable_loop_back;
226         conf.rx_mode = def_conf->rx_mode;
227
228         for (i = 0; i < conf.nb_pool_maps; i++) {
229                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
230                 conf.pool_map[i].pools = (1UL << i);
231         }
232
233         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
234         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
235                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
236         return 0;
237 }
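/*
 * Sketch of the result for num_devices == 2 (values follow from the loop
 * above and the vlan_tags table):
 *
 *     conf.nb_queue_pools = 2;
 *     conf.pool_map[0] = { .vlan_id = 1000, .pools = 0x1 };   pool 0 only
 *     conf.pool_map[1] = { .vlan_id = 1001, .pools = 0x2 };   pool 1 only
 *
 * i.e. each VLAN tag steers received traffic into exactly one VMDQ pool.
 */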
238
239 /*
240  * Initialises a given port using global settings and with the rx buffers
241  * coming from the mbuf_pool passed as parameter
242  */
243 static inline int
244 port_init(uint16_t port)
245 {
246         struct rte_eth_dev_info dev_info;
247         struct rte_eth_conf port_conf;
248         struct rte_eth_rxconf *rxconf;
249         struct rte_eth_txconf *txconf;
250         int16_t rx_rings, tx_rings;
251         uint16_t rx_ring_size, tx_ring_size;
252         int retval;
253         uint16_t q;
254
255         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
256         retval = rte_eth_dev_info_get(port, &dev_info);
257         if (retval != 0) {
258                 RTE_LOG(ERR, VHOST_PORT,
259                         "Error during getting device (port %u) info: %s\n",
260                         port, strerror(-retval));
261
262                 return retval;
263         }
264
265         rxconf = &dev_info.default_rxconf;
266         txconf = &dev_info.default_txconf;
267         rxconf->rx_drop_en = 1;
268
269         /* Configure the number of supported virtio devices based on VMDQ limits. */
270         num_devices = dev_info.max_vmdq_pools;
271
272         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
273         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
274
275         tx_rings = (uint16_t)rte_lcore_count();
276
277         /* Get port configuration. */
278         retval = get_eth_conf(&port_conf, num_devices);
279         if (retval < 0)
280                 return retval;
281         /* NIC queues are divided into pf queues and vmdq queues.  */
282         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
283         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
284         num_vmdq_queues = num_devices * queues_per_pool;
285         num_queues = num_pf_queues + num_vmdq_queues;
286         vmdq_queue_base = dev_info.vmdq_queue_base;
287         vmdq_pool_base  = dev_info.vmdq_pool_base;
288         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
289                 num_pf_queues, num_devices, queues_per_pool);
290
291         if (!rte_eth_dev_is_valid_port(port))
292                 return -1;
293
294         rx_rings = (uint16_t)dev_info.max_rx_queues;
295         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
296                 port_conf.txmode.offloads |=
297                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
298         /* Configure ethernet device. */
299         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
300         if (retval != 0) {
301                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
302                         port, strerror(-retval));
303                 return retval;
304         }
305
306         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
307                 &tx_ring_size);
308         if (retval != 0) {
309                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
310                         "for port %u: %s.\n", port, strerror(-retval));
311                 return retval;
312         }
313         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
314                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
315                         "for Rx queues on port %u.\n", port);
316                 return -1;
317         }
318
319         /* Setup the queues. */
320         rxconf->offloads = port_conf.rxmode.offloads;
321         for (q = 0; q < rx_rings; q ++) {
322                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323                                                 rte_eth_dev_socket_id(port),
324                                                 rxconf,
325                                                 mbuf_pool);
326                 if (retval < 0) {
327                         RTE_LOG(ERR, VHOST_PORT,
328                                 "Failed to setup rx queue %u of port %u: %s.\n",
329                                 q, port, strerror(-retval));
330                         return retval;
331                 }
332         }
333         txconf->offloads = port_conf.txmode.offloads;
334         for (q = 0; q < tx_rings; q ++) {
335                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
336                                                 rte_eth_dev_socket_id(port),
337                                                 txconf);
338                 if (retval < 0) {
339                         RTE_LOG(ERR, VHOST_PORT,
340                                 "Failed to setup tx queue %u of port %u: %s.\n",
341                                 q, port, strerror(-retval));
342                         return retval;
343                 }
344         }
345
346         /* Start the device. */
347         retval  = rte_eth_dev_start(port);
348         if (retval < 0) {
349                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
350                         port, strerror(-retval));
351                 return retval;
352         }
353
354         if (promiscuous) {
355                 retval = rte_eth_promiscuous_enable(port);
356                 if (retval != 0) {
357                         RTE_LOG(ERR, VHOST_PORT,
358                                 "Failed to enable promiscuous mode on port %u: %s\n",
359                                 port, rte_strerror(-retval));
360                         return retval;
361                 }
362         }
363
364         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
365         if (retval < 0) {
366                 RTE_LOG(ERR, VHOST_PORT,
367                         "Failed to get MAC address on port %u: %s\n",
368                         port, rte_strerror(-retval));
369                 return retval;
370         }
371
372         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
373         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
374                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
375                         port,
376                         vmdq_ports_eth_addr[port].addr_bytes[0],
377                         vmdq_ports_eth_addr[port].addr_bytes[1],
378                         vmdq_ports_eth_addr[port].addr_bytes[2],
379                         vmdq_ports_eth_addr[port].addr_bytes[3],
380                         vmdq_ports_eth_addr[port].addr_bytes[4],
381                         vmdq_ports_eth_addr[port].addr_bytes[5]);
382
383         return 0;
384 }
385
386 /*
387  * Set socket file path.
388  */
389 static int
390 us_vhost_parse_socket_path(const char *q_arg)
391 {
392         char *old;
393
394         /* parse number string */
395         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
396                 return -1;
397
398         old = socket_files;
399         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
400         if (socket_files == NULL) {
401                 free(old);
402                 return -1;
403         }
404
405         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
406         nb_sockets++;
407
408         return 0;
409 }
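/*
 * Note on the layout used above: socket_files is a single flat buffer of
 * nb_sockets fixed-size slots of PATH_MAX bytes each, so the i-th socket
 * path lives at (socket_files + i * PATH_MAX) and is expected to be
 * indexed that way wherever the sockets are registered.
 */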
410
411 /*
412  * Parse the portmask provided at run time.
413  */
414 static int
415 parse_portmask(const char *portmask)
416 {
417         char *end = NULL;
418         unsigned long pm;
419
420         errno = 0;
421
422         /* parse hexadecimal string */
423         pm = strtoul(portmask, &end, 16);
424         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
425                 return 0;
426
427         return pm;
428
429 }
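/*
 * Example: "-p 0x1" yields a mask of 0x1 and enables port 0 only, which
 * fits MAX_SUP_PORTS (1) above; "-p 0x3" would request ports 0 and 1 and
 * be rejected later by the MAX_SUP_PORTS check in us_vhost_parse_args().
 */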
430
431 /*
432  * Parse num options at run time.
433  */
434 static int
435 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
436 {
437         char *end = NULL;
438         unsigned long num;
439
440         errno = 0;
441
442         /* parse unsigned int string */
443         num = strtoul(q_arg, &end, 10);
444         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
445                 return -1;
446
447         if (num > max_valid_value)
448                 return -1;
449
450         return num;
451
452 }
453
454 /*
455  * Display usage
456  */
457 static void
458 us_vhost_usage(const char *prgname)
459 {
460         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
461         "               --vm2vm [0|1|2]\n"
462         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
463         "               --socket-file <path>\n"
464         "               --nb-devices ND\n"
465         "               -p PORTMASK: Set mask for ports to be used by application\n"
466         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
467         "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
468         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This takes effect only if retries on RX are enabled\n"
469         "               --rx-retry-num [0-N]: the number of retries on RX. This takes effect only if retries on RX are enabled\n"
470         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
471         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
472         "               --socket-file: The path of the socket file.\n"
473         "               --tx-csum [0|1]: disable/enable TX checksum offload.\n"
474         "               --tso [0|1]: disable/enable TCP segmentation offload.\n"
475         "               --client: register the vhost-user socket in client mode.\n"
476         "               --dma-type: register the DMA type for the vhost async driver, e.g. \"ioat\".\n"
477         "               --dmas: register a DMA channel for a specific vhost device.\n",
478                prgname);
479 }
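/*
 * A minimal example invocation, assuming the sample binary is built as
 * "dpdk-vhost" (adjust the EAL options to the local setup):
 *
 *     ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 --mergeable 1 --stats 1
 */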
480
481 enum {
482 #define OPT_VM2VM               "vm2vm"
483         OPT_VM2VM_NUM = 256,
484 #define OPT_RX_RETRY            "rx-retry"
485         OPT_RX_RETRY_NUM,
486 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
487         OPT_RX_RETRY_DELAY_NUM,
488 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
489         OPT_RX_RETRY_NUMB_NUM,
490 #define OPT_MERGEABLE           "mergeable"
491         OPT_MERGEABLE_NUM,
492 #define OPT_STATS               "stats"
493         OPT_STATS_NUM,
494 #define OPT_SOCKET_FILE         "socket-file"
495         OPT_SOCKET_FILE_NUM,
496 #define OPT_TX_CSUM             "tx-csum"
497         OPT_TX_CSUM_NUM,
498 #define OPT_TSO                 "tso"
499         OPT_TSO_NUM,
500 #define OPT_CLIENT              "client"
501         OPT_CLIENT_NUM,
502 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
503         OPT_BUILTIN_NET_DRIVER_NUM,
504 #define OPT_DMA_TYPE            "dma-type"
505         OPT_DMA_TYPE_NUM,
506 #define OPT_DMAS                "dmas"
507         OPT_DMAS_NUM,
508 };
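/*
 * The pattern above is what the getopt_long() handling relies on: every
 * long-only option gets a unique return value starting at 256, i.e. just
 * past the range of single-character (short) options, so each option can
 * be matched directly in the switch statement below rather than by
 * comparing option names as strings.  A minimal sketch of the idea:
 *
 *     static const struct option opts[] = {
 *         { "mergeable", required_argument, NULL, OPT_MERGEABLE_NUM },
 *         { NULL, 0, 0, 0 },
 *     };
 *     while ((c = getopt_long(argc, argv, "p:P", opts, &idx)) != -1)
 *         if (c == OPT_MERGEABLE_NUM)
 *             mergeable = !!atoi(optarg);
 */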
509
510 /*
511  * Parse the arguments given in the command line of the application.
512  */
513 static int
514 us_vhost_parse_args(int argc, char **argv)
515 {
516         int opt, ret;
517         int option_index;
518         unsigned i;
519         const char *prgname = argv[0];
520         static struct option long_option[] = {
521                 {OPT_VM2VM, required_argument,
522                                 NULL, OPT_VM2VM_NUM},
523                 {OPT_RX_RETRY, required_argument,
524                                 NULL, OPT_RX_RETRY_NUM},
525                 {OPT_RX_RETRY_DELAY, required_argument,
526                                 NULL, OPT_RX_RETRY_DELAY_NUM},
527                 {OPT_RX_RETRY_NUMB, required_argument,
528                                 NULL, OPT_RX_RETRY_NUMB_NUM},
529                 {OPT_MERGEABLE, required_argument,
530                                 NULL, OPT_MERGEABLE_NUM},
531                 {OPT_STATS, required_argument,
532                                 NULL, OPT_STATS_NUM},
533                 {OPT_SOCKET_FILE, required_argument,
534                                 NULL, OPT_SOCKET_FILE_NUM},
535                 {OPT_TX_CSUM, required_argument,
536                                 NULL, OPT_TX_CSUM_NUM},
537                 {OPT_TSO, required_argument,
538                                 NULL, OPT_TSO_NUM},
539                 {OPT_CLIENT, no_argument,
540                                 NULL, OPT_CLIENT_NUM},
541                 {OPT_BUILTIN_NET_DRIVER, no_argument,
542                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
543                 {OPT_DMA_TYPE, required_argument,
544                                 NULL, OPT_DMA_TYPE_NUM},
545                 {OPT_DMAS, required_argument,
546                                 NULL, OPT_DMAS_NUM},
547                 {NULL, 0, 0, 0},
548         };
549
550         /* Parse command line */
551         while ((opt = getopt_long(argc, argv, "p:P",
552                         long_option, &option_index)) != EOF) {
553                 switch (opt) {
554                 /* Portmask */
555                 case 'p':
556                         enabled_port_mask = parse_portmask(optarg);
557                         if (enabled_port_mask == 0) {
558                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
559                                 us_vhost_usage(prgname);
560                                 return -1;
561                         }
562                         break;
563
564                 case 'P':
565                         promiscuous = 1;
566                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
567                                 ETH_VMDQ_ACCEPT_BROADCAST |
568                                 ETH_VMDQ_ACCEPT_MULTICAST;
569                         break;
570
571                 case OPT_VM2VM_NUM:
572                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
573                         if (ret == -1) {
574                                 RTE_LOG(INFO, VHOST_CONFIG,
575                                         "Invalid argument for "
576                                         "vm2vm [0|1|2]\n");
577                                 us_vhost_usage(prgname);
578                                 return -1;
579                         }
580                         vm2vm_mode = (vm2vm_type)ret;
581                         break;
582
583                 case OPT_RX_RETRY_NUM:
584                         ret = parse_num_opt(optarg, 1);
585                         if (ret == -1) {
586                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
587                                 us_vhost_usage(prgname);
588                                 return -1;
589                         }
590                         enable_retry = ret;
591                         break;
592
593                 case OPT_TX_CSUM_NUM:
594                         ret = parse_num_opt(optarg, 1);
595                         if (ret == -1) {
596                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
597                                 us_vhost_usage(prgname);
598                                 return -1;
599                         }
600                         enable_tx_csum = ret;
601                         break;
602
603                 case OPT_TSO_NUM:
604                         ret = parse_num_opt(optarg, 1);
605                         if (ret == -1) {
606                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
607                                 us_vhost_usage(prgname);
608                                 return -1;
609                         }
610                         enable_tso = ret;
611                         break;
612
613                 case OPT_RX_RETRY_DELAY_NUM:
614                         ret = parse_num_opt(optarg, INT32_MAX);
615                         if (ret == -1) {
616                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
617                                 us_vhost_usage(prgname);
618                                 return -1;
619                         }
620                         burst_rx_delay_time = ret;
621                         break;
622
623                 case OPT_RX_RETRY_NUMB_NUM:
624                         ret = parse_num_opt(optarg, INT32_MAX);
625                         if (ret == -1) {
626                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
627                                 us_vhost_usage(prgname);
628                                 return -1;
629                         }
630                         burst_rx_retry_num = ret;
631                         break;
632
633                 case OPT_MERGEABLE_NUM:
634                         ret = parse_num_opt(optarg, 1);
635                         if (ret == -1) {
636                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
637                                 us_vhost_usage(prgname);
638                                 return -1;
639                         }
640                         mergeable = !!ret;
641                         if (ret) {
642                                 vmdq_conf_default.rxmode.offloads |=
643                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
644                                 vmdq_conf_default.rxmode.max_rx_pkt_len
645                                         = JUMBO_FRAME_MAX_SIZE;
646                         }
647                         break;
648
649                 case OPT_STATS_NUM:
650                         ret = parse_num_opt(optarg, INT32_MAX);
651                         if (ret == -1) {
652                                 RTE_LOG(INFO, VHOST_CONFIG,
653                                         "Invalid argument for stats [0..N]\n");
654                                 us_vhost_usage(prgname);
655                                 return -1;
656                         }
657                         enable_stats = ret;
658                         break;
659
660                 /* Set socket file path. */
661                 case OPT_SOCKET_FILE_NUM:
662                         if (us_vhost_parse_socket_path(optarg) == -1) {
663                                 RTE_LOG(INFO, VHOST_CONFIG,
664                                 "Invalid argument for socket name (Max %d characters)\n",
665                                 PATH_MAX);
666                                 us_vhost_usage(prgname);
667                                 return -1;
668                         }
669                         break;
670
671                 case OPT_DMA_TYPE_NUM:
672                         strcpy(dma_type, optarg);
673                         break;
674
675                 case OPT_DMAS_NUM:
676                         if (open_dma(optarg) == -1) {
677                                 RTE_LOG(INFO, VHOST_CONFIG,
678                                         "Wrong DMA args\n");
679                                 us_vhost_usage(prgname);
680                                 return -1;
681                         }
682                         async_vhost_driver = 1;
683                         break;
684
685                 case OPT_CLIENT_NUM:
686                         client_mode = 1;
687                         break;
688
689                 case OPT_BUILTIN_NET_DRIVER_NUM:
690                         builtin_net_driver = 1;
691                         break;
692
693                 /* Invalid option - print options. */
694                 default:
695                         us_vhost_usage(prgname);
696                         return -1;
697                 }
698         }
699
700         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
701                 if (enabled_port_mask & (1 << i))
702                         ports[num_ports++] = i;
703         }
704
705         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
706                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
707                         "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
708                 return -1;
709         }
710
711         return 0;
712 }
713
714 /*
715  * Update the global variable num_ports and the ports array according to the
716  * number of ports in the system, and return the number of valid ports.
717  */
718 static unsigned check_ports_num(unsigned nb_ports)
719 {
720         unsigned valid_num_ports = num_ports;
721         unsigned portid;
722
723         if (num_ports > nb_ports) {
724                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
725                         num_ports, nb_ports);
726                 num_ports = nb_ports;
727         }
728
729         for (portid = 0; portid < num_ports; portid ++) {
730                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
731                         RTE_LOG(INFO, VHOST_PORT,
732                                 "\nSpecified port ID(%u) is not valid\n",
733                                 ports[portid]);
734                         ports[portid] = INVALID_PORT_ID;
735                         valid_num_ports--;
736                 }
737         }
738         return valid_num_ports;
739 }
740
741 static __rte_always_inline struct vhost_dev *
742 find_vhost_dev(struct rte_ether_addr *mac)
743 {
744         struct vhost_dev *vdev;
745
746         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
747                 if (vdev->ready == DEVICE_RX &&
748                     rte_is_same_ether_addr(mac, &vdev->mac_address))
749                         return vdev;
750         }
751
752         return NULL;
753 }
754
755 /*
756  * This function learns the MAC address of the device and registers it, along
757  * with a VLAN tag, with VMDQ.
758  */
759 static int
760 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
761 {
762         struct rte_ether_hdr *pkt_hdr;
763         int i, ret;
764
765         /* Learn MAC address of guest device from packet */
766         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
767
768         if (find_vhost_dev(&pkt_hdr->s_addr)) {
769                 RTE_LOG(ERR, VHOST_DATA,
770                         "(%d) device is using a registered MAC!\n",
771                         vdev->vid);
772                 return -1;
773         }
774
775         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
776                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
777
778         /* vlan_tag currently uses the device_id. */
779         vdev->vlan_tag = vlan_tags[vdev->vid];
780
781         /* Print out VMDQ registration info. */
782         RTE_LOG(INFO, VHOST_DATA,
783                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
784                 vdev->vid,
785                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
786                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
787                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
788                 vdev->vlan_tag);
789
790         /* Register the MAC address. */
791         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
792                                 (uint32_t)vdev->vid + vmdq_pool_base);
793         if (ret)
794                 RTE_LOG(ERR, VHOST_DATA,
795                         "(%d) failed to add device MAC address to VMDQ\n",
796                         vdev->vid);
797
798         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
799
800         /* Set device as ready for RX. */
801         vdev->ready = DEVICE_RX;
802
803         return 0;
804 }
805
806 /*
807  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
808  * queue before disabling RX on the device.
809  */
810 static inline void
811 unlink_vmdq(struct vhost_dev *vdev)
812 {
813         unsigned i = 0;
814         unsigned rx_count;
815         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
816
817         if (vdev->ready == DEVICE_RX) {
818                 /* Clear MAC and VLAN settings */
819                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
820                 for (i = 0; i < 6; i++)
821                         vdev->mac_address.addr_bytes[i] = 0;
822
823                 vdev->vlan_tag = 0;
824
825                 /* Clear out the receive buffers */
826                 rx_count = rte_eth_rx_burst(ports[0],
827                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
828
829                 while (rx_count) {
830                         for (i = 0; i < rx_count; i++)
831                                 rte_pktmbuf_free(pkts_burst[i]);
832
833                         rx_count = rte_eth_rx_burst(ports[0],
834                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
835                 }
836
837                 vdev->ready = DEVICE_MAC_LEARNING;
838         }
839 }
840
841 static inline void
842 free_pkts(struct rte_mbuf **pkts, uint16_t n)
843 {
844         while (n--)
845                 rte_pktmbuf_free(pkts[n]);
846 }
847
848 static __rte_always_inline void
849 complete_async_pkts(struct vhost_dev *vdev)
850 {
851         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
852         uint16_t complete_count;
853
854         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
855                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
856         if (complete_count)
857                 free_pkts(p_cpl, complete_count);
858 }
859
860 static __rte_always_inline void
861 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
862             struct rte_mbuf *m)
863 {
864         uint16_t ret;
865
866         if (builtin_net_driver) {
867                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
868         } else {
869                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
870         }
871
872         if (enable_stats) {
873                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
874                                 __ATOMIC_SEQ_CST);
875                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
876                                 __ATOMIC_SEQ_CST);
877                 src_vdev->stats.tx_total++;
878                 src_vdev->stats.tx += ret;
879         }
880 }
881
882 static __rte_always_inline void
883 drain_vhost(struct vhost_dev *vdev)
884 {
885         uint16_t ret;
886         uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
887         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
888         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
889
890         if (builtin_net_driver) {
891                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
892         } else if (async_vhost_driver) {
893                 uint32_t cpu_cpl_nr = 0;
894                 uint16_t enqueue_fail = 0;
895                 struct rte_mbuf *m_cpu_cpl[nr_xmit];
896
897                 complete_async_pkts(vdev);
898                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
899                                         m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
900
901                 if (cpu_cpl_nr)
902                         free_pkts(m_cpu_cpl, cpu_cpl_nr);
903
904                 enqueue_fail = nr_xmit - ret;
905                 if (enqueue_fail)
906                         free_pkts(&m[ret], nr_xmit - ret);
907         } else {
908                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
909                                                 m, nr_xmit);
910         }
911
912         if (enable_stats) {
913                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
914                                 __ATOMIC_SEQ_CST);
915                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
916                                 __ATOMIC_SEQ_CST);
917         }
918
919         if (!async_vhost_driver)
920                 free_pkts(m, nr_xmit);
921 }
922
923 static __rte_always_inline void
924 drain_vhost_table(void)
925 {
926         uint16_t lcore_id = rte_lcore_id();
927         struct vhost_bufftable *vhost_txq;
928         struct vhost_dev *vdev;
929         uint64_t cur_tsc;
930
931         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
932                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
933                                                 + vdev->vid];
934
935                 cur_tsc = rte_rdtsc();
936                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
937                                 > MBUF_TABLE_DRAIN_TSC)) {
938                         RTE_LOG_DP(DEBUG, VHOST_DATA,
939                                 "Vhost TX queue drained after timeout with burst size %u\n",
940                                 vhost_txq->len);
941                         drain_vhost(vdev);
942                         vhost_txq->len = 0;
943                         vhost_txq->pre_tsc = cur_tsc;
944                 }
945         }
946 }
947
948 /*
949  * Check if the packet destination MAC address is for a local device. If so, put
950  * the packet on that device's RX queue. If not, return.
951  */
952 static __rte_always_inline int
953 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
954 {
955         struct rte_ether_hdr *pkt_hdr;
956         struct vhost_dev *dst_vdev;
957         struct vhost_bufftable *vhost_txq;
958         uint16_t lcore_id = rte_lcore_id();
959         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
960
961         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
962         if (!dst_vdev)
963                 return -1;
964
965         if (vdev->vid == dst_vdev->vid) {
966                 RTE_LOG_DP(DEBUG, VHOST_DATA,
967                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
968                         vdev->vid);
969                 return 0;
970         }
971
972         RTE_LOG_DP(DEBUG, VHOST_DATA,
973                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
974
975         if (unlikely(dst_vdev->remove)) {
976                 RTE_LOG_DP(DEBUG, VHOST_DATA,
977                         "(%d) device is marked for removal\n", dst_vdev->vid);
978                 return 0;
979         }
980
981         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
982         vhost_txq->m_table[vhost_txq->len++] = m;
983
984         if (enable_stats) {
985                 vdev->stats.tx_total++;
986                 vdev->stats.tx++;
987         }
988
989         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
990                 drain_vhost(dst_vdev);
991                 vhost_txq->len = 0;
992                 vhost_txq->pre_tsc = rte_rdtsc();
993         }
994         return 0;
995 }
996
997 /*
998  * Check whether the destination MAC of a packet belongs to a local VM,
999  * and if so, get its VLAN tag and offset.
1000  */
1001 static __rte_always_inline int
1002 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1003         uint32_t *offset, uint16_t *vlan_tag)
1004 {
1005         struct vhost_dev *dst_vdev;
1006         struct rte_ether_hdr *pkt_hdr =
1007                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1008
1009         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
1010         if (!dst_vdev)
1011                 return 0;
1012
1013         if (vdev->vid == dst_vdev->vid) {
1014                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1015                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1016                         vdev->vid);
1017                 return -1;
1018         }
1019
1020         /*
1021          * HW VLAN strip reduces the packet length by the length of the
1022          * VLAN tag, so the packet length must be restored by adding it
1023          * back.
1024          */
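        /*
         * Example: a 64-byte frame received with a VLAN tag is handed up
         * as 60 bytes after HW stripping; adding VLAN_HLEN (4) restores
         * the on-wire length before the tag is re-inserted on TX.
         */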
1025         *offset  = VLAN_HLEN;
1026         *vlan_tag = vlan_tags[vdev->vid];
1027
1028         RTE_LOG_DP(DEBUG, VHOST_DATA,
1029                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1030                 vdev->vid, dst_vdev->vid, *vlan_tag);
1031
1032         return 0;
1033 }
1034
1035 static uint16_t
1036 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1037 {
1038         if (ol_flags & PKT_TX_IPV4)
1039                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1040         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1041                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1042 }
1043
1044 static void virtio_tx_offload(struct rte_mbuf *m)
1045 {
1046         void *l3_hdr;
1047         struct rte_ipv4_hdr *ipv4_hdr = NULL;
1048         struct rte_tcp_hdr *tcp_hdr = NULL;
1049         struct rte_ether_hdr *eth_hdr =
1050                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1051
1052         l3_hdr = (char *)eth_hdr + m->l2_len;
1053
1054         if (m->ol_flags & PKT_TX_IPV4) {
1055                 ipv4_hdr = l3_hdr;
1056                 ipv4_hdr->hdr_checksum = 0;
1057                 m->ol_flags |= PKT_TX_IP_CKSUM;
1058         }
1059
1060         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
1061         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1062 }
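/*
 * Note: this follows the usual DPDK TSO contract - when PKT_TX_TCP_SEG is
 * set, the IP header checksum is cleared and the TCP checksum field is
 * pre-filled with the pseudo-header checksum, which is what get_psd_sum()
 * provides above.
 */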
1063
1064 static __rte_always_inline void
1065 do_drain_mbuf_table(struct mbuf_table *tx_q)
1066 {
1067         uint16_t count;
1068
1069         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1070                                  tx_q->m_table, tx_q->len);
1071         if (unlikely(count < tx_q->len))
1072                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1073
1074         tx_q->len = 0;
1075 }
1076
1077 /*
1078  * This function routes the TX packet to the correct interface. This
1079  * may be a local device or the physical port.
1080  */
1081 static __rte_always_inline void
1082 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1083 {
1084         struct mbuf_table *tx_q;
1085         unsigned offset = 0;
1086         const uint16_t lcore_id = rte_lcore_id();
1087         struct rte_ether_hdr *nh;
1088
1089
1090         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1091         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1092                 struct vhost_dev *vdev2;
1093
1094                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1095                         if (vdev2 != vdev)
1096                                 sync_virtio_xmit(vdev2, vdev, m);
1097                 }
1098                 goto queue2nic;
1099         }
1100
1101         /* Check if destination is a local VM */
1102         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1103                 return;
1104
1105         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1106                 if (unlikely(find_local_dest(vdev, m, &offset,
1107                                              &vlan_tag) != 0)) {
1108                         rte_pktmbuf_free(m);
1109                         return;
1110                 }
1111         }
1112
1113         RTE_LOG_DP(DEBUG, VHOST_DATA,
1114                 "(%d) TX: MAC address is external\n", vdev->vid);
1115
1116 queue2nic:
1117
1118         /* Add packet to the port TX queue */
1119         tx_q = &lcore_tx_queue[lcore_id];
1120
1121         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1122         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1123                 /* Guest has inserted the vlan tag. */
1124                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1125                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1126                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1127                         (vh->vlan_tci != vlan_tag_be))
1128                         vh->vlan_tci = vlan_tag_be;
1129         } else {
1130                 m->ol_flags |= PKT_TX_VLAN_PKT;
1131
1132                 /*
1133                  * Find the right seg to adjust the data len when offset is
1134                  * bigger than tail room size.
1135                  */
1136                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1137                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1138                                 m->data_len += offset;
1139                         else {
1140                                 struct rte_mbuf *seg = m;
1141
1142                                 while ((seg->next != NULL) &&
1143                                         (offset > rte_pktmbuf_tailroom(seg)))
1144                                         seg = seg->next;
1145
1146                                 seg->data_len += offset;
1147                         }
1148                         m->pkt_len += offset;
1149                 }
1150
1151                 m->vlan_tci = vlan_tag;
1152         }
1153
1154         if (m->ol_flags & PKT_TX_TCP_SEG)
1155                 virtio_tx_offload(m);
1156
1157         tx_q->m_table[tx_q->len++] = m;
1158         if (enable_stats) {
1159                 vdev->stats.tx_total++;
1160                 vdev->stats.tx++;
1161         }
1162
1163         if (unlikely(tx_q->len == MAX_PKT_BURST))
1164                 do_drain_mbuf_table(tx_q);
1165 }
1166
1167
1168 static __rte_always_inline void
1169 drain_mbuf_table(struct mbuf_table *tx_q)
1170 {
1171         static uint64_t prev_tsc;
1172         uint64_t cur_tsc;
1173
1174         if (tx_q->len == 0)
1175                 return;
1176
1177         cur_tsc = rte_rdtsc();
1178         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1179                 prev_tsc = cur_tsc;
1180
1181                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1182                         "TX queue drained after timeout with burst size %u\n",
1183                         tx_q->len);
1184                 do_drain_mbuf_table(tx_q);
1185         }
1186 }
1187
1188 static __rte_always_inline void
1189 drain_eth_rx(struct vhost_dev *vdev)
1190 {
1191         uint16_t rx_count, enqueue_count;
1192         struct rte_mbuf *pkts[MAX_PKT_BURST];
1193
1194         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1195                                     pkts, MAX_PKT_BURST);
1196
1197         if (!rx_count)
1198                 return;
1199
1200         /*
1201          * When "enable_retry" is set, wait and retry when there are not
1202          * enough free slots in the queue to hold @rx_count packets, to
1203          * diminish packet loss.
1204          */
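        /*
         * With the defaults (burst_rx_retry_num = 4, burst_rx_delay_time =
         * 15 us) the loop below waits at most 4 * 15 = 60 us for the guest
         * to free enough descriptors before proceeding with the enqueue.
         */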
1205         if (enable_retry &&
1206             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1207                         VIRTIO_RXQ))) {
1208                 uint32_t retry;
1209
1210                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1211                         rte_delay_us(burst_rx_delay_time);
1212                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1213                                         VIRTIO_RXQ))
1214                                 break;
1215                 }
1216         }
1217
1218         if (builtin_net_driver) {
1219                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1220                                                 pkts, rx_count);
1221         } else if (async_vhost_driver) {
1222                 uint32_t cpu_cpl_nr = 0;
1223                 uint16_t enqueue_fail = 0;
1224                 struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1225
1226                 complete_async_pkts(vdev);
1227                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1228                                         VIRTIO_RXQ, pkts, rx_count,
1229                                         m_cpu_cpl, &cpu_cpl_nr);
1230                 if (cpu_cpl_nr)
1231                         free_pkts(m_cpu_cpl, cpu_cpl_nr);
1232
1233                 enqueue_fail = rx_count - enqueue_count;
1234                 if (enqueue_fail)
1235                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1236
1237         } else {
1238                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1239                                                 pkts, rx_count);
1240         }
1241
1242         if (enable_stats) {
1243                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1244                                 __ATOMIC_SEQ_CST);
1245                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1246                                 __ATOMIC_SEQ_CST);
1247         }
1248
1249         if (!async_vhost_driver)
1250                 free_pkts(pkts, rx_count);
1251 }
1252
1253 static __rte_always_inline void
1254 drain_virtio_tx(struct vhost_dev *vdev)
1255 {
1256         struct rte_mbuf *pkts[MAX_PKT_BURST];
1257         uint16_t count;
1258         uint16_t i;
1259
1260         if (builtin_net_driver) {
1261                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1262                                         pkts, MAX_PKT_BURST);
1263         } else {
1264                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1265                                         mbuf_pool, pkts, MAX_PKT_BURST);
1266         }
1267
1268         /* setup VMDq for the first packet */
1269         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1270                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1271                         free_pkts(pkts, count);
1272         }
1273
1274         for (i = 0; i < count; ++i)
1275                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1276 }
1277
1278 /*
1279  * Main function of vhost-switch. It basically does:
1280  *
1281  * for each vhost device {
1282  *    - drain_eth_rx()
1283  *
1284  *      Which drains the host eth Rx queue linked to the vhost device,
1285  *      and delivers all of the packets to the guest virtio Rx ring
1286  *      associated with this vhost device.
1287  *
1288  *    - drain_virtio_tx()
1289  *
1290  *      Which drains the guest virtio Tx queue and delivers all of the
1291  *      packets to the target, which could be another vhost device or the
1292  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1293  * }
1294  */
1295 static int
1296 switch_worker(void *arg __rte_unused)
1297 {
1298         unsigned i;
1299         unsigned lcore_id = rte_lcore_id();
1300         struct vhost_dev *vdev;
1301         struct mbuf_table *tx_q;
1302
1303         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1304
1305         tx_q = &lcore_tx_queue[lcore_id];
1306         for (i = 0; i < rte_lcore_count(); i++) {
1307                 if (lcore_ids[i] == lcore_id) {
1308                         tx_q->txq_id = i;
1309                         break;
1310                 }
1311         }
1312
1313         while(1) {
1314                 drain_mbuf_table(tx_q);
1315                 drain_vhost_table();
1316                 /*
1317                  * Inform the configuration core that we have exited the
1318                  * linked list and that no devices are in use if requested.
1319                  */
1320                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1321                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1322
1323                 /*
1324                  * Process vhost devices
1325                  */
1326                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1327                               lcore_vdev_entry) {
1328                         if (unlikely(vdev->remove)) {
1329                                 unlink_vmdq(vdev);
1330                                 vdev->ready = DEVICE_SAFE_REMOVE;
1331                                 continue;
1332                         }
1333
1334                         if (likely(vdev->ready == DEVICE_RX))
1335                                 drain_eth_rx(vdev);
1336
1337                         if (likely(!vdev->remove))
1338                                 drain_virtio_tx(vdev);
1339                 }
1340         }
1341
1342         return 0;
1343 }
1344
1345 /*
1346  * Remove a device from the specific data core linked list and from the
1347  * main linked list. Synchronization occurs through the use of the
1348  * lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
1349  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
1350  */
1351 static void
1352 destroy_device(int vid)
1353 {
1354         struct vhost_dev *vdev = NULL;
1355         int lcore;
1356         uint16_t i;
1357
1358         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1359                 if (vdev->vid == vid)
1360                         break;
1361         }
1362         if (!vdev)
1363                 return;
1364         /* Set the remove flag. */
1365         vdev->remove = 1;
1366         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1367                 rte_pause();
1368         }
1369
1370         for (i = 0; i < RTE_MAX_LCORE; i++)
1371                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1372
1373         if (builtin_net_driver)
1374                 vs_vhost_net_remove(vdev);
1375
1376         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1377                      lcore_vdev_entry);
1378         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1379
1380
1381         /* Set the dev_removal_flag on each lcore. */
1382         RTE_LCORE_FOREACH_WORKER(lcore)
1383                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1384
1385         /*
1386          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1387          * we can be sure that they can no longer access the device removed
1388          * from the linked lists and that the devices are no longer in use.
1389          */
1390         RTE_LCORE_FOREACH_WORKER(lcore) {
1391                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1392                         rte_pause();
1393         }
1394
1395         lcore_info[vdev->coreid].device_num--;
1396
1397         RTE_LOG(INFO, VHOST_DATA,
1398                 "(%d) device has been removed from data core\n",
1399                 vdev->vid);
1400
1401         if (async_vhost_driver)
1402                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1403
1404         rte_free(vdev);
1405 }
1406
1407 /*
1408  * A new device is added to a data core. First the device is added to the main linked list
1409  * and then allocated to a specific data core.
1410  */
1411 static int
1412 new_device(int vid)
1413 {
1414         int lcore, core_add = 0;
1415         uint16_t i;
1416         uint32_t device_num_min = num_devices;
1417         struct vhost_dev *vdev;
1418         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1419         if (vdev == NULL) {
1420                 RTE_LOG(INFO, VHOST_DATA,
1421                         "(%d) couldn't allocate memory for vhost dev\n",
1422                         vid);
1423                 return -1;
1424         }
1425         vdev->vid = vid;
1426
1427         for (i = 0; i < RTE_MAX_LCORE; i++) {
1428                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1429                         = rte_zmalloc("vhost bufftable",
1430                                 sizeof(struct vhost_bufftable),
1431                                 RTE_CACHE_LINE_SIZE);
1432
1433                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1434                         RTE_LOG(INFO, VHOST_DATA,
1435                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1436                         return -1;
1437                 }
1438         }
1439
1440         if (builtin_net_driver)
1441                 vs_vhost_net_setup(vdev);
1442
1443         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1444         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1445
1446         /* Reset ready flag */
1447         vdev->ready = DEVICE_MAC_LEARNING;
1448         vdev->remove = 0;
1449
1450         /* Find a suitable lcore to add the device. */
1451         RTE_LCORE_FOREACH_WORKER(lcore) {
1452                 if (lcore_info[lcore].device_num < device_num_min) {
1453                         device_num_min = lcore_info[lcore].device_num;
1454                         core_add = lcore;
1455                 }
1456         }
1457         vdev->coreid = core_add;
1458
1459         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1460                           lcore_vdev_entry);
1461         lcore_info[vdev->coreid].device_num++;
1462
1463         /* Disable notifications. */
1464         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1465         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1466
1467         RTE_LOG(INFO, VHOST_DATA,
1468                 "(%d) device has been added to data core %d\n",
1469                 vid, vdev->coreid);
1470
1471         if (async_vhost_driver) {
1472                 struct rte_vhost_async_features f;
1473                 struct rte_vhost_async_channel_ops channel_ops;
1474
1475                 if (strncmp(dma_type, "ioat", 4) == 0) {
1476                         channel_ops.transfer_data = ioat_transfer_data_cb;
1477                         channel_ops.check_completed_copies =
1478                                 ioat_check_completed_copies_cb;
1479
1480                         f.async_inorder = 1;
1481                         f.async_threshold = 256;
1482
1483                         return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1484                                 f.intval, &channel_ops);
1485                 }
1486         }
1487
1488         return 0;
1489 }
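/*
 * Note: if one of the per-lcore TX buffer allocations in new_device() fails,
 * the buffers allocated so far and the vdev itself are leaked. A minimal
 * cleanup sketch, assuming @nb_txbuff entries were successfully allocated
 * (helper name assumed; not wired into the failure path above):
 */
static __rte_unused void
free_vdev_sketch(struct vhost_dev *vdev, unsigned int nb_txbuff)
{
	unsigned int i;

	/* Free the TX buffer tables already allocated for this vid. */
	for (i = 0; i < nb_txbuff; i++)
		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vdev->vid]);
	rte_free(vdev);
}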
1490
1491 /*
1492  * These callbacks allow devices to be added to or removed from a data
1493  * core once their configuration is fully complete.
1494  */
1495 static const struct vhost_device_ops virtio_net_device_ops =
1496 {
1497         .new_device =  new_device,
1498         .destroy_device = destroy_device,
1499 };
1500
1501 /*
1502  * This thread wakes up periodically to print per-device statistics if the
1503  * user has enabled them.
1504  */
1505 static void *
1506 print_stats(__rte_unused void *arg)
1507 {
1508         struct vhost_dev *vdev;
1509         uint64_t tx_dropped, rx_dropped;
1510         uint64_t tx, tx_total, rx, rx_total;
1511         const char clr[] = { 27, '[', '2', 'J', '\0' };
1512         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1513
1514         while (1) {
1515                 sleep(enable_stats);
1516
1517                 /* Clear screen and move to top left */
1518                 printf("%s%s\n", clr, top_left);
1519                 printf("Device statistics =================================\n");
1520
1521                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1522                         tx_total   = vdev->stats.tx_total;
1523                         tx         = vdev->stats.tx;
1524                         tx_dropped = tx_total - tx;
1525
1526                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1527                                 __ATOMIC_SEQ_CST);
1528                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1529                                 __ATOMIC_SEQ_CST);
1530                         rx_dropped = rx_total - rx;
1531
1532                         printf("Statistics for device %d\n"
1533                                 "-----------------------\n"
1534                                 "TX total:              %" PRIu64 "\n"
1535                                 "TX dropped:            %" PRIu64 "\n"
1536                                 "TX successful:         %" PRIu64 "\n"
1537                                 "RX total:              %" PRIu64 "\n"
1538                                 "RX dropped:            %" PRIu64 "\n"
1539                                 "RX successful:         %" PRIu64 "\n",
1540                                 vdev->vid,
1541                                 tx_total, tx_dropped, tx,
1542                                 rx_total, rx_dropped, rx);
1543                 }
1544
1545                 printf("===================================================\n");
1546
1547                 fflush(stdout);
1548         }
1549
1550         return NULL;
1551 }
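/*
 * Illustrative sketch: the RX data path is expected to bump these counters
 * with the matching atomic builtins, which is why print_stats() reads them
 * with __atomic_load_n(). The real updates live in the RX drain code earlier
 * in this file; the helper name here is assumed.
 */
static __rte_unused void
update_rx_stats_sketch(struct vhost_dev *vdev, uint16_t rx_count,
		       uint16_t enqueue_count)
{
	__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
		__ATOMIC_SEQ_CST);
	__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
		__ATOMIC_SEQ_CST);
}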
1552
1553 static void
1554 unregister_drivers(int socket_num)
1555 {
1556         int i, ret;
1557
1558         for (i = 0; i < socket_num; i++) {
1559                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1560                 if (ret != 0)
1561                         RTE_LOG(ERR, VHOST_CONFIG,
1562                                 "Failed to unregister vhost driver for %s.\n",
1563                                 socket_files + i * PATH_MAX);
1564         }
1565 }
1566
1567 /* When we receive an INT signal, unregister the vhost driver. */
1568 static void
1569 sigint_handler(__rte_unused int signum)
1570 {
1571         /* Unregister vhost driver. */
1572         unregister_drivers(nb_sockets);
1573
1574         exit(0);
1575 }
1576
1577 /*
1578  * While creating an mbuf pool, one key thing is to figure out how
1579  * many mbuf entries are enough for our use. FYI, here are some
1580  * guidelines:
1581  *
1582  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1583  *
1584  * - For each switch core (a CPU core that does the packet switching),
1585  *   we also need to reserve some mbufs for receiving packets from the
1586  *   virtio Tx queue. How many is enough depends on the usage; it is
1587  *   normally a simple calculation like the following:
1588  *
1589  *       MAX_PKT_BURST * max packet size / mbuf size
1590  *
1591  *   So we definitely need to allocate more mbufs when TSO is enabled.
1592  *
1593  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1594  *   mbufs for receiving the packets from the physical NIC device.
1595  *
1596  * - We also need to make sure that, for each switch core, we have
1597  *   allocated enough mbufs to fill up the mbuf cache.
1598  */
1599 static void
1600 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1601         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1602 {
1603         uint32_t nr_mbufs;
1604         uint32_t nr_mbufs_per_core;
1605         uint32_t mtu = 1500;
1606
1607         if (mergeable)
1608                 mtu = 9000;
1609         if (enable_tso)
1610                 mtu = 64 * 1024;
1611
1612         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1613                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1614         nr_mbufs_per_core += nr_rx_desc;
1615         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1616
1617         nr_mbufs  = nr_queues * nr_rx_desc;
1618         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1619         nr_mbufs *= nr_port;
1620
1621         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1622                                             nr_mbuf_cache, 0, mbuf_size,
1623                                             rte_socket_id());
1624         if (mbuf_pool == NULL)
1625                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1626 }
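/*
 * Worked example of the sizing above, assuming the defaults in this file and
 * in DPDK (MAX_PKT_BURST of 32 and RTE_PKTMBUF_HEADROOM of 128, so
 * MBUF_DATA_SIZE is 2176 bytes), with mergeable buffers and TSO disabled,
 * one port and one switch core:
 *
 *   nr_mbufs_per_core = (1500 + 2176) * 32 / (2176 - 128) + 1024 = 1081
 *   nr_mbufs          = 128 * 1024 + 1081 * 1                    = 132153
 *
 * The pool size is dominated by the per-queue RX descriptor reservation,
 * which is what the FIXME in main() below is about.
 */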
1627
1628 /*
1629  * Main function: does initialization and calls the per-lcore functions.
1630  */
1631 int
1632 main(int argc, char *argv[])
1633 {
1634         unsigned lcore_id, core_id = 0;
1635         unsigned nb_ports, valid_num_ports;
1636         int ret, i;
1637         uint16_t portid;
1638         static pthread_t tid;
1639         uint64_t flags = 0;
1640
1641         signal(SIGINT, sigint_handler);
1642
1643         /* init EAL */
1644         ret = rte_eal_init(argc, argv);
1645         if (ret < 0)
1646                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1647         argc -= ret;
1648         argv += ret;
1649
1650         /* parse app arguments */
1651         ret = us_vhost_parse_args(argc, argv);
1652         if (ret < 0)
1653                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1654
1655         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1656                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1657
1658                 if (rte_lcore_is_enabled(lcore_id))
1659                         lcore_ids[core_id++] = lcore_id;
1660         }
1661
1662         if (rte_lcore_count() > RTE_MAX_LCORE)
1663                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1664
1665         /* Get the number of physical ports. */
1666         nb_ports = rte_eth_dev_count_avail();
1667
1668         /*
1669          * Update the global variable num_ports and the global ports array,
1670          * and compute valid_num_ports from the number of ports in the system.
1671          */
1672         valid_num_ports = check_ports_num(nb_ports);
1673
1674         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1675                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1676                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1677                 return -1;
1678         }
1679
1680         /*
1681          * FIXME: here we are trying to allocate mbufs big enough for
1682          * @MAX_QUEUES, but the truth is we're never going to use that
1683          * many queues here. We probably should only do allocation for
1684          * those queues we are going to use.
1685          */
1686         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1687                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1688
1689         if (vm2vm_mode == VM2VM_HARDWARE) {
1690                 /* Enable VT loop back to let L2 switch to do it. */
1691                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1692                 RTE_LOG(DEBUG, VHOST_CONFIG,
1693                         "Enable loop back for L2 switch in vmdq.\n");
1694         }
1695
1696         /* initialize all ports */
1697         RTE_ETH_FOREACH_DEV(portid) {
1698                 /* skip ports that are not enabled */
1699                 if ((enabled_port_mask & (1 << portid)) == 0) {
1700                         RTE_LOG(INFO, VHOST_PORT,
1701                                 "Skipping disabled port %d\n", portid);
1702                         continue;
1703                 }
1704                 if (port_init(portid) != 0)
1705                         rte_exit(EXIT_FAILURE,
1706                                 "Cannot initialize network ports\n");
1707         }
1708
1709         /* Enable stats if the user option is set. */
1710         if (enable_stats) {
1711                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1712                                         print_stats, NULL);
1713                 if (ret < 0)
1714                         rte_exit(EXIT_FAILURE,
1715                                 "Cannot create print-stats thread\n");
1716         }
1717
1718         /* Launch all data cores. */
1719         RTE_LCORE_FOREACH_WORKER(lcore_id)
1720                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1721
1722         if (client_mode)
1723                 flags |= RTE_VHOST_USER_CLIENT;
1724
1725         /* Register vhost user driver to handle vhost messages. */
1726         for (i = 0; i < nb_sockets; i++) {
1727                 char *file = socket_files + i * PATH_MAX;
1728
1729                 if (async_vhost_driver)
1730                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1731
1732                 ret = rte_vhost_driver_register(file, flags);
1733                 if (ret != 0) {
1734                         unregister_drivers(i);
1735                         rte_exit(EXIT_FAILURE,
1736                                 "vhost driver register failure.\n");
1737                 }
1738
1739                 if (builtin_net_driver)
1740                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1741
1742                 if (mergeable == 0) {
1743                         rte_vhost_driver_disable_features(file,
1744                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1745                 }
1746
1747                 if (enable_tx_csum == 0) {
1748                         rte_vhost_driver_disable_features(file,
1749                                 1ULL << VIRTIO_NET_F_CSUM);
1750                 }
1751
1752                 if (enable_tso == 0) {
1753                         rte_vhost_driver_disable_features(file,
1754                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1755                         rte_vhost_driver_disable_features(file,
1756                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1757                         rte_vhost_driver_disable_features(file,
1758                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1759                         rte_vhost_driver_disable_features(file,
1760                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1761                 }
1762
1763                 if (promiscuous) {
1764                         rte_vhost_driver_enable_features(file,
1765                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1766                 }
1767
1768                 ret = rte_vhost_driver_callback_register(file,
1769                         &virtio_net_device_ops);
1770                 if (ret != 0) {
1771                         rte_exit(EXIT_FAILURE,
1772                                 "failed to register vhost driver callbacks.\n");
1773                 }
1774
1775                 if (rte_vhost_driver_start(file) < 0) {
1776                         rte_exit(EXIT_FAILURE,
1777                                 "failed to start vhost driver.\n");
1778                 }
1779         }
1780
1781         RTE_LCORE_FOREACH_WORKER(lcore_id)
1782                 rte_eal_wait_lcore(lcore_id);
1783
1784         return 0;
1785
1786 }
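/*
 * Example invocation (binary name and application option names assumed; the
 * application options are the ones parsed by us_vhost_parse_args() earlier
 * in this file, and EAL options come before the "--" separator):
 *
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --client --stats 1
 */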