1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "ioat.h"
29 #include "main.h"
30
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37
38 #define MBUF_CACHE_SIZE 128
39 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
40
41 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
42
43 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
45
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX                       1
51 #define DEVICE_SAFE_REMOVE      2
52
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56
57 #define INVALID_PORT_ID 0xFF
58
59 /* mask of enabled ports */
60 static uint32_t enabled_port_mask = 0;
61
62 /* Promiscuous mode */
63 static uint32_t promiscuous;
64
65 /* number of devices/queues to support */
66 static uint32_t num_queues = 0;
67 static uint32_t num_devices;
68
69 static struct rte_mempool *mbuf_pool;
70 static int mergeable;
71
72 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
73 typedef enum {
74         VM2VM_DISABLED = 0,
75         VM2VM_SOFTWARE = 1,
76         VM2VM_HARDWARE = 2,
77         VM2VM_LAST
78 } vm2vm_type;
79 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
80
81 /* Enable stats. */
82 static uint32_t enable_stats = 0;
83 /* Enable retries on RX. */
84 static uint32_t enable_retry = 1;
85
86 /* Disable TX checksum offload */
87 static uint32_t enable_tx_csum;
88
89 /* Disable TSO offload */
90 static uint32_t enable_tso;
91
92 static int client_mode;
93
94 static int builtin_net_driver;
95
96 static int async_vhost_driver;
97
98 static char *dma_type;
99
100 /* Specify the timeout (in microseconds) between retries on RX. */
101 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
102 /* Specify the number of retries on RX. */
103 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
104
105 /* Socket file paths. Can be set by user */
106 static char *socket_files;
107 static int nb_sockets;
108
109 /* Empty VMDQ configuration structure. Filled in programmatically. */
110 static struct rte_eth_conf vmdq_conf_default = {
111         .rxmode = {
112                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
113                 .split_hdr_size = 0,
114                 /*
115                  * VLAN strip is necessary for 1G NICs such as the I350;
116                  * it fixes a bug where IPv4 forwarding in the guest cannot
117                  * forward packets from one virtio dev to another virtio dev.
118                  */
119                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
120         },
121
122         .txmode = {
123                 .mq_mode = ETH_MQ_TX_NONE,
124                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
125                              DEV_TX_OFFLOAD_TCP_CKSUM |
126                              DEV_TX_OFFLOAD_VLAN_INSERT |
127                              DEV_TX_OFFLOAD_MULTI_SEGS |
128                              DEV_TX_OFFLOAD_TCP_TSO),
129         },
130         .rx_adv_conf = {
131                 /*
132                  * should be overridden separately in code with
133                  * appropriate values
134                  */
135                 .vmdq_rx_conf = {
136                         .nb_queue_pools = ETH_8_POOLS,
137                         .enable_default_pool = 0,
138                         .default_pool = 0,
139                         .nb_pool_maps = 0,
140                         .pool_map = {{0, 0},},
141                 },
142         },
143 };
144
145
146 static unsigned lcore_ids[RTE_MAX_LCORE];
147 static uint16_t ports[RTE_MAX_ETHPORTS];
148 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
149 static uint16_t num_pf_queues, num_vmdq_queues;
150 static uint16_t vmdq_pool_base, vmdq_queue_base;
151 static uint16_t queues_per_pool;
152
153 const uint16_t vlan_tags[] = {
154         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
155         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
156         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
157         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
158         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
159         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
160         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
161         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
162 };
163
164 /* ethernet addresses of ports */
165 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
166
167 static struct vhost_dev_tailq_list vhost_dev_list =
168         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
169
170 static struct lcore_info lcore_info[RTE_MAX_LCORE];
171
172 /* Used for queueing bursts of TX packets. */
173 struct mbuf_table {
174         unsigned len;
175         unsigned txq_id;
176         struct rte_mbuf *m_table[MAX_PKT_BURST];
177 };
178
179 struct vhost_bufftable {
180         uint32_t len;
181         uint64_t pre_tsc;
182         struct rte_mbuf *m_table[MAX_PKT_BURST];
183 };
184
185 /* TX queue for each data core. */
186 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
187
188 /*
189  * Vhost TX buffer for each data core.
190  * Every data core maintains a TX buffer for every vhost device,
191  * which is used to enqueue packets in batches for higher performance.
192  */
193 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
194
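/* TX drain period in TSC cycles: BURST_TX_DRAIN_US converted using the TSC rate, rounding the cycles-per-microsecond factor up. */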
195 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
196                                  / US_PER_S * BURST_TX_DRAIN_US)
197 #define VLAN_HLEN       4
198
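/* Open the DMA channel named on the command line; only the "ioat" backend is supported. */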
199 static inline int
200 open_dma(const char *value)
201 {
202         if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
203                 return open_ioat(value);
204
205         return -1;
206 }
207
208 /*
209  * Builds up the correct configuration for VMDQ VLAN pool map
210  * according to the pool & queue limits.
211  */
212 static inline int
213 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
214 {
215         struct rte_eth_vmdq_rx_conf conf;
216         struct rte_eth_vmdq_rx_conf *def_conf =
217                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
218         unsigned i;
219
220         memset(&conf, 0, sizeof(conf));
221         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
222         conf.nb_pool_maps = num_devices;
223         conf.enable_loop_back = def_conf->enable_loop_back;
224         conf.rx_mode = def_conf->rx_mode;
225
226         for (i = 0; i < conf.nb_pool_maps; i++) {
227                 conf.pool_map[i].vlan_id = vlan_tags[i];
228                 conf.pool_map[i].pools = (1UL << i);
229         }
230
231         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
232         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
233                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
234         return 0;
235 }
236
237 /*
238  * Initialises a given port using global settings, with the RX buffers
239  * coming from the mbuf_pool passed as a parameter.
240  */
241 static inline int
242 port_init(uint16_t port)
243 {
244         struct rte_eth_dev_info dev_info;
245         struct rte_eth_conf port_conf;
246         struct rte_eth_rxconf *rxconf;
247         struct rte_eth_txconf *txconf;
248         int16_t rx_rings, tx_rings;
249         uint16_t rx_ring_size, tx_ring_size;
250         int retval;
251         uint16_t q;
252
253         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
254         retval = rte_eth_dev_info_get(port, &dev_info);
255         if (retval != 0) {
256                 RTE_LOG(ERR, VHOST_PORT,
257                         "Error during getting device (port %u) info: %s\n",
258                         port, strerror(-retval));
259
260                 return retval;
261         }
262
263         rxconf = &dev_info.default_rxconf;
264         txconf = &dev_info.default_txconf;
265         rxconf->rx_drop_en = 1;
266
267         /* Configure the number of supported virtio devices based on VMDQ limits. */
268         num_devices = dev_info.max_vmdq_pools;
269
270         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
271         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
272
273         tx_rings = (uint16_t)rte_lcore_count();
274
275         /* Get port configuration. */
276         retval = get_eth_conf(&port_conf, num_devices);
277         if (retval < 0)
278                 return retval;
279         /* NIC queues are divided into PF queues and VMDQ queues. */
280         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
281         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
282         num_vmdq_queues = num_devices * queues_per_pool;
283         num_queues = num_pf_queues + num_vmdq_queues;
284         vmdq_queue_base = dev_info.vmdq_queue_base;
285         vmdq_pool_base  = dev_info.vmdq_pool_base;
286         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
287                 num_pf_queues, num_devices, queues_per_pool);
288
289         if (!rte_eth_dev_is_valid_port(port))
290                 return -1;
291
292         rx_rings = (uint16_t)dev_info.max_rx_queues;
293         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
294                 port_conf.txmode.offloads |=
295                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
296         /* Configure ethernet device. */
297         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
298         if (retval != 0) {
299                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
300                         port, strerror(-retval));
301                 return retval;
302         }
303
304         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
305                 &tx_ring_size);
306         if (retval != 0) {
307                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
308                         "for port %u: %s.\n", port, strerror(-retval));
309                 return retval;
310         }
311         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
312                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
313                         "for Rx queues on port %u.\n", port);
314                 return -1;
315         }
316
317         /* Setup the queues. */
318         rxconf->offloads = port_conf.rxmode.offloads;
319         for (q = 0; q < rx_rings; q ++) {
320                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
321                                                 rte_eth_dev_socket_id(port),
322                                                 rxconf,
323                                                 mbuf_pool);
324                 if (retval < 0) {
325                         RTE_LOG(ERR, VHOST_PORT,
326                                 "Failed to setup rx queue %u of port %u: %s.\n",
327                                 q, port, strerror(-retval));
328                         return retval;
329                 }
330         }
331         txconf->offloads = port_conf.txmode.offloads;
332         for (q = 0; q < tx_rings; q ++) {
333                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
334                                                 rte_eth_dev_socket_id(port),
335                                                 txconf);
336                 if (retval < 0) {
337                         RTE_LOG(ERR, VHOST_PORT,
338                                 "Failed to setup tx queue %u of port %u: %s.\n",
339                                 q, port, strerror(-retval));
340                         return retval;
341                 }
342         }
343
344         /* Start the device. */
345         retval  = rte_eth_dev_start(port);
346         if (retval < 0) {
347                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
348                         port, strerror(-retval));
349                 return retval;
350         }
351
352         if (promiscuous) {
353                 retval = rte_eth_promiscuous_enable(port);
354                 if (retval != 0) {
355                         RTE_LOG(ERR, VHOST_PORT,
356                                 "Failed to enable promiscuous mode on port %u: %s\n",
357                                 port, rte_strerror(-retval));
358                         return retval;
359                 }
360         }
361
362         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
363         if (retval < 0) {
364                 RTE_LOG(ERR, VHOST_PORT,
365                         "Failed to get MAC address on port %u: %s\n",
366                         port, rte_strerror(-retval));
367                 return retval;
368         }
369
370         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
371         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
372                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
373                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
374
375         return 0;
376 }
377
378 /*
379  * Set socket file path.
380  */
381 static int
382 us_vhost_parse_socket_path(const char *q_arg)
383 {
384         char *old;
385
386         /* Reject socket paths that are too long. */
387         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
388                 return -1;
389
390         old = socket_files;
391         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
392         if (socket_files == NULL) {
393                 free(old);
394                 return -1;
395         }
396
397         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
398         nb_sockets++;
399
400         return 0;
401 }
402
403 /*
404  * Parse the portmask provided at run time.
405  */
406 static int
407 parse_portmask(const char *portmask)
408 {
409         char *end = NULL;
410         unsigned long pm;
411
412         errno = 0;
413
414         /* parse hexadecimal string */
415         pm = strtoul(portmask, &end, 16);
416         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
417                 return 0;
418
419         return pm;
420
421 }
422
423 /*
424  * Parse num options at run time.
425  */
426 static int
427 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
428 {
429         char *end = NULL;
430         unsigned long num;
431
432         errno = 0;
433
434         /* parse unsigned int string */
435         num = strtoul(q_arg, &end, 10);
436         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
437                 return -1;
438
439         if (num > max_valid_value)
440                 return -1;
441
442         return num;
443
444 }
445
446 /*
447  * Display usage
448  */
449 static void
450 us_vhost_usage(const char *prgname)
451 {
452         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
453         "               --vm2vm [0|1|2]\n"
454         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
455         "               --socket-file <path>\n"
456         "               --nb-devices ND\n"
457         "               -p PORTMASK: Set mask for ports to be used by application\n"
458         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
459         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
460         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if rx retries are enabled\n"
461         "               --rx-retry-num [0-N]: the number of retries on rx. This only takes effect if rx retries are enabled\n"
462         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
463         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
464         "               --socket-file: The path of the socket file.\n"
465         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
466         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
467         "               --client register a vhost-user socket as client mode.\n"
468         "               --dma-type: DMA type for the vhost async driver. Only \"ioat\" is supported for now.\n"
469         "               --dmas: register a DMA channel for a specific vhost device.\n",
470                prgname);
471 }
472
473 enum {
474 #define OPT_VM2VM               "vm2vm"
475         OPT_VM2VM_NUM = 256,
476 #define OPT_RX_RETRY            "rx-retry"
477         OPT_RX_RETRY_NUM,
478 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
479         OPT_RX_RETRY_DELAY_NUM,
480 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
481         OPT_RX_RETRY_NUMB_NUM,
482 #define OPT_MERGEABLE           "mergeable"
483         OPT_MERGEABLE_NUM,
484 #define OPT_STATS               "stats"
485         OPT_STATS_NUM,
486 #define OPT_SOCKET_FILE         "socket-file"
487         OPT_SOCKET_FILE_NUM,
488 #define OPT_TX_CSUM             "tx-csum"
489         OPT_TX_CSUM_NUM,
490 #define OPT_TSO                 "tso"
491         OPT_TSO_NUM,
492 #define OPT_CLIENT              "client"
493         OPT_CLIENT_NUM,
494 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
495         OPT_BUILTIN_NET_DRIVER_NUM,
496 #define OPT_DMA_TYPE            "dma-type"
497         OPT_DMA_TYPE_NUM,
498 #define OPT_DMAS                "dmas"
499         OPT_DMAS_NUM,
500 };
501
502 /*
503  * Parse the arguments given in the command line of the application.
504  */
505 static int
506 us_vhost_parse_args(int argc, char **argv)
507 {
508         int opt, ret;
509         int option_index;
510         unsigned i;
511         const char *prgname = argv[0];
512         static struct option long_option[] = {
513                 {OPT_VM2VM, required_argument,
514                                 NULL, OPT_VM2VM_NUM},
515                 {OPT_RX_RETRY, required_argument,
516                                 NULL, OPT_RX_RETRY_NUM},
517                 {OPT_RX_RETRY_DELAY, required_argument,
518                                 NULL, OPT_RX_RETRY_DELAY_NUM},
519                 {OPT_RX_RETRY_NUMB, required_argument,
520                                 NULL, OPT_RX_RETRY_NUMB_NUM},
521                 {OPT_MERGEABLE, required_argument,
522                                 NULL, OPT_MERGEABLE_NUM},
523                 {OPT_STATS, required_argument,
524                                 NULL, OPT_STATS_NUM},
525                 {OPT_SOCKET_FILE, required_argument,
526                                 NULL, OPT_SOCKET_FILE_NUM},
527                 {OPT_TX_CSUM, required_argument,
528                                 NULL, OPT_TX_CSUM_NUM},
529                 {OPT_TSO, required_argument,
530                                 NULL, OPT_TSO_NUM},
531                 {OPT_CLIENT, no_argument,
532                                 NULL, OPT_CLIENT_NUM},
533                 {OPT_BUILTIN_NET_DRIVER, no_argument,
534                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
535                 {OPT_DMA_TYPE, required_argument,
536                                 NULL, OPT_DMA_TYPE_NUM},
537                 {OPT_DMAS, required_argument,
538                                 NULL, OPT_DMAS_NUM},
539                 {NULL, 0, 0, 0},
540         };
541
542         /* Parse command line */
543         while ((opt = getopt_long(argc, argv, "p:P",
544                         long_option, &option_index)) != EOF) {
545                 switch (opt) {
546                 /* Portmask */
547                 case 'p':
548                         enabled_port_mask = parse_portmask(optarg);
549                         if (enabled_port_mask == 0) {
550                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
551                                 us_vhost_usage(prgname);
552                                 return -1;
553                         }
554                         break;
555
556                 case 'P':
557                         promiscuous = 1;
558                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
559                                 ETH_VMDQ_ACCEPT_BROADCAST |
560                                 ETH_VMDQ_ACCEPT_MULTICAST;
561                         break;
562
563                 case OPT_VM2VM_NUM:
564                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
565                         if (ret == -1) {
566                                 RTE_LOG(INFO, VHOST_CONFIG,
567                                         "Invalid argument for "
568                                         "vm2vm [0|1|2]\n");
569                                 us_vhost_usage(prgname);
570                                 return -1;
571                         }
572                         vm2vm_mode = (vm2vm_type)ret;
573                         break;
574
575                 case OPT_RX_RETRY_NUM:
576                         ret = parse_num_opt(optarg, 1);
577                         if (ret == -1) {
578                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
579                                 us_vhost_usage(prgname);
580                                 return -1;
581                         }
582                         enable_retry = ret;
583                         break;
584
585                 case OPT_TX_CSUM_NUM:
586                         ret = parse_num_opt(optarg, 1);
587                         if (ret == -1) {
588                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
589                                 us_vhost_usage(prgname);
590                                 return -1;
591                         }
592                         enable_tx_csum = ret;
593                         break;
594
595                 case OPT_TSO_NUM:
596                         ret = parse_num_opt(optarg, 1);
597                         if (ret == -1) {
598                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
599                                 us_vhost_usage(prgname);
600                                 return -1;
601                         }
602                         enable_tso = ret;
603                         break;
604
605                 case OPT_RX_RETRY_DELAY_NUM:
606                         ret = parse_num_opt(optarg, INT32_MAX);
607                         if (ret == -1) {
608                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
609                                 us_vhost_usage(prgname);
610                                 return -1;
611                         }
612                         burst_rx_delay_time = ret;
613                         break;
614
615                 case OPT_RX_RETRY_NUMB_NUM:
616                         ret = parse_num_opt(optarg, INT32_MAX);
617                         if (ret == -1) {
618                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
619                                 us_vhost_usage(prgname);
620                                 return -1;
621                         }
622                         burst_rx_retry_num = ret;
623                         break;
624
625                 case OPT_MERGEABLE_NUM:
626                         ret = parse_num_opt(optarg, 1);
627                         if (ret == -1) {
628                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
629                                 us_vhost_usage(prgname);
630                                 return -1;
631                         }
632                         mergeable = !!ret;
633                         if (ret) {
634                                 vmdq_conf_default.rxmode.offloads |=
635                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
636                                 vmdq_conf_default.rxmode.max_rx_pkt_len
637                                         = JUMBO_FRAME_MAX_SIZE;
638                         }
639                         break;
640
641                 case OPT_STATS_NUM:
642                         ret = parse_num_opt(optarg, INT32_MAX);
643                         if (ret == -1) {
644                                 RTE_LOG(INFO, VHOST_CONFIG,
645                                         "Invalid argument for stats [0..N]\n");
646                                 us_vhost_usage(prgname);
647                                 return -1;
648                         }
649                         enable_stats = ret;
650                         break;
651
652                 /* Set socket file path. */
653                 case OPT_SOCKET_FILE_NUM:
654                         if (us_vhost_parse_socket_path(optarg) == -1) {
655                                 RTE_LOG(INFO, VHOST_CONFIG,
656                                 "Invalid argument for socket name (Max %d characters)\n",
657                                 PATH_MAX);
658                                 us_vhost_usage(prgname);
659                                 return -1;
660                         }
661                         break;
662
663                 case OPT_DMA_TYPE_NUM:
664                         dma_type = optarg;
665                         break;
666
667                 case OPT_DMAS_NUM:
668                         if (open_dma(optarg) == -1) {
669                                 RTE_LOG(INFO, VHOST_CONFIG,
670                                         "Wrong DMA args\n");
671                                 us_vhost_usage(prgname);
672                                 return -1;
673                         }
674                         async_vhost_driver = 1;
675                         break;
676
677                 case OPT_CLIENT_NUM:
678                         client_mode = 1;
679                         break;
680
681                 case OPT_BUILTIN_NET_DRIVER_NUM:
682                         builtin_net_driver = 1;
683                         break;
684
685                 /* Invalid option - print options. */
686                 default:
687                         us_vhost_usage(prgname);
688                         return -1;
689                 }
690         }
691
692         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
693                 if (enabled_port_mask & (1 << i))
694                         ports[num_ports++] = i;
695         }
696
697         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
698                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
699                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
700                 return -1;
701         }
702
703         return 0;
704 }
705
706 /*
707  * Update the global variable num_ports and the ports array according to the
708  * number of ports in the system, and return the number of valid ports.
709  */
710 static unsigned check_ports_num(unsigned nb_ports)
711 {
712         unsigned valid_num_ports = num_ports;
713         unsigned portid;
714
715         if (num_ports > nb_ports) {
716                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
717                         num_ports, nb_ports);
718                 num_ports = nb_ports;
719         }
720
721         for (portid = 0; portid < num_ports; portid ++) {
722                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
723                         RTE_LOG(INFO, VHOST_PORT,
724                                 "\nSpecified port ID(%u) is not valid\n",
725                                 ports[portid]);
726                         ports[portid] = INVALID_PORT_ID;
727                         valid_num_ports--;
728                 }
729         }
730         return valid_num_ports;
731 }
732
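/* Find the vhost device that owns the given MAC address, if it is ready for RX. */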
733 static __rte_always_inline struct vhost_dev *
734 find_vhost_dev(struct rte_ether_addr *mac)
735 {
736         struct vhost_dev *vdev;
737
738         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
739                 if (vdev->ready == DEVICE_RX &&
740                     rte_is_same_ether_addr(mac, &vdev->mac_address))
741                         return vdev;
742         }
743
744         return NULL;
745 }
746
747 /*
748  * This function learns the MAC address of the device and registers it, along
749  * with a VLAN tag, with the VMDQ.
750  */
751 static int
752 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
753 {
754         struct rte_ether_hdr *pkt_hdr;
755         int i, ret;
756
757         /* Learn MAC address of guest device from packet */
758         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
759
760         if (find_vhost_dev(&pkt_hdr->src_addr)) {
761                 RTE_LOG(ERR, VHOST_DATA,
762                         "(%d) device is using a registered MAC!\n",
763                         vdev->vid);
764                 return -1;
765         }
766
767         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
768                 vdev->mac_address.addr_bytes[i] =
769                         pkt_hdr->src_addr.addr_bytes[i];
770
771         /* vlan_tag currently uses the device_id. */
772         vdev->vlan_tag = vlan_tags[vdev->vid];
773
774         /* Print out VMDQ registration info. */
775         RTE_LOG(INFO, VHOST_DATA,
776                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
777                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
778                 vdev->vlan_tag);
779
780         /* Register the MAC address. */
781         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
782                                 (uint32_t)vdev->vid + vmdq_pool_base);
783         if (ret)
784                 RTE_LOG(ERR, VHOST_DATA,
785                         "(%d) failed to add device MAC address to VMDQ\n",
786                         vdev->vid);
787
788         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
789
790         /* Set device as ready for RX. */
791         vdev->ready = DEVICE_RX;
792
793         return 0;
794 }
795
796 /*
797  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
798  * queue before disabling RX on the device.
799  */
800 static inline void
801 unlink_vmdq(struct vhost_dev *vdev)
802 {
803         unsigned i = 0;
804         unsigned rx_count;
805         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
806
807         if (vdev->ready == DEVICE_RX) {
808                 /* Clear MAC and VLAN settings. */
809                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
810                 for (i = 0; i < 6; i++)
811                         vdev->mac_address.addr_bytes[i] = 0;
812
813                 vdev->vlan_tag = 0;
814
815                 /* Clear out the receive buffers. */
816                 rx_count = rte_eth_rx_burst(ports[0],
817                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
818
819                 while (rx_count) {
820                         for (i = 0; i < rx_count; i++)
821                                 rte_pktmbuf_free(pkts_burst[i]);
822
823                         rx_count = rte_eth_rx_burst(ports[0],
824                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
825                 }
826
827                 vdev->ready = DEVICE_MAC_LEARNING;
828         }
829 }
830
831 static inline void
832 free_pkts(struct rte_mbuf **pkts, uint16_t n)
833 {
834         while (n--)
835                 rte_pktmbuf_free(pkts[n]);
836 }
837
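/* Reap completed async enqueue copies: free the source mbufs and decrement the in-flight counter. */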
838 static __rte_always_inline void
839 complete_async_pkts(struct vhost_dev *vdev)
840 {
841         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
842         uint16_t complete_count;
843
844         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
845                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
846         if (complete_count) {
847                 free_pkts(p_cpl, complete_count);
848                 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
849         }
850
851 }
852
853 static __rte_always_inline void
854 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
855             struct rte_mbuf *m)
856 {
857         uint16_t ret;
858
859         if (builtin_net_driver) {
860                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
861         } else {
862                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
863         }
864
865         if (enable_stats) {
866                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
867                                 __ATOMIC_SEQ_CST);
868                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
869                                 __ATOMIC_SEQ_CST);
870                 src_vdev->stats.tx_total++;
871                 src_vdev->stats.tx += ret;
872         }
873 }
874
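/*
 * Flush this lcore's TX buffer for the given vhost device into its RX ring,
 * using the builtin net driver, the async enqueue path or the sync path.
 */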
875 static __rte_always_inline void
876 drain_vhost(struct vhost_dev *vdev)
877 {
878         uint16_t ret;
879         uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
880         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
881         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
882
883         if (builtin_net_driver) {
884                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
885         } else if (async_vhost_driver) {
886                 uint16_t enqueue_fail = 0;
887
888                 complete_async_pkts(vdev);
889                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
890                 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
891
892                 enqueue_fail = nr_xmit - ret;
893                 if (enqueue_fail)
894                         free_pkts(&m[ret], nr_xmit - ret);
895         } else {
896                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
897                                                 m, nr_xmit);
898         }
899
900         if (enable_stats) {
901                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
902                                 __ATOMIC_SEQ_CST);
903                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
904                                 __ATOMIC_SEQ_CST);
905         }
906
907         if (!async_vhost_driver)
908                 free_pkts(m, nr_xmit);
909 }
910
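/* Flush every per-device vhost TX buffer owned by this lcore once its drain timeout expires. */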
911 static __rte_always_inline void
912 drain_vhost_table(void)
913 {
914         uint16_t lcore_id = rte_lcore_id();
915         struct vhost_bufftable *vhost_txq;
916         struct vhost_dev *vdev;
917         uint64_t cur_tsc;
918
919         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
920                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
921                                                 + vdev->vid];
922
923                 cur_tsc = rte_rdtsc();
924                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
925                                 > MBUF_TABLE_DRAIN_TSC)) {
926                         RTE_LOG_DP(DEBUG, VHOST_DATA,
927                                 "Vhost TX queue drained after timeout with burst size %u\n",
928                                 vhost_txq->len);
929                         drain_vhost(vdev);
930                         vhost_txq->len = 0;
931                         vhost_txq->pre_tsc = cur_tsc;
932                 }
933         }
934 }
935
936 /*
937  * Check if the packet destination MAC address is for a local device. If so then put
938  * the packet on that device's RX queue. If not then return.
939  */
940 static __rte_always_inline int
941 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
942 {
943         struct rte_ether_hdr *pkt_hdr;
944         struct vhost_dev *dst_vdev;
945         struct vhost_bufftable *vhost_txq;
946         uint16_t lcore_id = rte_lcore_id();
947         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
948
949         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
950         if (!dst_vdev)
951                 return -1;
952
953         if (vdev->vid == dst_vdev->vid) {
954                 RTE_LOG_DP(DEBUG, VHOST_DATA,
955                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
956                         vdev->vid);
957                 return 0;
958         }
959
960         RTE_LOG_DP(DEBUG, VHOST_DATA,
961                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
962
963         if (unlikely(dst_vdev->remove)) {
964                 RTE_LOG_DP(DEBUG, VHOST_DATA,
965                         "(%d) device is marked for removal\n", dst_vdev->vid);
966                 return 0;
967         }
968
969         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
970         vhost_txq->m_table[vhost_txq->len++] = m;
971
972         if (enable_stats) {
973                 vdev->stats.tx_total++;
974                 vdev->stats.tx++;
975         }
976
977         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
978                 drain_vhost(dst_vdev);
979                 vhost_txq->len = 0;
980                 vhost_txq->pre_tsc = rte_rdtsc();
981         }
982         return 0;
983 }
984
985 /*
986  * Check if the destination MAC of a packet belongs to a local VM;
987  * if it does, get its VLAN tag and offset.
988  */
989 static __rte_always_inline int
990 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
991         uint32_t *offset, uint16_t *vlan_tag)
992 {
993         struct vhost_dev *dst_vdev;
994         struct rte_ether_hdr *pkt_hdr =
995                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
996
997         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
998         if (!dst_vdev)
999                 return 0;
1000
1001         if (vdev->vid == dst_vdev->vid) {
1002                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1003                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1004                         vdev->vid);
1005                 return -1;
1006         }
1007
1008         /*
1009          * HW VLAN strip reduces the packet length by the length
1010          * of the VLAN tag, so the packet length needs to be
1011          * restored by adding it back.
1012          */
1013         *offset  = VLAN_HLEN;
1014         *vlan_tag = vlan_tags[vdev->vid];
1015
1016         RTE_LOG_DP(DEBUG, VHOST_DATA,
1017                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1018                 vdev->vid, dst_vdev->vid, *vlan_tag);
1019
1020         return 0;
1021 }
1022
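/* Prepare a TSO packet for the NIC: parse the header lengths, set the TSO/checksum offload flags and fill in the pseudo-header checksum. */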
1023 static void virtio_tx_offload(struct rte_mbuf *m)
1024 {
1025         struct rte_net_hdr_lens hdr_lens;
1026         struct rte_ipv4_hdr *ipv4_hdr;
1027         struct rte_tcp_hdr *tcp_hdr;
1028         uint32_t ptype;
1029         void *l3_hdr;
1030
1031         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1032         m->l2_len = hdr_lens.l2_len;
1033         m->l3_len = hdr_lens.l3_len;
1034         m->l4_len = hdr_lens.l4_len;
1035
1036         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1037         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1038                 m->l2_len + m->l3_len);
1039
1040         m->ol_flags |= PKT_TX_TCP_SEG;
1041         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1042                 m->ol_flags |= PKT_TX_IPV4;
1043                 m->ol_flags |= PKT_TX_IP_CKSUM;
1044                 ipv4_hdr = l3_hdr;
1045                 ipv4_hdr->hdr_checksum = 0;
1046                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1047         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1048                 m->ol_flags |= PKT_TX_IPV6;
1049                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1050         }
1051 }
1052
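/* Transmit the buffered packets on the physical port and free any that were not sent. */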
1053 static __rte_always_inline void
1054 do_drain_mbuf_table(struct mbuf_table *tx_q)
1055 {
1056         uint16_t count;
1057
1058         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1059                                  tx_q->m_table, tx_q->len);
1060         if (unlikely(count < tx_q->len))
1061                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1062
1063         tx_q->len = 0;
1064 }
1065
1066 /*
1067  * This function routes the TX packet to the correct interface. This
1068  * may be a local device or the physical port.
1069  */
1070 static __rte_always_inline void
1071 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1072 {
1073         struct mbuf_table *tx_q;
1074         unsigned offset = 0;
1075         const uint16_t lcore_id = rte_lcore_id();
1076         struct rte_ether_hdr *nh;
1077
1078
1079         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1080         if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1081                 struct vhost_dev *vdev2;
1082
1083                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1084                         if (vdev2 != vdev)
1085                                 sync_virtio_xmit(vdev2, vdev, m);
1086                 }
1087                 goto queue2nic;
1088         }
1089
1090         /* Check if the destination is a local VM. */
1091         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1092                 return;
1093
1094         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1095                 if (unlikely(find_local_dest(vdev, m, &offset,
1096                                              &vlan_tag) != 0)) {
1097                         rte_pktmbuf_free(m);
1098                         return;
1099                 }
1100         }
1101
1102         RTE_LOG_DP(DEBUG, VHOST_DATA,
1103                 "(%d) TX: MAC address is external\n", vdev->vid);
1104
1105 queue2nic:
1106
1107         /* Add the packet to the port TX queue. */
1108         tx_q = &lcore_tx_queue[lcore_id];
1109
1110         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1111         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1112                 /* Guest has inserted the vlan tag. */
1113                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1114                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1115                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1116                         (vh->vlan_tci != vlan_tag_be))
1117                         vh->vlan_tci = vlan_tag_be;
1118         } else {
1119                 m->ol_flags |= PKT_TX_VLAN_PKT;
1120
1121                 /*
1122                  * Find the right segment in which to adjust the data length
1123                  * when the offset is bigger than the tailroom size.
1124                  */
1125                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1126                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1127                                 m->data_len += offset;
1128                         else {
1129                                 struct rte_mbuf *seg = m;
1130
1131                                 while ((seg->next != NULL) &&
1132                                         (offset > rte_pktmbuf_tailroom(seg)))
1133                                         seg = seg->next;
1134
1135                                 seg->data_len += offset;
1136                         }
1137                         m->pkt_len += offset;
1138                 }
1139
1140                 m->vlan_tci = vlan_tag;
1141         }
1142
1143         if (m->ol_flags & PKT_RX_LRO)
1144                 virtio_tx_offload(m);
1145
1146         tx_q->m_table[tx_q->len++] = m;
1147         if (enable_stats) {
1148                 vdev->stats.tx_total++;
1149                 vdev->stats.tx++;
1150         }
1151
1152         if (unlikely(tx_q->len == MAX_PKT_BURST))
1153                 do_drain_mbuf_table(tx_q);
1154 }
1155
1156
1157 static __rte_always_inline void
1158 drain_mbuf_table(struct mbuf_table *tx_q)
1159 {
1160         static uint64_t prev_tsc;
1161         uint64_t cur_tsc;
1162
1163         if (tx_q->len == 0)
1164                 return;
1165
1166         cur_tsc = rte_rdtsc();
1167         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1168                 prev_tsc = cur_tsc;
1169
1170                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1171                         "TX queue drained after timeout with burst size %u\n",
1172                         tx_q->len);
1173                 do_drain_mbuf_table(tx_q);
1174         }
1175 }
1176
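/* Receive a burst from the physical port and enqueue it to the guest RX ring, retrying while the ring lacks space if retries are enabled. */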
1177 static __rte_always_inline void
1178 drain_eth_rx(struct vhost_dev *vdev)
1179 {
1180         uint16_t rx_count, enqueue_count;
1181         struct rte_mbuf *pkts[MAX_PKT_BURST];
1182
1183         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1184                                     pkts, MAX_PKT_BURST);
1185
1186         if (!rx_count)
1187                 return;
1188
1189         /*
1190          * When "enable_retry" is set, wait and retry when there are not
1191          * enough free slots in the queue to hold @rx_count packets,
1192          * to reduce packet loss.
1193          */
1194         if (enable_retry &&
1195             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1196                         VIRTIO_RXQ))) {
1197                 uint32_t retry;
1198
1199                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1200                         rte_delay_us(burst_rx_delay_time);
1201                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1202                                         VIRTIO_RXQ))
1203                                 break;
1204                 }
1205         }
1206
1207         if (builtin_net_driver) {
1208                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1209                                                 pkts, rx_count);
1210         } else if (async_vhost_driver) {
1211                 uint16_t enqueue_fail = 0;
1212
1213                 complete_async_pkts(vdev);
1214                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1215                                         VIRTIO_RXQ, pkts, rx_count);
1216                 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1217
1218                 enqueue_fail = rx_count - enqueue_count;
1219                 if (enqueue_fail)
1220                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1221
1222         } else {
1223                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1224                                                 pkts, rx_count);
1225         }
1226
1227         if (enable_stats) {
1228                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1229                                 __ATOMIC_SEQ_CST);
1230                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1231                                 __ATOMIC_SEQ_CST);
1232         }
1233
1234         if (!async_vhost_driver)
1235                 free_pkts(pkts, rx_count);
1236 }
1237
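/* Dequeue packets from the guest TX ring and route each one to its destination (another vhost device or the physical port). */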
1238 static __rte_always_inline void
1239 drain_virtio_tx(struct vhost_dev *vdev)
1240 {
1241         struct rte_mbuf *pkts[MAX_PKT_BURST];
1242         uint16_t count;
1243         uint16_t i;
1244
1245         if (builtin_net_driver) {
1246                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1247                                         pkts, MAX_PKT_BURST);
1248         } else {
1249                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1250                                         mbuf_pool, pkts, MAX_PKT_BURST);
1251         }
1252
1253         /* setup VMDq for the first packet */
1254         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1255                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1256                         free_pkts(pkts, count);
1257         }
1258
1259         for (i = 0; i < count; ++i)
1260                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1261 }
1262
1263 /*
1264  * Main function of vhost-switch. It basically does:
1265  *
1266  * for each vhost device {
1267  *    - drain_eth_rx()
1268  *
1269  *      Which drains the host eth Rx queue linked to the vhost device,
1270  *      and delivers all of the packets to the guest virtio Rx ring
1271  *      associated with this vhost device.
1272  *
1273  *    - drain_virtio_tx()
1274  *
1275  *      Which drains the guest virtio Tx queue and delivers all of the
1276  *      packets to the target, which could be another vhost device or the
1277  *      physical eth dev. The routing is done in function "virtio_tx_route".
1278  * }
1279  */
1280 static int
1281 switch_worker(void *arg __rte_unused)
1282 {
1283         unsigned i;
1284         unsigned lcore_id = rte_lcore_id();
1285         struct vhost_dev *vdev;
1286         struct mbuf_table *tx_q;
1287
1288         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1289
1290         tx_q = &lcore_tx_queue[lcore_id];
1291         for (i = 0; i < rte_lcore_count(); i++) {
1292                 if (lcore_ids[i] == lcore_id) {
1293                         tx_q->txq_id = i;
1294                         break;
1295                 }
1296         }
1297
1298         while(1) {
1299                 drain_mbuf_table(tx_q);
1300                 drain_vhost_table();
1301                 /*
1302                  * If requested, inform the configuration core that we have
1303                  * exited the linked list and that no devices are in use.
1304                  */
1305                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1306                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1307
1308                 /*
1309                  * Process vhost devices
1310                  */
1311                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1312                               lcore_vdev_entry) {
1313                         if (unlikely(vdev->remove)) {
1314                                 unlink_vmdq(vdev);
1315                                 vdev->ready = DEVICE_SAFE_REMOVE;
1316                                 continue;
1317                         }
1318
1319                         if (likely(vdev->ready == DEVICE_RX))
1320                                 drain_eth_rx(vdev);
1321
1322                         if (likely(!vdev->remove))
1323                                 drain_virtio_tx(vdev);
1324                 }
1325         }
1326
1327         return 0;
1328 }
1329
1330 /*
1331  * Remove a device from the specific data core linked list and from the
1332  * main linked list. Synchronization occurs through the use of the
1333  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1334  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1335  */
1336 static void
1337 destroy_device(int vid)
1338 {
1339         struct vhost_dev *vdev = NULL;
1340         int lcore;
1341         uint16_t i;
1342
1343         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1344                 if (vdev->vid == vid)
1345                         break;
1346         }
1347         if (!vdev)
1348                 return;
1349         /* Set the remove flag. */
1350         vdev->remove = 1;
1351         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1352                 rte_pause();
1353         }
1354
1355         for (i = 0; i < RTE_MAX_LCORE; i++)
1356                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1357
1358         if (builtin_net_driver)
1359                 vs_vhost_net_remove(vdev);
1360
1361         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1362                      lcore_vdev_entry);
1363         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1364
1365
1366         /* Set the dev_removal_flag on each lcore. */
1367         RTE_LCORE_FOREACH_WORKER(lcore)
1368                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1369
1370         /*
1371          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1372          * we can be sure that they can no longer access the device removed
1373          * from the linked lists and that the devices are no longer in use.
1374          */
1375         RTE_LCORE_FOREACH_WORKER(lcore) {
1376                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1377                         rte_pause();
1378         }
1379
1380         lcore_info[vdev->coreid].device_num--;
1381
1382         RTE_LOG(INFO, VHOST_DATA,
1383                 "(%d) device has been removed from data core\n",
1384                 vdev->vid);
1385
1386         if (async_vhost_driver) {
1387                 uint16_t n_pkt = 0;
1388                 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1389
1390                 while (vdev->pkts_inflight) {
1391                         n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1392                                                 m_cpl, vdev->pkts_inflight);
1393                         free_pkts(m_cpl, n_pkt);
1394                         __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1395                 }
1396
1397                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1398         }
1399
1400         rte_free(vdev);
1401 }
1402
1403 /*
1404  * A new device is added to a data core. First the device is added to the main linked list
1405  * and then allocated to a specific data core.
1406  */
1407 static int
1408 new_device(int vid)
1409 {
1410         int lcore, core_add = 0;
1411         uint16_t i;
1412         uint32_t device_num_min = num_devices;
1413         struct vhost_dev *vdev;
1414         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1415         if (vdev == NULL) {
1416                 RTE_LOG(INFO, VHOST_DATA,
1417                         "(%d) couldn't allocate memory for vhost dev\n",
1418                         vid);
1419                 return -1;
1420         }
1421         vdev->vid = vid;
1422
1423         for (i = 0; i < RTE_MAX_LCORE; i++) {
1424                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1425                         = rte_zmalloc("vhost bufftable",
1426                                 sizeof(struct vhost_bufftable),
1427                                 RTE_CACHE_LINE_SIZE);
1428
1429                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1430                         RTE_LOG(INFO, VHOST_DATA,
1431                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1432                         return -1;
1433                 }
1434         }
1435
1436         if (builtin_net_driver)
1437                 vs_vhost_net_setup(vdev);
1438
1439         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1440         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1441
1442         /* Reset the ready flag. */
1443         vdev->ready = DEVICE_MAC_LEARNING;
1444         vdev->remove = 0;
1445
1446         /* Find a suitable lcore to add the device. */
1447         RTE_LCORE_FOREACH_WORKER(lcore) {
1448                 if (lcore_info[lcore].device_num < device_num_min) {
1449                         device_num_min = lcore_info[lcore].device_num;
1450                         core_add = lcore;
1451                 }
1452         }
1453         vdev->coreid = core_add;
1454
1455         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1456                           lcore_vdev_entry);
1457         lcore_info[vdev->coreid].device_num++;
1458
	/* Disable guest notifications; the switching cores poll the vrings. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	if (async_vhost_driver) {
		struct rte_vhost_async_config config = {0};
		struct rte_vhost_async_channel_ops channel_ops;

		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
			channel_ops.transfer_data = ioat_transfer_data_cb;
			channel_ops.check_completed_copies =
				ioat_check_completed_copies_cb;

			config.features = RTE_VHOST_ASYNC_INORDER;

			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
				config, &channel_ops);
		}
	}

	return 0;
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (queue_id != VIRTIO_RXQ)
		return 0;

	if (async_vhost_driver) {
		if (!enable) {
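			/*
			 * The queue is being disabled: drain any in-flight
			 * async copies for it first.
			 */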
			uint16_t n_pkt = 0;
			struct rte_mbuf *m_cpl[vdev->pkts_inflight];

			while (vdev->pkts_inflight) {
				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
							m_cpl, vdev->pkts_inflight);
				free_pkts(m_cpl, n_pkt);
				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
			}
		}
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when configuration
 * has been fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
	.new_device = new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};

/*
 * This thread wakes up periodically to print statistics if the user has
 * enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
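	/* ANSI escape sequences: clear screen and move cursor to top left. */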
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
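		/* enable_stats doubles as the print interval in seconds. */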
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total   = vdev->stats.tx_total;
			tx         = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total   = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total:              %" PRIu64 "\n"
				"TX dropped:            %" PRIu64 "\n"
				"TX successful:         %" PRIu64 "\n"
				"RX total:              %" PRIu64 "\n"
				"RX dropped:            %" PRIu64 "\n"
				"RX successful:         %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Failed to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive SIGINT, unregister the vhost drivers. */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each RX queue reserves @nr_rx_desc mbufs at queue setup stage.
 *
 * - For each switch core (a CPU core that does the packet switching), we
 *   also need to reserve some mbufs for receiving the packets from the
 *   virtio Tx queue. How many are enough depends on the usage. It's
 *   normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure that, for each switch core, we have
 *   allocated enough mbufs to fill up the mbuf cache.
 */
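/*
 * A rough worked example (illustrative only, assuming the defaults in this
 * file: mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE = 2176, a 128-byte
 * RTE_PKTMBUF_HEADROOM, MAX_PKT_BURST = 32, nr_rx_desc = 1024 and
 * mtu = 1500, i.e. neither mergeable buffers nor TSO):
 *
 *     nr_mbufs_per_core  = (1500 + 2176) * 32 / (2176 - 128) = 57
 *     nr_mbufs_per_core += 1024                              -> 1081
 *
 * so once the per-queue reservation of nr_queues * nr_rx_desc is added,
 * the pool size is dominated by the RX descriptor reservations.
 */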
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}

/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS,
	 * and get the value of VALID_NUM_PORTS according to the number
	 * of ports in the system.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We should probably only allocate for the
	 * queues we are actually going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loopback so the NIC's L2 switch forwards VM2VM traffic. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

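		/*
		 * RTE_VHOST_USER_ASYNC_COPY tells the vhost library that this
		 * socket will use the asynchronous (DMA-assisted) data path.
		 */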
		if (async_vhost_driver)
			flags |= RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

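		/*
		 * Disable the virtio features that correspond to application
		 * options the user has turned off.
		 */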
		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}