vhost: remove copy threshold for async path
[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "ioat.h"
29 #include "main.h"
30
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37
38 #define MBUF_CACHE_SIZE 128
39 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
40
41 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
42
43 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
45
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX                       1
51 #define DEVICE_SAFE_REMOVE      2
52
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56
57 #define INVALID_PORT_ID 0xFF
58
59 /* mask of enabled ports */
60 static uint32_t enabled_port_mask = 0;
61
62 /* Promiscuous mode */
63 static uint32_t promiscuous;
64
65 /* number of devices/queues to support */
66 static uint32_t num_queues = 0;
67 static uint32_t num_devices;
68
69 static struct rte_mempool *mbuf_pool;
70 static int mergeable;
71
72 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
73 typedef enum {
74         VM2VM_DISABLED = 0,
75         VM2VM_SOFTWARE = 1,
76         VM2VM_HARDWARE = 2,
77         VM2VM_LAST
78 } vm2vm_type;
79 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
80
81 /* Enable stats. */
82 static uint32_t enable_stats = 0;
83 /* Enable retries on RX. */
84 static uint32_t enable_retry = 1;
85
86 /* Disable TX checksum offload */
87 static uint32_t enable_tx_csum;
88
89 /* Disable TSO offload */
90 static uint32_t enable_tso;
91
92 static int client_mode;
93
94 static int builtin_net_driver;
95
96 static int async_vhost_driver;
97
98 static char *dma_type;
99
100 /* Specify timeout (in microseconds) between retries on RX. */
101 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
102 /* Specify the number of retries on RX. */
103 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
104
105 /* Socket file paths. Can be set by user */
106 static char *socket_files;
107 static int nb_sockets;
108
109 /* empty vmdq configuration structure. Filled in programmatically */
110 static struct rte_eth_conf vmdq_conf_default = {
111         .rxmode = {
112                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
113                 .split_hdr_size = 0,
114                 /*
115                  * VLAN strip is necessary for 1G NICs such as I350;
116                  * without it, IPv4 forwarding in the guest cannot
117                  * forward packets from one virtio dev to another virtio dev.
118                  */
119                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
120         },
121
122         .txmode = {
123                 .mq_mode = ETH_MQ_TX_NONE,
124                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
125                              DEV_TX_OFFLOAD_TCP_CKSUM |
126                              DEV_TX_OFFLOAD_VLAN_INSERT |
127                              DEV_TX_OFFLOAD_MULTI_SEGS |
128                              DEV_TX_OFFLOAD_TCP_TSO),
129         },
130         .rx_adv_conf = {
131                 /*
132                  * should be overridden separately in code with
133                  * appropriate values
134                  */
135                 .vmdq_rx_conf = {
136                         .nb_queue_pools = ETH_8_POOLS,
137                         .enable_default_pool = 0,
138                         .default_pool = 0,
139                         .nb_pool_maps = 0,
140                         .pool_map = {{0, 0},},
141                 },
142         },
143 };
144
145
146 static unsigned lcore_ids[RTE_MAX_LCORE];
147 static uint16_t ports[RTE_MAX_ETHPORTS];
148 static unsigned num_ports = 0; /**< The number of ports specified in command line */
149 static uint16_t num_pf_queues, num_vmdq_queues;
150 static uint16_t vmdq_pool_base, vmdq_queue_base;
151 static uint16_t queues_per_pool;
152
153 const uint16_t vlan_tags[] = {
154         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
155         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
156         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
157         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
158         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
159         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
160         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
161         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
162 };
163
164 /* ethernet addresses of ports */
165 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
166
167 static struct vhost_dev_tailq_list vhost_dev_list =
168         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
169
170 static struct lcore_info lcore_info[RTE_MAX_LCORE];
171
172 /* Used for queueing bursts of TX packets. */
173 struct mbuf_table {
174         unsigned len;
175         unsigned txq_id;
176         struct rte_mbuf *m_table[MAX_PKT_BURST];
177 };
178
179 struct vhost_bufftable {
180         uint32_t len;
181         uint64_t pre_tsc;
182         struct rte_mbuf *m_table[MAX_PKT_BURST];
183 };
184
185 /* TX queue for each data core. */
186 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
187
188 /*
189  * Vhost TX buffer for each data core.
190  * Every data core maintains a TX buffer for every vhost device,
191  * which is used for batch pkts enqueue for higher performance.
192  */
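/* Indexed by (lcore_id * MAX_VHOST_DEVICE + vid). */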
193 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
194
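/*
 * Number of TSC cycles in BURST_TX_DRAIN_US microseconds; the cycles-per-
 * microsecond value is rounded up so the drain interval never falls short.
 */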
195 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
196                                  / US_PER_S * BURST_TX_DRAIN_US)
197 #define VLAN_HLEN       4
198
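/*
 * Open the DMA channel(s) described by the --dmas argument; only the "ioat"
 * dma-type is currently handled.
 */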
199 static inline int
200 open_dma(const char *value)
201 {
202         if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
203                 return open_ioat(value);
204
205         return -1;
206 }
207
208 /*
209  * Builds up the correct configuration for VMDQ VLAN pool map
210  * according to the pool & queue limits.
211  */
212 static inline int
213 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
214 {
215         struct rte_eth_vmdq_rx_conf conf;
216         struct rte_eth_vmdq_rx_conf *def_conf =
217                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
218         unsigned i;
219
220         memset(&conf, 0, sizeof(conf));
221         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
222         conf.nb_pool_maps = num_devices;
223         conf.enable_loop_back = def_conf->enable_loop_back;
224         conf.rx_mode = def_conf->rx_mode;
225
226         for (i = 0; i < conf.nb_pool_maps; i++) {
227                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
228                 conf.pool_map[i].pools = (1UL << i);
229         }
230
231         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
232         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
233                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
234         return 0;
235 }
236
237 /*
238  * Initialises a given port using global settings and with the rx buffers
239  * coming from the mbuf_pool passed as parameter
240  */
241 static inline int
242 port_init(uint16_t port)
243 {
244         struct rte_eth_dev_info dev_info;
245         struct rte_eth_conf port_conf;
246         struct rte_eth_rxconf *rxconf;
247         struct rte_eth_txconf *txconf;
248         int16_t rx_rings, tx_rings;
249         uint16_t rx_ring_size, tx_ring_size;
250         int retval;
251         uint16_t q;
252
253         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
254         retval = rte_eth_dev_info_get(port, &dev_info);
255         if (retval != 0) {
256                 RTE_LOG(ERR, VHOST_PORT,
257                         "Error during getting device (port %u) info: %s\n",
258                         port, strerror(-retval));
259
260                 return retval;
261         }
262
263         rxconf = &dev_info.default_rxconf;
264         txconf = &dev_info.default_txconf;
265         rxconf->rx_drop_en = 1;
266
267         /* configure the number of supported virtio devices based on VMDQ limits */
268         num_devices = dev_info.max_vmdq_pools;
269
270         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
271         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
272
273         tx_rings = (uint16_t)rte_lcore_count();
274
275         /* Get port configuration. */
276         retval = get_eth_conf(&port_conf, num_devices);
277         if (retval < 0)
278                 return retval;
279         /* NIC queues are divided into pf queues and vmdq queues.  */
280         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
281         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
282         num_vmdq_queues = num_devices * queues_per_pool;
283         num_queues = num_pf_queues + num_vmdq_queues;
284         vmdq_queue_base = dev_info.vmdq_queue_base;
285         vmdq_pool_base  = dev_info.vmdq_pool_base;
286         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
287                 num_pf_queues, num_devices, queues_per_pool);
288
289         if (!rte_eth_dev_is_valid_port(port))
290                 return -1;
291
292         rx_rings = (uint16_t)dev_info.max_rx_queues;
293         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
294                 port_conf.txmode.offloads |=
295                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
296         /* Configure ethernet device. */
297         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
298         if (retval != 0) {
299                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
300                         port, strerror(-retval));
301                 return retval;
302         }
303
304         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
305                 &tx_ring_size);
306         if (retval != 0) {
307                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
308                         "for port %u: %s.\n", port, strerror(-retval));
309                 return retval;
310         }
311         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
312                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
313                         "for Rx queues on port %u.\n", port);
314                 return -1;
315         }
316
317         /* Setup the queues. */
318         rxconf->offloads = port_conf.rxmode.offloads;
319         for (q = 0; q < rx_rings; q ++) {
320                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
321                                                 rte_eth_dev_socket_id(port),
322                                                 rxconf,
323                                                 mbuf_pool);
324                 if (retval < 0) {
325                         RTE_LOG(ERR, VHOST_PORT,
326                                 "Failed to setup rx queue %u of port %u: %s.\n",
327                                 q, port, strerror(-retval));
328                         return retval;
329                 }
330         }
331         txconf->offloads = port_conf.txmode.offloads;
332         for (q = 0; q < tx_rings; q ++) {
333                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
334                                                 rte_eth_dev_socket_id(port),
335                                                 txconf);
336                 if (retval < 0) {
337                         RTE_LOG(ERR, VHOST_PORT,
338                                 "Failed to setup tx queue %u of port %u: %s.\n",
339                                 q, port, strerror(-retval));
340                         return retval;
341                 }
342         }
343
344         /* Start the device. */
345         retval  = rte_eth_dev_start(port);
346         if (retval < 0) {
347                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
348                         port, strerror(-retval));
349                 return retval;
350         }
351
352         if (promiscuous) {
353                 retval = rte_eth_promiscuous_enable(port);
354                 if (retval != 0) {
355                         RTE_LOG(ERR, VHOST_PORT,
356                                 "Failed to enable promiscuous mode on port %u: %s\n",
357                                 port, rte_strerror(-retval));
358                         return retval;
359                 }
360         }
361
362         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
363         if (retval < 0) {
364                 RTE_LOG(ERR, VHOST_PORT,
365                         "Failed to get MAC address on port %u: %s\n",
366                         port, rte_strerror(-retval));
367                 return retval;
368         }
369
370         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
371         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
372                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
373                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
374
375         return 0;
376 }
377
378 /*
379  * Set socket file path.
380  */
381 static int
382 us_vhost_parse_socket_path(const char *q_arg)
383 {
384         char *old;
385
386         /* reject socket paths that do not fit within PATH_MAX */
387         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
388                 return -1;
389
390         old = socket_files;
391         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
392         if (socket_files == NULL) {
393                 free(old);
394                 return -1;
395         }
396
397         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
398         nb_sockets++;
399
400         return 0;
401 }
402
403 /*
404  * Parse the portmask provided at run time.
405  */
406 static int
407 parse_portmask(const char *portmask)
408 {
409         char *end = NULL;
410         unsigned long pm;
411
412         errno = 0;
413
414         /* parse hexadecimal string */
415         pm = strtoul(portmask, &end, 16);
416         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
417                 return 0;
418
419         return pm;
420
421 }
422
423 /*
424  * Parse num options at run time.
425  */
426 static int
427 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
428 {
429         char *end = NULL;
430         unsigned long num;
431
432         errno = 0;
433
434         /* parse unsigned int string */
435         num = strtoul(q_arg, &end, 10);
436         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
437                 return -1;
438
439         if (num > max_valid_value)
440                 return -1;
441
442         return num;
443
444 }
445
446 /*
447  * Display usage
448  */
449 static void
450 us_vhost_usage(const char *prgname)
451 {
452         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
453         "               --vm2vm [0|1|2]\n"
454         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
455         "               --socket-file <path>\n"
456         "               --nb-devices ND\n"
457         "               -p PORTMASK: Set mask for ports to be used by application\n"
458         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
459         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
460         "               --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
461         "               --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
462         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
463         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
464         "               --socket-file: The path of the socket file.\n"
465         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
466         "               --tso [0|1] disable/enable TCP segment offload.\n"
467         "               --client register a vhost-user socket as client mode.\n"
468         "               --dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
469         "               --dmas register dma channel for specific vhost device.\n",
470                prgname);
471 }
472
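/*
 * getopt_long() values for the long options; they start at 256 so they can
 * never collide with a short option character.
 */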
473 enum {
474 #define OPT_VM2VM               "vm2vm"
475         OPT_VM2VM_NUM = 256,
476 #define OPT_RX_RETRY            "rx-retry"
477         OPT_RX_RETRY_NUM,
478 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
479         OPT_RX_RETRY_DELAY_NUM,
480 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
481         OPT_RX_RETRY_NUMB_NUM,
482 #define OPT_MERGEABLE           "mergeable"
483         OPT_MERGEABLE_NUM,
484 #define OPT_STATS               "stats"
485         OPT_STATS_NUM,
486 #define OPT_SOCKET_FILE         "socket-file"
487         OPT_SOCKET_FILE_NUM,
488 #define OPT_TX_CSUM             "tx-csum"
489         OPT_TX_CSUM_NUM,
490 #define OPT_TSO                 "tso"
491         OPT_TSO_NUM,
492 #define OPT_CLIENT              "client"
493         OPT_CLIENT_NUM,
494 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
495         OPT_BUILTIN_NET_DRIVER_NUM,
496 #define OPT_DMA_TYPE            "dma-type"
497         OPT_DMA_TYPE_NUM,
498 #define OPT_DMAS                "dmas"
499         OPT_DMAS_NUM,
500 };
501
502 /*
503  * Parse the arguments given in the command line of the application.
504  */
505 static int
506 us_vhost_parse_args(int argc, char **argv)
507 {
508         int opt, ret;
509         int option_index;
510         unsigned i;
511         const char *prgname = argv[0];
512         static struct option long_option[] = {
513                 {OPT_VM2VM, required_argument,
514                                 NULL, OPT_VM2VM_NUM},
515                 {OPT_RX_RETRY, required_argument,
516                                 NULL, OPT_RX_RETRY_NUM},
517                 {OPT_RX_RETRY_DELAY, required_argument,
518                                 NULL, OPT_RX_RETRY_DELAY_NUM},
519                 {OPT_RX_RETRY_NUMB, required_argument,
520                                 NULL, OPT_RX_RETRY_NUMB_NUM},
521                 {OPT_MERGEABLE, required_argument,
522                                 NULL, OPT_MERGEABLE_NUM},
523                 {OPT_STATS, required_argument,
524                                 NULL, OPT_STATS_NUM},
525                 {OPT_SOCKET_FILE, required_argument,
526                                 NULL, OPT_SOCKET_FILE_NUM},
527                 {OPT_TX_CSUM, required_argument,
528                                 NULL, OPT_TX_CSUM_NUM},
529                 {OPT_TSO, required_argument,
530                                 NULL, OPT_TSO_NUM},
531                 {OPT_CLIENT, no_argument,
532                                 NULL, OPT_CLIENT_NUM},
533                 {OPT_BUILTIN_NET_DRIVER, no_argument,
534                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
535                 {OPT_DMA_TYPE, required_argument,
536                                 NULL, OPT_DMA_TYPE_NUM},
537                 {OPT_DMAS, required_argument,
538                                 NULL, OPT_DMAS_NUM},
539                 {NULL, 0, 0, 0},
540         };
541
542         /* Parse command line */
543         while ((opt = getopt_long(argc, argv, "p:P",
544                         long_option, &option_index)) != EOF) {
545                 switch (opt) {
546                 /* Portmask */
547                 case 'p':
548                         enabled_port_mask = parse_portmask(optarg);
549                         if (enabled_port_mask == 0) {
550                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
551                                 us_vhost_usage(prgname);
552                                 return -1;
553                         }
554                         break;
555
556                 case 'P':
557                         promiscuous = 1;
558                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
559                                 ETH_VMDQ_ACCEPT_BROADCAST |
560                                 ETH_VMDQ_ACCEPT_MULTICAST;
561                         break;
562
563                 case OPT_VM2VM_NUM:
564                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
565                         if (ret == -1) {
566                                 RTE_LOG(INFO, VHOST_CONFIG,
567                                         "Invalid argument for "
568                                         "vm2vm [0|1|2]\n");
569                                 us_vhost_usage(prgname);
570                                 return -1;
571                         }
572                         vm2vm_mode = (vm2vm_type)ret;
573                         break;
574
575                 case OPT_RX_RETRY_NUM:
576                         ret = parse_num_opt(optarg, 1);
577                         if (ret == -1) {
578                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
579                                 us_vhost_usage(prgname);
580                                 return -1;
581                         }
582                         enable_retry = ret;
583                         break;
584
585                 case OPT_TX_CSUM_NUM:
586                         ret = parse_num_opt(optarg, 1);
587                         if (ret == -1) {
588                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
589                                 us_vhost_usage(prgname);
590                                 return -1;
591                         }
592                         enable_tx_csum = ret;
593                         break;
594
595                 case OPT_TSO_NUM:
596                         ret = parse_num_opt(optarg, 1);
597                         if (ret == -1) {
598                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
599                                 us_vhost_usage(prgname);
600                                 return -1;
601                         }
602                         enable_tso = ret;
603                         break;
604
605                 case OPT_RX_RETRY_DELAY_NUM:
606                         ret = parse_num_opt(optarg, INT32_MAX);
607                         if (ret == -1) {
608                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
609                                 us_vhost_usage(prgname);
610                                 return -1;
611                         }
612                         burst_rx_delay_time = ret;
613                         break;
614
615                 case OPT_RX_RETRY_NUMB_NUM:
616                         ret = parse_num_opt(optarg, INT32_MAX);
617                         if (ret == -1) {
618                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
619                                 us_vhost_usage(prgname);
620                                 return -1;
621                         }
622                         burst_rx_retry_num = ret;
623                         break;
624
625                 case OPT_MERGEABLE_NUM:
626                         ret = parse_num_opt(optarg, 1);
627                         if (ret == -1) {
628                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
629                                 us_vhost_usage(prgname);
630                                 return -1;
631                         }
632                         mergeable = !!ret;
633                         if (ret) {
634                                 vmdq_conf_default.rxmode.offloads |=
635                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
636                                 vmdq_conf_default.rxmode.max_rx_pkt_len
637                                         = JUMBO_FRAME_MAX_SIZE;
638                         }
639                         break;
640
641                 case OPT_STATS_NUM:
642                         ret = parse_num_opt(optarg, INT32_MAX);
643                         if (ret == -1) {
644                                 RTE_LOG(INFO, VHOST_CONFIG,
645                                         "Invalid argument for stats [0..N]\n");
646                                 us_vhost_usage(prgname);
647                                 return -1;
648                         }
649                         enable_stats = ret;
650                         break;
651
652                 /* Set socket file path. */
653                 case OPT_SOCKET_FILE_NUM:
654                         if (us_vhost_parse_socket_path(optarg) == -1) {
655                                 RTE_LOG(INFO, VHOST_CONFIG,
656                                 "Invalid argument for socket name (Max %d characters)\n",
657                                 PATH_MAX);
658                                 us_vhost_usage(prgname);
659                                 return -1;
660                         }
661                         break;
662
663                 case OPT_DMA_TYPE_NUM:
664                         dma_type = optarg;
665                         break;
666
667                 case OPT_DMAS_NUM:
668                         if (open_dma(optarg) == -1) {
669                                 RTE_LOG(INFO, VHOST_CONFIG,
670                                         "Wrong DMA args\n");
671                                 us_vhost_usage(prgname);
672                                 return -1;
673                         }
674                         async_vhost_driver = 1;
675                         break;
676
677                 case OPT_CLIENT_NUM:
678                         client_mode = 1;
679                         break;
680
681                 case OPT_BUILTIN_NET_DRIVER_NUM:
682                         builtin_net_driver = 1;
683                         break;
684
685                 /* Invalid option - print options. */
686                 default:
687                         us_vhost_usage(prgname);
688                         return -1;
689                 }
690         }
691
692         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
693                 if (enabled_port_mask & (1 << i))
694                         ports[num_ports++] = i;
695         }
696
697         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
698                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
699                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
700                 return -1;
701         }
702
703         return 0;
704 }
705
706 /*
707  * Update the global variable num_ports and the ports array according to the
708  * number of ports in the system, and return the number of valid ports.
709  */
710 static unsigned check_ports_num(unsigned nb_ports)
711 {
712         unsigned valid_num_ports = num_ports;
713         unsigned portid;
714
715         if (num_ports > nb_ports) {
716                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
717                         num_ports, nb_ports);
718                 num_ports = nb_ports;
719         }
720
721         for (portid = 0; portid < num_ports; portid ++) {
722                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
723                         RTE_LOG(INFO, VHOST_PORT,
724                                 "\nSpecified port ID(%u) is not valid\n",
725                                 ports[portid]);
726                         ports[portid] = INVALID_PORT_ID;
727                         valid_num_ports--;
728                 }
729         }
730         return valid_num_ports;
731 }
732
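/*
 * Return the vhost device that has learned @mac and is ready for RX,
 * or NULL if no such device exists.
 */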
733 static __rte_always_inline struct vhost_dev *
734 find_vhost_dev(struct rte_ether_addr *mac)
735 {
736         struct vhost_dev *vdev;
737
738         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
739                 if (vdev->ready == DEVICE_RX &&
740                     rte_is_same_ether_addr(mac, &vdev->mac_address))
741                         return vdev;
742         }
743
744         return NULL;
745 }
746
747 /*
748  * This function learns the MAC address of the device and registers this along with a
749  * vlan tag to a VMDQ.
750  */
751 static int
752 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
753 {
754         struct rte_ether_hdr *pkt_hdr;
755         int i, ret;
756
757         /* Learn MAC address of guest device from packet */
758         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
759
760         if (find_vhost_dev(&pkt_hdr->s_addr)) {
761                 RTE_LOG(ERR, VHOST_DATA,
762                         "(%d) device is using a registered MAC!\n",
763                         vdev->vid);
764                 return -1;
765         }
766
767         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
768                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
769
770         /* vlan_tag currently uses the device_id. */
771         vdev->vlan_tag = vlan_tags[vdev->vid];
772
773         /* Print out VMDQ registration info. */
774         RTE_LOG(INFO, VHOST_DATA,
775                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
776                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
777                 vdev->vlan_tag);
778
779         /* Register the MAC address. */
780         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
781                                 (uint32_t)vdev->vid + vmdq_pool_base);
782         if (ret)
783                 RTE_LOG(ERR, VHOST_DATA,
784                         "(%d) failed to add device MAC address to VMDQ\n",
785                         vdev->vid);
786
787         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
788
789         /* Set device as ready for RX. */
790         vdev->ready = DEVICE_RX;
791
792         return 0;
793 }
794
795 /*
796  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
797  * queue before disabling RX on the device.
798  */
799 static inline void
800 unlink_vmdq(struct vhost_dev *vdev)
801 {
802         unsigned i = 0;
803         unsigned rx_count;
804         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
805
806         if (vdev->ready == DEVICE_RX) {
807                 /* clear MAC and VLAN settings */
808                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
809                 for (i = 0; i < 6; i++)
810                         vdev->mac_address.addr_bytes[i] = 0;
811
812                 vdev->vlan_tag = 0;
813
814                 /* Clear out the receive buffers */
815                 rx_count = rte_eth_rx_burst(ports[0],
816                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
817
818                 while (rx_count) {
819                         for (i = 0; i < rx_count; i++)
820                                 rte_pktmbuf_free(pkts_burst[i]);
821
822                         rx_count = rte_eth_rx_burst(ports[0],
823                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
824                 }
825
826                 vdev->ready = DEVICE_MAC_LEARNING;
827         }
828 }
829
830 static inline void
831 free_pkts(struct rte_mbuf **pkts, uint16_t n)
832 {
833         while (n--)
834                 rte_pktmbuf_free(pkts[n]);
835 }
836
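/*
 * Reap enqueue copies that the async channel has completed: free the
 * returned mbufs and decrement the device's in-flight packet counter.
 */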
837 static __rte_always_inline void
838 complete_async_pkts(struct vhost_dev *vdev)
839 {
840         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
841         uint16_t complete_count;
842
843         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
844                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
845         if (complete_count) {
846                 free_pkts(p_cpl, complete_count);
847                 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
848         }
849
850 }
851
852 static __rte_always_inline void
853 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
854             struct rte_mbuf *m)
855 {
856         uint16_t ret;
857
858         if (builtin_net_driver) {
859                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
860         } else {
861                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
862         }
863
864         if (enable_stats) {
865                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
866                                 __ATOMIC_SEQ_CST);
867                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
868                                 __ATOMIC_SEQ_CST);
869                 src_vdev->stats.tx_total++;
870                 src_vdev->stats.tx += ret;
871         }
872 }
873
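/*
 * Flush the per-lcore TX buffer of a vhost device through the builtin net
 * driver, the async enqueue path or the plain sync enqueue path. On the sync
 * path the buffered mbufs are freed once the burst returns; on the async path
 * only the mbufs that failed to enqueue are freed here, the rest are freed
 * when their copies complete.
 */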
874 static __rte_always_inline void
875 drain_vhost(struct vhost_dev *vdev)
876 {
877         uint16_t ret;
878         uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
879         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
880         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
881
882         if (builtin_net_driver) {
883                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
884         } else if (async_vhost_driver) {
885                 uint16_t enqueue_fail = 0;
886
887                 complete_async_pkts(vdev);
888                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
889                 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
890
891                 enqueue_fail = nr_xmit - ret;
892                 if (enqueue_fail)
893                         free_pkts(&m[ret], nr_xmit - ret);
894         } else {
895                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
896                                                 m, nr_xmit);
897         }
898
899         if (enable_stats) {
900                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
901                                 __ATOMIC_SEQ_CST);
902                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
903                                 __ATOMIC_SEQ_CST);
904         }
905
906         if (!async_vhost_driver)
907                 free_pkts(m, nr_xmit);
908 }
909
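/*
 * Flush every per-lcore vhost TX buffer whose packets have been pending for
 * longer than MBUF_TABLE_DRAIN_TSC cycles.
 */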
910 static __rte_always_inline void
911 drain_vhost_table(void)
912 {
913         uint16_t lcore_id = rte_lcore_id();
914         struct vhost_bufftable *vhost_txq;
915         struct vhost_dev *vdev;
916         uint64_t cur_tsc;
917
918         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
919                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
920                                                 + vdev->vid];
921
922                 cur_tsc = rte_rdtsc();
923                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
924                                 > MBUF_TABLE_DRAIN_TSC)) {
925                         RTE_LOG_DP(DEBUG, VHOST_DATA,
926                                 "Vhost TX queue drained after timeout with burst size %u\n",
927                                 vhost_txq->len);
928                         drain_vhost(vdev);
929                         vhost_txq->len = 0;
930                         vhost_txq->pre_tsc = cur_tsc;
931                 }
932         }
933 }
934
935 /*
936  * Check if the packet destination MAC address is for a local device. If so then put
937  * the packet on that device's RX queue. If not then return.
938  */
939 static __rte_always_inline int
940 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
941 {
942         struct rte_ether_hdr *pkt_hdr;
943         struct vhost_dev *dst_vdev;
944         struct vhost_bufftable *vhost_txq;
945         uint16_t lcore_id = rte_lcore_id();
946         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
947
948         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
949         if (!dst_vdev)
950                 return -1;
951
952         if (vdev->vid == dst_vdev->vid) {
953                 RTE_LOG_DP(DEBUG, VHOST_DATA,
954                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
955                         vdev->vid);
956                 return 0;
957         }
958
959         RTE_LOG_DP(DEBUG, VHOST_DATA,
960                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
961
962         if (unlikely(dst_vdev->remove)) {
963                 RTE_LOG_DP(DEBUG, VHOST_DATA,
964                         "(%d) device is marked for removal\n", dst_vdev->vid);
965                 return 0;
966         }
967
968         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
969         vhost_txq->m_table[vhost_txq->len++] = m;
970
971         if (enable_stats) {
972                 vdev->stats.tx_total++;
973                 vdev->stats.tx++;
974         }
975
976         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
977                 drain_vhost(dst_vdev);
978                 vhost_txq->len = 0;
979                 vhost_txq->pre_tsc = rte_rdtsc();
980         }
981         return 0;
982 }
983
984 /*
985  * Check if the destination MAC of a packet is one local VM,
986  * and get its vlan tag, and offset if it is.
987  */
988 static __rte_always_inline int
989 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
990         uint32_t *offset, uint16_t *vlan_tag)
991 {
992         struct vhost_dev *dst_vdev;
993         struct rte_ether_hdr *pkt_hdr =
994                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
995
996         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
997         if (!dst_vdev)
998                 return 0;
999
1000         if (vdev->vid == dst_vdev->vid) {
1001                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1002                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1003                         vdev->vid);
1004                 return -1;
1005         }
1006
1007         /*
1008          * HW VLAN strip reduces the packet length by the
1009          * length of the VLAN tag, so the packet length needs
1010          * to be restored by adding it back.
1011          */
1012         *offset  = VLAN_HLEN;
1013         *vlan_tag = vlan_tags[vdev->vid];
1014
1015         RTE_LOG_DP(DEBUG, VHOST_DATA,
1016                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1017                 vdev->vid, dst_vdev->vid, *vlan_tag);
1018
1019         return 0;
1020 }
1021
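/*
 * Prepare a packet received via LRO for TSO transmission: parse the header
 * lengths, set the TSO and checksum offload flags and seed the TCP checksum
 * with the pseudo-header checksum.
 */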
1022 static void virtio_tx_offload(struct rte_mbuf *m)
1023 {
1024         struct rte_net_hdr_lens hdr_lens;
1025         struct rte_ipv4_hdr *ipv4_hdr;
1026         struct rte_tcp_hdr *tcp_hdr;
1027         uint32_t ptype;
1028         void *l3_hdr;
1029
1030         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1031         m->l2_len = hdr_lens.l2_len;
1032         m->l3_len = hdr_lens.l3_len;
1033         m->l4_len = hdr_lens.l4_len;
1034
1035         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1036         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1037                 m->l2_len + m->l3_len);
1038
1039         m->ol_flags |= PKT_TX_TCP_SEG;
1040         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1041                 m->ol_flags |= PKT_TX_IPV4;
1042                 m->ol_flags |= PKT_TX_IP_CKSUM;
1043                 ipv4_hdr = l3_hdr;
1044                 ipv4_hdr->hdr_checksum = 0;
1045                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1046         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1047                 m->ol_flags |= PKT_TX_IPV6;
1048                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1049         }
1050 }
1051
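/*
 * Transmit the buffered packets on the physical port and free any mbufs the
 * NIC did not accept.
 */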
1052 static __rte_always_inline void
1053 do_drain_mbuf_table(struct mbuf_table *tx_q)
1054 {
1055         uint16_t count;
1056
1057         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1058                                  tx_q->m_table, tx_q->len);
1059         if (unlikely(count < tx_q->len))
1060                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1061
1062         tx_q->len = 0;
1063 }
1064
1065 /*
1066  * This function routes the TX packet to the correct interface. This
1067  * may be a local device or the physical port.
1068  */
1069 static __rte_always_inline void
1070 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1071 {
1072         struct mbuf_table *tx_q;
1073         unsigned offset = 0;
1074         const uint16_t lcore_id = rte_lcore_id();
1075         struct rte_ether_hdr *nh;
1076
1077
1078         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1079         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1080                 struct vhost_dev *vdev2;
1081
1082                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1083                         if (vdev2 != vdev)
1084                                 sync_virtio_xmit(vdev2, vdev, m);
1085                 }
1086                 goto queue2nic;
1087         }
1088
1089         /* check if destination is a local VM */
1090         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1091                 return;
1092
1093         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1094                 if (unlikely(find_local_dest(vdev, m, &offset,
1095                                              &vlan_tag) != 0)) {
1096                         rte_pktmbuf_free(m);
1097                         return;
1098                 }
1099         }
1100
1101         RTE_LOG_DP(DEBUG, VHOST_DATA,
1102                 "(%d) TX: MAC address is external\n", vdev->vid);
1103
1104 queue2nic:
1105
1106         /* Add packet to the port tx queue */
1107         tx_q = &lcore_tx_queue[lcore_id];
1108
1109         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1110         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1111                 /* Guest has inserted the vlan tag. */
1112                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1113                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1114                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1115                         (vh->vlan_tci != vlan_tag_be))
1116                         vh->vlan_tci = vlan_tag_be;
1117         } else {
1118                 m->ol_flags |= PKT_TX_VLAN_PKT;
1119
1120                 /*
1121                  * Find the right seg to adjust the data len when offset is
1122                  * bigger than tail room size.
1123                  */
1124                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1125                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1126                                 m->data_len += offset;
1127                         else {
1128                                 struct rte_mbuf *seg = m;
1129
1130                                 while ((seg->next != NULL) &&
1131                                         (offset > rte_pktmbuf_tailroom(seg)))
1132                                         seg = seg->next;
1133
1134                                 seg->data_len += offset;
1135                         }
1136                         m->pkt_len += offset;
1137                 }
1138
1139                 m->vlan_tci = vlan_tag;
1140         }
1141
1142         if (m->ol_flags & PKT_RX_LRO)
1143                 virtio_tx_offload(m);
1144
1145         tx_q->m_table[tx_q->len++] = m;
1146         if (enable_stats) {
1147                 vdev->stats.tx_total++;
1148                 vdev->stats.tx++;
1149         }
1150
1151         if (unlikely(tx_q->len == MAX_PKT_BURST))
1152                 do_drain_mbuf_table(tx_q);
1153 }
1154
1155
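/*
 * Periodically flush the per-lcore NIC TX buffer so packets do not sit in it
 * for longer than MBUF_TABLE_DRAIN_TSC cycles.
 */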
1156 static __rte_always_inline void
1157 drain_mbuf_table(struct mbuf_table *tx_q)
1158 {
1159         static uint64_t prev_tsc;
1160         uint64_t cur_tsc;
1161
1162         if (tx_q->len == 0)
1163                 return;
1164
1165         cur_tsc = rte_rdtsc();
1166         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1167                 prev_tsc = cur_tsc;
1168
1169                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1170                         "TX queue drained after timeout with burst size %u\n",
1171                         tx_q->len);
1172                 do_drain_mbuf_table(tx_q);
1173         }
1174 }
1175
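/*
 * Receive a burst from the VMDq RX queue bound to this vhost device and
 * enqueue it into the guest's RX virtqueue, via the builtin, async or sync
 * vhost enqueue path.
 */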
1176 static __rte_always_inline void
1177 drain_eth_rx(struct vhost_dev *vdev)
1178 {
1179         uint16_t rx_count, enqueue_count;
1180         struct rte_mbuf *pkts[MAX_PKT_BURST];
1181
1182         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1183                                     pkts, MAX_PKT_BURST);
1184
1185         if (!rx_count)
1186                 return;
1187
1188         /*
1189          * When "enable_retry" is set, wait and retry when there are
1190          * not enough free slots in the queue to hold @rx_count packets,
1191          * to reduce packet loss.
1192          */
1193         if (enable_retry &&
1194             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1195                         VIRTIO_RXQ))) {
1196                 uint32_t retry;
1197
1198                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1199                         rte_delay_us(burst_rx_delay_time);
1200                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1201                                         VIRTIO_RXQ))
1202                                 break;
1203                 }
1204         }
1205
1206         if (builtin_net_driver) {
1207                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1208                                                 pkts, rx_count);
1209         } else if (async_vhost_driver) {
1210                 uint16_t enqueue_fail = 0;
1211
1212                 complete_async_pkts(vdev);
1213                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1214                                         VIRTIO_RXQ, pkts, rx_count);
1215                 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1216
1217                 enqueue_fail = rx_count - enqueue_count;
1218                 if (enqueue_fail)
1219                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1220
1221         } else {
1222                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1223                                                 pkts, rx_count);
1224         }
1225
1226         if (enable_stats) {
1227                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1228                                 __ATOMIC_SEQ_CST);
1229                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1230                                 __ATOMIC_SEQ_CST);
1231         }
1232
1233         if (!async_vhost_driver)
1234                 free_pkts(pkts, rx_count);
1235 }
1236
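/*
 * Dequeue a burst from the guest's TX virtqueue and route each packet to a
 * local vhost device or out through the physical port.
 */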
1237 static __rte_always_inline void
1238 drain_virtio_tx(struct vhost_dev *vdev)
1239 {
1240         struct rte_mbuf *pkts[MAX_PKT_BURST];
1241         uint16_t count;
1242         uint16_t i;
1243
1244         if (builtin_net_driver) {
1245                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1246                                         pkts, MAX_PKT_BURST);
1247         } else {
1248                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1249                                         mbuf_pool, pkts, MAX_PKT_BURST);
1250         }
1251
1252         /* setup VMDq for the first packet */
1253         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1254                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1255                         free_pkts(pkts, count);
1256         }
1257
1258         for (i = 0; i < count; ++i)
1259                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1260 }
1261
1262 /*
1263  * Main function of vhost-switch. It basically does:
1264  *
1265  * for each vhost device {
1266  *    - drain_eth_rx()
1267  *
1268  *      Which drains the host eth Rx queue linked to the vhost device,
1269  *      and delivers all of them to the guest virtio Rx ring associated with
1270  *      this vhost device.
1271  *
1272  *    - drain_virtio_tx()
1273  *
1274  *      Which drains the guest virtio Tx queue and delivers all of them
1275  *      to the target, which could be another vhost device, or the
1276  *      physical eth dev. The route is done in function "virtio_tx_route".
1277  * }
1278  */
1279 static int
1280 switch_worker(void *arg __rte_unused)
1281 {
1282         unsigned i;
1283         unsigned lcore_id = rte_lcore_id();
1284         struct vhost_dev *vdev;
1285         struct mbuf_table *tx_q;
1286
1287         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1288
1289         tx_q = &lcore_tx_queue[lcore_id];
1290         for (i = 0; i < rte_lcore_count(); i++) {
1291                 if (lcore_ids[i] == lcore_id) {
1292                         tx_q->txq_id = i;
1293                         break;
1294                 }
1295         }
1296
1297         while(1) {
1298                 drain_mbuf_table(tx_q);
1299                 drain_vhost_table();
1300                 /*
1301                  * Inform the configuration core that we have exited the
1302                  * linked list and that no devices are in use if requested.
1303                  */
1304                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1305                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1306
1307                 /*
1308                  * Process vhost devices
1309                  */
1310                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1311                               lcore_vdev_entry) {
1312                         if (unlikely(vdev->remove)) {
1313                                 unlink_vmdq(vdev);
1314                                 vdev->ready = DEVICE_SAFE_REMOVE;
1315                                 continue;
1316                         }
1317
1318                         if (likely(vdev->ready == DEVICE_RX))
1319                                 drain_eth_rx(vdev);
1320
1321                         if (likely(!vdev->remove))
1322                                 drain_virtio_tx(vdev);
1323                 }
1324         }
1325
1326         return 0;
1327 }
1328
1329 /*
1330  * Remove a device from the specific data core linked list and from the
1331  * main linked list. Synchronization occurs through the use of the
1332  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1333  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1334  */
1335 static void
1336 destroy_device(int vid)
1337 {
1338         struct vhost_dev *vdev = NULL;
1339         int lcore;
1340         uint16_t i;
1341
1342         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1343                 if (vdev->vid == vid)
1344                         break;
1345         }
1346         if (!vdev)
1347                 return;
1348         /* set the remove flag. */
1349         vdev->remove = 1;
1350         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1351                 rte_pause();
1352         }
1353
1354         for (i = 0; i < RTE_MAX_LCORE; i++)
1355                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1356
1357         if (builtin_net_driver)
1358                 vs_vhost_net_remove(vdev);
1359
1360         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1361                      lcore_vdev_entry);
1362         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1363
1364
1365         /* Set the dev_removal_flag on each lcore. */
1366         RTE_LCORE_FOREACH_WORKER(lcore)
1367                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1368
1369         /*
1370          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1371          * we can be sure that they can no longer access the device removed
1372          * from the linked lists and that the devices are no longer in use.
1373          */
1374         RTE_LCORE_FOREACH_WORKER(lcore) {
1375                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1376                         rte_pause();
1377         }
1378
1379         lcore_info[vdev->coreid].device_num--;
1380
1381         RTE_LOG(INFO, VHOST_DATA,
1382                 "(%d) device has been removed from data core\n",
1383                 vdev->vid);
1384
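        /*
         * Wait for all in-flight async copies to complete, then unregister
         * the async channel for this device.
         */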
1385         if (async_vhost_driver) {
1386                 uint16_t n_pkt = 0;
1387                 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1388
1389                 while (vdev->pkts_inflight) {
1390                         n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1391                                                 m_cpl, vdev->pkts_inflight);
1392                         free_pkts(m_cpl, n_pkt);
1393                         __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1394                 }
1395
1396                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1397         }
1398
1399         rte_free(vdev);
1400 }
1401
1402 /*
1403  * A new device is added to a data core. First the device is added to the main linked list
1404  * and then allocated to a specific data core.
1405  */
1406 static int
1407 new_device(int vid)
1408 {
1409         int lcore, core_add = 0;
1410         uint16_t i;
1411         uint32_t device_num_min = num_devices;
1412         struct vhost_dev *vdev;
1413         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1414         if (vdev == NULL) {
1415                 RTE_LOG(INFO, VHOST_DATA,
1416                         "(%d) couldn't allocate memory for vhost dev\n",
1417                         vid);
1418                 return -1;
1419         }
1420         vdev->vid = vid;
1421
1422         for (i = 0; i < RTE_MAX_LCORE; i++) {
1423                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1424                         = rte_zmalloc("vhost bufftable",
1425                                 sizeof(struct vhost_bufftable),
1426                                 RTE_CACHE_LINE_SIZE);
1427
1428                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1429                         RTE_LOG(INFO, VHOST_DATA,
1430                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1431                         return -1;
1432                 }
1433         }
1434
1435         if (builtin_net_driver)
1436                 vs_vhost_net_setup(vdev);
1437
1438         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1439         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1440
1441         /* reset ready flag */
1442         vdev->ready = DEVICE_MAC_LEARNING;
1443         vdev->remove = 0;
1444
1445         /* Find a suitable lcore to add the device. */
1446         RTE_LCORE_FOREACH_WORKER(lcore) {
1447                 if (lcore_info[lcore].device_num < device_num_min) {
1448                         device_num_min = lcore_info[lcore].device_num;
1449                         core_add = lcore;
1450                 }
1451         }
1452         vdev->coreid = core_add;
1453
1454         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1455                           lcore_vdev_entry);
1456         lcore_info[vdev->coreid].device_num++;
1457
1458         /* Disable notifications. */
1459         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1460         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1461
1462         RTE_LOG(INFO, VHOST_DATA,
1463                 "(%d) device has been added to data core %d\n",
1464                 vid, vdev->coreid);
1465
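        /*
         * When async vhost is enabled and the "ioat" DMA type was requested,
         * hook the ioat callbacks into an async channel on this device's RX
         * queue so enqueue copies can be offloaded to the DMA engine.
         */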
1466         if (async_vhost_driver) {
1467                 struct rte_vhost_async_config config = {0};
1468                 struct rte_vhost_async_channel_ops channel_ops;
1469
1470                 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1471                         channel_ops.transfer_data = ioat_transfer_data_cb;
1472                         channel_ops.check_completed_copies =
1473                                 ioat_check_completed_copies_cb;
1474
1475                         config.features = RTE_VHOST_ASYNC_INORDER;
1476
1477                         return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1478                                 config, &channel_ops);
1479                 }
1480         }
1481
1482         return 0;
1483 }
1484
1485 static int
1486 vring_state_changed(int vid, uint16_t queue_id, int enable)
1487 {
1488         struct vhost_dev *vdev = NULL;
1489
1490         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1491                 if (vdev->vid == vid)
1492                         break;
1493         }
1494         if (!vdev)
1495                 return -1;
1496
1497         if (queue_id != VIRTIO_RXQ)
1498                 return 0;
1499
1500         if (async_vhost_driver) {
1501                 if (!enable) {
1502                         uint16_t n_pkt = 0;
1503                         struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1504
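                        /*
                         * The ring is being disabled: complete and free any
                         * packets still in flight on the async channel.
                         */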
1505                         while (vdev->pkts_inflight) {
1506                                 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1507                                                         m_cpl, vdev->pkts_inflight);
1508                                 free_pkts(m_cpl, n_pkt);
1509                                 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1510                         }
1511                 }
1512         }
1513
1514         return 0;
1515 }
1516
1517 /*
1518  * These callbacks allow devices to be added to the data core once
1519  * configuration has been fully completed.
1520  */
1521 static const struct vhost_device_ops virtio_net_device_ops =
1522 {
1523         .new_device =  new_device,
1524         .destroy_device = destroy_device,
1525         .vring_state_changed = vring_state_changed,
1526 };
1527
1528 /*
1529  * This thread wakes up periodically to print statistics if the user has
1530  * enabled them.
1531  */
1532 static void *
1533 print_stats(__rte_unused void *arg)
1534 {
1535         struct vhost_dev *vdev;
1536         uint64_t tx_dropped, rx_dropped;
1537         uint64_t tx, tx_total, rx, rx_total;
1538         const char clr[] = { 27, '[', '2', 'J', '\0' };
1539         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1540
1541         while (1) {
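                /* enable_stats also serves as the refresh period in seconds. */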
1542                 sleep(enable_stats);
1543
1544                 /* Clear screen and move to top left */
1545                 printf("%s%s\n", clr, top_left);
1546                 printf("Device statistics =================================\n");
1547
1548                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1549                         tx_total   = vdev->stats.tx_total;
1550                         tx         = vdev->stats.tx;
1551                         tx_dropped = tx_total - tx;
1552
1553                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1554                                 __ATOMIC_SEQ_CST);
1555                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1556                                 __ATOMIC_SEQ_CST);
1557                         rx_dropped = rx_total - rx;
1558
1559                         printf("Statistics for device %d\n"
1560                                 "-----------------------\n"
1561                                 "TX total:              %" PRIu64 "\n"
1562                                 "TX dropped:            %" PRIu64 "\n"
1563                                 "TX successful:         %" PRIu64 "\n"
1564                                 "RX total:              %" PRIu64 "\n"
1565                                 "RX dropped:            %" PRIu64 "\n"
1566                                 "RX successful:         %" PRIu64 "\n",
1567                                 vdev->vid,
1568                                 tx_total, tx_dropped, tx,
1569                                 rx_total, rx_dropped, rx);
1570                 }
1571
1572                 printf("===================================================\n");
1573
1574                 fflush(stdout);
1575         }
1576
1577         return NULL;
1578 }
1579
1580 static void
1581 unregister_drivers(int socket_num)
1582 {
1583         int i, ret;
1584
1585         for (i = 0; i < socket_num; i++) {
1586                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1587                 if (ret != 0)
1588                         RTE_LOG(ERR, VHOST_CONFIG,
1589                                 "Failed to unregister vhost driver for %s.\n",
1590                                 socket_files + i * PATH_MAX);
1591         }
1592 }
1593
1594 /* When we receive a SIGINT signal, unregister the vhost driver. */
1595 static void
1596 sigint_handler(__rte_unused int signum)
1597 {
1598         /* Unregister vhost driver. */
1599         unregister_drivers(nb_sockets);
1600
1601         exit(0);
1602 }
1603
1604 /*
1605  * While creating an mbuf pool, one key thing is to figure out how
1606  * many mbuf entries are enough for our use. FYI, here are some
1607  * guidelines:
1608  *
1609  * - Each rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1610  *
1611  * - For each switch core (a CPU core that does the packet switching),
1612  *   we also need to make some reservation for receiving the packets
1613  *   from the virtio Tx queue. How many is enough depends on the usage.
1614  *   It's normally a simple calculation like the following:
1615  *
1616  *       MAX_PKT_BURST * max packet size / mbuf size
1617  *
1618  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1619  *
1620  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1621  *   mbufs for receiving the packets from the physical NIC device.
1622  *
1623  * - We also need to make sure that, for each switch core, we have
1624  *   allocated enough mbufs to fill up the mbuf cache.
1625  */
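/*
 * A rough worked example (assuming the DPDK defaults of a 128-byte
 * RTE_PKTMBUF_HEADROOM and a 2176-byte MBUF_DATA_SIZE, i.e. a 2KB data
 * room plus headroom): with mergeable buffers the MTU below is 9000, so
 * the per-core term is (9000 + 2176) * MAX_PKT_BURST / (2176 - 128),
 * roughly 175 mbufs for a 32-packet burst, before adding nr_rx_desc and
 * applying the nr_mbuf_cache floor.
 */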
1626 static void
1627 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1628         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1629 {
1630         uint32_t nr_mbufs;
1631         uint32_t nr_mbufs_per_core;
1632         uint32_t mtu = 1500;
1633
1634         if (mergeable)
1635                 mtu = 9000;
1636         if (enable_tso)
1637                 mtu = 64 * 1024;
1638
1639         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1640                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1641         nr_mbufs_per_core += nr_rx_desc;
1642         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1643
1644         nr_mbufs  = nr_queues * nr_rx_desc;
1645         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1646         nr_mbufs *= nr_port;
1647
1648         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1649                                             nr_mbuf_cache, 0, mbuf_size,
1650                                             rte_socket_id());
1651         if (mbuf_pool == NULL)
1652                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1653 }
1654
1655 /*
1656  * Main function, does initialisation and calls the per-lcore functions.
1657  */
1658 int
1659 main(int argc, char *argv[])
1660 {
1661         unsigned lcore_id, core_id = 0;
1662         unsigned nb_ports, valid_num_ports;
1663         int ret, i;
1664         uint16_t portid;
1665         static pthread_t tid;
1666         uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1667
1668         signal(SIGINT, sigint_handler);
1669
1670         /* init EAL */
1671         ret = rte_eal_init(argc, argv);
1672         if (ret < 0)
1673                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1674         argc -= ret;
1675         argv += ret;
1676
1677         /* parse app arguments */
1678         ret = us_vhost_parse_args(argc, argv);
1679         if (ret < 0)
1680                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1681
1682         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1683                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1684
1685                 if (rte_lcore_is_enabled(lcore_id))
1686                         lcore_ids[core_id++] = lcore_id;
1687         }
1688
1689         if (rte_lcore_count() > RTE_MAX_LCORE)
1690                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1691
1692         /* Get the number of physical ports. */
1693         nb_ports = rte_eth_dev_count_avail();
1694
1695         /*
1696          * Update the global variable NUM_PORTS and the global array PORTS,
1697          * and derive VALID_NUM_PORTS from the number of ports in the system.
1698          */
1699         valid_num_ports = check_ports_num(nb_ports);
1700
1701         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1702                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1703                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1704                 return -1;
1705         }
1706
1707         /*
1708          * FIXME: here we are trying to allocate mbufs big enough for
1709          * @MAX_QUEUES, but the truth is we're never going to use that
1710          * many queues here. We probably should only do allocation for
1711          * those queues we are going to use.
1712          */
1713         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1714                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1715
1716         if (vm2vm_mode == VM2VM_HARDWARE) {
1717                 /* Enable VT loop back to let L2 switch to do it. */
1718                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1719                 RTE_LOG(DEBUG, VHOST_CONFIG,
1720                         "Enable loop back for L2 switch in vmdq.\n");
1721         }
1722
1723         /* initialize all ports */
1724         RTE_ETH_FOREACH_DEV(portid) {
1725                 /* skip ports that are not enabled */
1726                 if ((enabled_port_mask & (1 << portid)) == 0) {
1727                         RTE_LOG(INFO, VHOST_PORT,
1728                                 "Skipping disabled port %d\n", portid);
1729                         continue;
1730                 }
1731                 if (port_init(portid) != 0)
1732                         rte_exit(EXIT_FAILURE,
1733                                 "Cannot initialize network ports\n");
1734         }
1735
1736         /* Enable stats if the user option is set. */
1737         if (enable_stats) {
1738                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1739                                         print_stats, NULL);
1740                 if (ret < 0)
1741                         rte_exit(EXIT_FAILURE,
1742                                 "Cannot create print-stats thread\n");
1743         }
1744
1745         /* Launch all data cores. */
1746         RTE_LCORE_FOREACH_WORKER(lcore_id)
1747                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1748
1749         if (client_mode)
1750                 flags |= RTE_VHOST_USER_CLIENT;
1751
1752         /* Register vhost user driver to handle vhost messages. */
1753         for (i = 0; i < nb_sockets; i++) {
1754                 char *file = socket_files + i * PATH_MAX;
1755
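                /*
                 * The socket is registered with async copy support when the
                 * async data path is enabled; the per-queue async channel
                 * itself is registered later, in new_device().
                 */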
1756                 if (async_vhost_driver)
1757                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1758
1759                 ret = rte_vhost_driver_register(file, flags);
1760                 if (ret != 0) {
1761                         unregister_drivers(i);
1762                         rte_exit(EXIT_FAILURE,
1763                                 "vhost driver register failure.\n");
1764                 }
1765
1766                 if (builtin_net_driver)
1767                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1768
1769                 if (mergeable == 0) {
1770                         rte_vhost_driver_disable_features(file,
1771                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1772                 }
1773
1774                 if (enable_tx_csum == 0) {
1775                         rte_vhost_driver_disable_features(file,
1776                                 1ULL << VIRTIO_NET_F_CSUM);
1777                 }
1778
1779                 if (enable_tso == 0) {
1780                         rte_vhost_driver_disable_features(file,
1781                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1782                         rte_vhost_driver_disable_features(file,
1783                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1784                         rte_vhost_driver_disable_features(file,
1785                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1786                         rte_vhost_driver_disable_features(file,
1787                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1788                 }
1789
1790                 if (promiscuous) {
1791                         rte_vhost_driver_enable_features(file,
1792                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1793                 }
1794
1795                 ret = rte_vhost_driver_callback_register(file,
1796                         &virtio_net_device_ops);
1797                 if (ret != 0) {
1798                         rte_exit(EXIT_FAILURE,
1799                                 "failed to register vhost driver callbacks.\n");
1800                 }
1801
1802                 if (rte_vhost_driver_start(file) < 0) {
1803                         rte_exit(EXIT_FAILURE,
1804                                 "failed to start vhost driver.\n");
1805                 }
1806         }
1807
1808         RTE_LCORE_FOREACH_WORKER(lcore_id)
1809                 rte_eal_wait_lcore(lcore_id);
1810
1811         /* clean up the EAL */
1812         rte_eal_cleanup();
1813
1814         return 0;
1815 }