1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "ioat.h"
29 #include "main.h"
30
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37
38 #define MBUF_CACHE_SIZE 128
39 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
40
41 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
42
43 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
45
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
48
49 /* State of virtio device. */
50 #define DEVICE_MAC_LEARNING 0
51 #define DEVICE_RX                       1
52 #define DEVICE_SAFE_REMOVE      2
53
54 /* Configurable number of RX/TX ring descriptors */
55 #define RTE_TEST_RX_DESC_DEFAULT 1024
56 #define RTE_TEST_TX_DESC_DEFAULT 512
57
58 #define INVALID_PORT_ID 0xFF
59
60 /* mask of enabled ports */
61 static uint32_t enabled_port_mask = 0;
62
63 /* Promiscuous mode */
64 static uint32_t promiscuous;
65
66 /* number of devices/queues to support */
67 static uint32_t num_queues = 0;
68 static uint32_t num_devices;
69
70 static struct rte_mempool *mbuf_pool;
71 static int mergeable;
72
73 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
74 typedef enum {
75         VM2VM_DISABLED = 0,
76         VM2VM_SOFTWARE = 1,
77         VM2VM_HARDWARE = 2,
78         VM2VM_LAST
79 } vm2vm_type;
80 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
81
82 /* Enable stats. */
83 static uint32_t enable_stats = 0;
84 /* Enable retries on RX. */
85 static uint32_t enable_retry = 1;
86
87 /* Disable TX checksum offload */
88 static uint32_t enable_tx_csum;
89
90 /* Disable TSO offload */
91 static uint32_t enable_tso;
92
93 static int client_mode;
94
95 static int builtin_net_driver;
96
97 static int async_vhost_driver;
98
99 static char *dma_type;
100
101 /* Specify timeout (in microseconds) between retries on RX. */
102 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
103 /* Specify the number of retries on RX. */
104 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
105
106 /* Socket file paths. Can be set by user */
107 static char *socket_files;
108 static int nb_sockets;
109
110 /* Empty VMDq configuration structure. Filled in programmatically. */
111 static struct rte_eth_conf vmdq_conf_default = {
112         .rxmode = {
113                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
114                 .split_hdr_size = 0,
115                 /*
116                  * VLAN strip is necessary for 1G NICs such as the I350;
117                  * it fixes a bug where IPv4 forwarding in the guest cannot
118                  * forward packets from one virtio device to another.
119                  */
120                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
121         },
122
123         .txmode = {
124                 .mq_mode = ETH_MQ_TX_NONE,
125                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
126                              DEV_TX_OFFLOAD_TCP_CKSUM |
127                              DEV_TX_OFFLOAD_VLAN_INSERT |
128                              DEV_TX_OFFLOAD_MULTI_SEGS |
129                              DEV_TX_OFFLOAD_TCP_TSO),
130         },
131         .rx_adv_conf = {
132                 /*
133                  * should be overridden separately in code with
134                  * appropriate values
135                  */
136                 .vmdq_rx_conf = {
137                         .nb_queue_pools = ETH_8_POOLS,
138                         .enable_default_pool = 0,
139                         .default_pool = 0,
140                         .nb_pool_maps = 0,
141                         .pool_map = {{0, 0},},
142                 },
143         },
144 };
145
146
147 static unsigned lcore_ids[RTE_MAX_LCORE];
148 static uint16_t ports[RTE_MAX_ETHPORTS];
149 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
150 static uint16_t num_pf_queues, num_vmdq_queues;
151 static uint16_t vmdq_pool_base, vmdq_queue_base;
152 static uint16_t queues_per_pool;
153
154 const uint16_t vlan_tags[] = {
155         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
156         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
157         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
158         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
159         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
160         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
161         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
162         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
163 };
164
165 /* ethernet addresses of ports */
166 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
167
168 static struct vhost_dev_tailq_list vhost_dev_list =
169         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
170
171 static struct lcore_info lcore_info[RTE_MAX_LCORE];
172
173 /* Used for queueing bursts of TX packets. */
174 struct mbuf_table {
175         unsigned len;
176         unsigned txq_id;
177         struct rte_mbuf *m_table[MAX_PKT_BURST];
178 };
179
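/*
 * Per-core, per-vhost-device TX buffer: packets destined for a vhost
 * device are collected here together with the TSC timestamp of the
 * last drain.
 */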
180 struct vhost_bufftable {
181         uint32_t len;
182         uint64_t pre_tsc;
183         struct rte_mbuf *m_table[MAX_PKT_BURST];
184 };
185
186 /* TX queue for each data core. */
187 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
188
189 /*
190  * Vhost TX buffer for each data core.
191  * Every data core maintains a TX buffer for every vhost device,
192  * which is used to batch packet enqueues for higher performance.
193  */
194 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
195
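/* Number of TSC cycles corresponding to BURST_TX_DRAIN_US microseconds. */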
196 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
197                                  / US_PER_S * BURST_TX_DRAIN_US)
198 #define VLAN_HLEN       4
199
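/*
 * Open the DMA channels described by the --dmas argument. Only the
 * "ioat" dma_type is supported; any other type is rejected.
 */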
200 static inline int
201 open_dma(const char *value)
202 {
203         if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
204                 return open_ioat(value);
205
206         return -1;
207 }
208
209 /*
210  * Builds up the correct configuration for VMDQ VLAN pool map
211  * according to the pool & queue limits.
212  */
213 static inline int
214 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
215 {
216         struct rte_eth_vmdq_rx_conf conf;
217         struct rte_eth_vmdq_rx_conf *def_conf =
218                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
219         unsigned i;
220
221         memset(&conf, 0, sizeof(conf));
222         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
223         conf.nb_pool_maps = num_devices;
224         conf.enable_loop_back = def_conf->enable_loop_back;
225         conf.rx_mode = def_conf->rx_mode;
226
227         for (i = 0; i < conf.nb_pool_maps; i++) {
228                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
229                 conf.pool_map[i].pools = (1UL << i);
230         }
231
232         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
233         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
234                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
235         return 0;
236 }
237
238 /*
239  * Initialises a given port using global settings and with the rx buffers
240  * coming from the mbuf_pool passed as parameter
241  */
242 static inline int
243 port_init(uint16_t port)
244 {
245         struct rte_eth_dev_info dev_info;
246         struct rte_eth_conf port_conf;
247         struct rte_eth_rxconf *rxconf;
248         struct rte_eth_txconf *txconf;
249         int16_t rx_rings, tx_rings;
250         uint16_t rx_ring_size, tx_ring_size;
251         int retval;
252         uint16_t q;
253
254         /* The max pool number from dev_info is used to validate the number of pools specified on the command line. */
255         retval = rte_eth_dev_info_get(port, &dev_info);
256         if (retval != 0) {
257                 RTE_LOG(ERR, VHOST_PORT,
258                         "Error during getting device (port %u) info: %s\n",
259                         port, strerror(-retval));
260
261                 return retval;
262         }
263
264         rxconf = &dev_info.default_rxconf;
265         txconf = &dev_info.default_txconf;
266         rxconf->rx_drop_en = 1;
267
268         /* Configure the number of supported virtio devices based on VMDQ limits. */
269         num_devices = dev_info.max_vmdq_pools;
270
271         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
272         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
273
274         tx_rings = (uint16_t)rte_lcore_count();
275
276         /* Get port configuration. */
277         retval = get_eth_conf(&port_conf, num_devices);
278         if (retval < 0)
279                 return retval;
280         /* NIC queues are divided into PF queues and VMDq queues. */
281         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
282         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
283         num_vmdq_queues = num_devices * queues_per_pool;
284         num_queues = num_pf_queues + num_vmdq_queues;
285         vmdq_queue_base = dev_info.vmdq_queue_base;
286         vmdq_pool_base  = dev_info.vmdq_pool_base;
287         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
288                 num_pf_queues, num_devices, queues_per_pool);
289
290         if (!rte_eth_dev_is_valid_port(port))
291                 return -1;
292
293         rx_rings = (uint16_t)dev_info.max_rx_queues;
294         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
295                 port_conf.txmode.offloads |=
296                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
297         /* Configure ethernet device. */
298         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
299         if (retval != 0) {
300                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
301                         port, strerror(-retval));
302                 return retval;
303         }
304
305         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
306                 &tx_ring_size);
307         if (retval != 0) {
308                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
309                         "for port %u: %s.\n", port, strerror(-retval));
310                 return retval;
311         }
312         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
313                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
314                         "for Rx queues on port %u.\n", port);
315                 return -1;
316         }
317
318         /* Set up the queues. */
319         rxconf->offloads = port_conf.rxmode.offloads;
320         for (q = 0; q < rx_rings; q ++) {
321                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
322                                                 rte_eth_dev_socket_id(port),
323                                                 rxconf,
324                                                 mbuf_pool);
325                 if (retval < 0) {
326                         RTE_LOG(ERR, VHOST_PORT,
327                                 "Failed to setup rx queue %u of port %u: %s.\n",
328                                 q, port, strerror(-retval));
329                         return retval;
330                 }
331         }
332         txconf->offloads = port_conf.txmode.offloads;
333         for (q = 0; q < tx_rings; q ++) {
334                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
335                                                 rte_eth_dev_socket_id(port),
336                                                 txconf);
337                 if (retval < 0) {
338                         RTE_LOG(ERR, VHOST_PORT,
339                                 "Failed to setup tx queue %u of port %u: %s.\n",
340                                 q, port, strerror(-retval));
341                         return retval;
342                 }
343         }
344
345         /* Start the device. */
346         retval  = rte_eth_dev_start(port);
347         if (retval < 0) {
348                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
349                         port, strerror(-retval));
350                 return retval;
351         }
352
353         if (promiscuous) {
354                 retval = rte_eth_promiscuous_enable(port);
355                 if (retval != 0) {
356                         RTE_LOG(ERR, VHOST_PORT,
357                                 "Failed to enable promiscuous mode on port %u: %s\n",
358                                 port, rte_strerror(-retval));
359                         return retval;
360                 }
361         }
362
363         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
364         if (retval < 0) {
365                 RTE_LOG(ERR, VHOST_PORT,
366                         "Failed to get MAC address on port %u: %s\n",
367                         port, rte_strerror(-retval));
368                 return retval;
369         }
370
371         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
372         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
373                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
374                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
375
376         return 0;
377 }
378
379 /*
380  * Set socket file path.
381  */
382 static int
383 us_vhost_parse_socket_path(const char *q_arg)
384 {
385         char *old;
386
387         /* Reject paths that do not fit in PATH_MAX. */
388         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
389                 return -1;
390
391         old = socket_files;
392         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
393         if (socket_files == NULL) {
394                 free(old);
395                 return -1;
396         }
397
398         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
399         nb_sockets++;
400
401         return 0;
402 }
403
404 /*
405  * Parse the portmask provided at run time.
406  */
407 static int
408 parse_portmask(const char *portmask)
409 {
410         char *end = NULL;
411         unsigned long pm;
412
413         errno = 0;
414
415         /* parse hexadecimal string */
416         pm = strtoul(portmask, &end, 16);
417         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
418                 return 0;
419
420         return pm;
421
422 }
423
424 /*
425  * Parse num options at run time.
426  */
427 static int
428 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
429 {
430         char *end = NULL;
431         unsigned long num;
432
433         errno = 0;
434
435         /* parse unsigned int string */
436         num = strtoul(q_arg, &end, 10);
437         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
438                 return -1;
439
440         if (num > max_valid_value)
441                 return -1;
442
443         return num;
444
445 }
446
447 /*
448  * Display usage
449  */
450 static void
451 us_vhost_usage(const char *prgname)
452 {
453         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
454         "               --vm2vm [0|1|2]\n"
455         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
456         "               --socket-file <path>\n"
457         "               --nb-devices ND\n"
458         "               -p PORTMASK: Set mask for ports to be used by application\n"
459         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
460         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
461         "               --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
462         "               --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
463         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
464         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
465         "               --socket-file: The path of the socket file.\n"
466         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
467         "               --tso [0|1] disable/enable TCP segment offload.\n"
468         "               --client register a vhost-user socket as client mode.\n"
469         "               --dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
470         "               --dmas register dma channel for specific vhost device.\n",
471                prgname);
472 }
473
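/* Values returned by getopt_long() for the application's long options. */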
474 enum {
475 #define OPT_VM2VM               "vm2vm"
476         OPT_VM2VM_NUM = 256,
477 #define OPT_RX_RETRY            "rx-retry"
478         OPT_RX_RETRY_NUM,
479 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
480         OPT_RX_RETRY_DELAY_NUM,
481 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
482         OPT_RX_RETRY_NUMB_NUM,
483 #define OPT_MERGEABLE           "mergeable"
484         OPT_MERGEABLE_NUM,
485 #define OPT_STATS               "stats"
486         OPT_STATS_NUM,
487 #define OPT_SOCKET_FILE         "socket-file"
488         OPT_SOCKET_FILE_NUM,
489 #define OPT_TX_CSUM             "tx-csum"
490         OPT_TX_CSUM_NUM,
491 #define OPT_TSO                 "tso"
492         OPT_TSO_NUM,
493 #define OPT_CLIENT              "client"
494         OPT_CLIENT_NUM,
495 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
496         OPT_BUILTIN_NET_DRIVER_NUM,
497 #define OPT_DMA_TYPE            "dma-type"
498         OPT_DMA_TYPE_NUM,
499 #define OPT_DMAS                "dmas"
500         OPT_DMAS_NUM,
501 };
502
503 /*
504  * Parse the arguments given in the command line of the application.
505  */
506 static int
507 us_vhost_parse_args(int argc, char **argv)
508 {
509         int opt, ret;
510         int option_index;
511         unsigned i;
512         const char *prgname = argv[0];
513         static struct option long_option[] = {
514                 {OPT_VM2VM, required_argument,
515                                 NULL, OPT_VM2VM_NUM},
516                 {OPT_RX_RETRY, required_argument,
517                                 NULL, OPT_RX_RETRY_NUM},
518                 {OPT_RX_RETRY_DELAY, required_argument,
519                                 NULL, OPT_RX_RETRY_DELAY_NUM},
520                 {OPT_RX_RETRY_NUMB, required_argument,
521                                 NULL, OPT_RX_RETRY_NUMB_NUM},
522                 {OPT_MERGEABLE, required_argument,
523                                 NULL, OPT_MERGEABLE_NUM},
524                 {OPT_STATS, required_argument,
525                                 NULL, OPT_STATS_NUM},
526                 {OPT_SOCKET_FILE, required_argument,
527                                 NULL, OPT_SOCKET_FILE_NUM},
528                 {OPT_TX_CSUM, required_argument,
529                                 NULL, OPT_TX_CSUM_NUM},
530                 {OPT_TSO, required_argument,
531                                 NULL, OPT_TSO_NUM},
532                 {OPT_CLIENT, no_argument,
533                                 NULL, OPT_CLIENT_NUM},
534                 {OPT_BUILTIN_NET_DRIVER, no_argument,
535                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
536                 {OPT_DMA_TYPE, required_argument,
537                                 NULL, OPT_DMA_TYPE_NUM},
538                 {OPT_DMAS, required_argument,
539                                 NULL, OPT_DMAS_NUM},
540                 {NULL, 0, 0, 0},
541         };
542
543         /* Parse command line */
544         while ((opt = getopt_long(argc, argv, "p:P",
545                         long_option, &option_index)) != EOF) {
546                 switch (opt) {
547                 /* Portmask */
548                 case 'p':
549                         enabled_port_mask = parse_portmask(optarg);
550                         if (enabled_port_mask == 0) {
551                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
552                                 us_vhost_usage(prgname);
553                                 return -1;
554                         }
555                         break;
556
557                 case 'P':
558                         promiscuous = 1;
559                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
560                                 ETH_VMDQ_ACCEPT_BROADCAST |
561                                 ETH_VMDQ_ACCEPT_MULTICAST;
562                         break;
563
564                 case OPT_VM2VM_NUM:
565                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
566                         if (ret == -1) {
567                                 RTE_LOG(INFO, VHOST_CONFIG,
568                                         "Invalid argument for "
569                                         "vm2vm [0|1|2]\n");
570                                 us_vhost_usage(prgname);
571                                 return -1;
572                         }
573                         vm2vm_mode = (vm2vm_type)ret;
574                         break;
575
576                 case OPT_RX_RETRY_NUM:
577                         ret = parse_num_opt(optarg, 1);
578                         if (ret == -1) {
579                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
580                                 us_vhost_usage(prgname);
581                                 return -1;
582                         }
583                         enable_retry = ret;
584                         break;
585
586                 case OPT_TX_CSUM_NUM:
587                         ret = parse_num_opt(optarg, 1);
588                         if (ret == -1) {
589                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
590                                 us_vhost_usage(prgname);
591                                 return -1;
592                         }
593                         enable_tx_csum = ret;
594                         break;
595
596                 case OPT_TSO_NUM:
597                         ret = parse_num_opt(optarg, 1);
598                         if (ret == -1) {
599                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
600                                 us_vhost_usage(prgname);
601                                 return -1;
602                         }
603                         enable_tso = ret;
604                         break;
605
606                 case OPT_RX_RETRY_DELAY_NUM:
607                         ret = parse_num_opt(optarg, INT32_MAX);
608                         if (ret == -1) {
609                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
610                                 us_vhost_usage(prgname);
611                                 return -1;
612                         }
613                         burst_rx_delay_time = ret;
614                         break;
615
616                 case OPT_RX_RETRY_NUMB_NUM:
617                         ret = parse_num_opt(optarg, INT32_MAX);
618                         if (ret == -1) {
619                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
620                                 us_vhost_usage(prgname);
621                                 return -1;
622                         }
623                         burst_rx_retry_num = ret;
624                         break;
625
626                 case OPT_MERGEABLE_NUM:
627                         ret = parse_num_opt(optarg, 1);
628                         if (ret == -1) {
629                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
630                                 us_vhost_usage(prgname);
631                                 return -1;
632                         }
633                         mergeable = !!ret;
634                         if (ret)
635                                 vmdq_conf_default.rxmode.mtu = MAX_MTU;
636                         break;
637
638                 case OPT_STATS_NUM:
639                         ret = parse_num_opt(optarg, INT32_MAX);
640                         if (ret == -1) {
641                                 RTE_LOG(INFO, VHOST_CONFIG,
642                                         "Invalid argument for stats [0..N]\n");
643                                 us_vhost_usage(prgname);
644                                 return -1;
645                         }
646                         enable_stats = ret;
647                         break;
648
649                 /* Set socket file path. */
650                 case OPT_SOCKET_FILE_NUM:
651                         if (us_vhost_parse_socket_path(optarg) == -1) {
652                                 RTE_LOG(INFO, VHOST_CONFIG,
653                                 "Invalid argument for socket name (Max %d characters)\n",
654                                 PATH_MAX);
655                                 us_vhost_usage(prgname);
656                                 return -1;
657                         }
658                         break;
659
660                 case OPT_DMA_TYPE_NUM:
661                         dma_type = optarg;
662                         break;
663
664                 case OPT_DMAS_NUM:
665                         if (open_dma(optarg) == -1) {
666                                 RTE_LOG(INFO, VHOST_CONFIG,
667                                         "Wrong DMA args\n");
668                                 us_vhost_usage(prgname);
669                                 return -1;
670                         }
671                         async_vhost_driver = 1;
672                         break;
673
674                 case OPT_CLIENT_NUM:
675                         client_mode = 1;
676                         break;
677
678                 case OPT_BUILTIN_NET_DRIVER_NUM:
679                         builtin_net_driver = 1;
680                         break;
681
682                 /* Invalid option - print options. */
683                 default:
684                         us_vhost_usage(prgname);
685                         return -1;
686                 }
687         }
688
689         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
690                 if (enabled_port_mask & (1 << i))
691                         ports[num_ports++] = i;
692         }
693
694         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
695                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
696                         "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
697                 return -1;
698         }
699
700         return 0;
701 }
702
703 /*
704  * Update the global variable num_ports and the ports array according to the
705  * number of ports in the system, and return the number of valid ports.
706  */
707 static unsigned check_ports_num(unsigned nb_ports)
708 {
709         unsigned valid_num_ports = num_ports;
710         unsigned portid;
711
712         if (num_ports > nb_ports) {
713                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
714                         num_ports, nb_ports);
715                 num_ports = nb_ports;
716         }
717
718         for (portid = 0; portid < num_ports; portid ++) {
719                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
720                         RTE_LOG(INFO, VHOST_PORT,
721                                 "\nSpecified port ID(%u) is not valid\n",
722                                 ports[portid]);
723                         ports[portid] = INVALID_PORT_ID;
724                         valid_num_ports--;
725                 }
726         }
727         return valid_num_ports;
728 }
729
730 static __rte_always_inline struct vhost_dev *
731 find_vhost_dev(struct rte_ether_addr *mac)
732 {
733         struct vhost_dev *vdev;
734
735         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
736                 if (vdev->ready == DEVICE_RX &&
737                     rte_is_same_ether_addr(mac, &vdev->mac_address))
738                         return vdev;
739         }
740
741         return NULL;
742 }
743
744 /*
745  * This function learns the MAC address of the device and registers it,
746  * together with a VLAN tag, with a VMDq pool.
747  */
748 static int
749 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
750 {
751         struct rte_ether_hdr *pkt_hdr;
752         int i, ret;
753
754         /* Learn MAC address of guest device from packet */
755         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
756
757         if (find_vhost_dev(&pkt_hdr->src_addr)) {
758                 RTE_LOG(ERR, VHOST_DATA,
759                         "(%d) device is using a registered MAC!\n",
760                         vdev->vid);
761                 return -1;
762         }
763
764         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
765                 vdev->mac_address.addr_bytes[i] =
766                         pkt_hdr->src_addr.addr_bytes[i];
767
768         /* vlan_tag currently uses the device_id. */
769         vdev->vlan_tag = vlan_tags[vdev->vid];
770
771         /* Print out VMDQ registration info. */
772         RTE_LOG(INFO, VHOST_DATA,
773                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
774                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
775                 vdev->vlan_tag);
776
777         /* Register the MAC address. */
778         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
779                                 (uint32_t)vdev->vid + vmdq_pool_base);
780         if (ret)
781                 RTE_LOG(ERR, VHOST_DATA,
782                         "(%d) failed to add device MAC address to VMDQ\n",
783                         vdev->vid);
784
785         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
786
787         /* Set device as ready for RX. */
788         vdev->ready = DEVICE_RX;
789
790         return 0;
791 }
792
793 /*
794  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
795  * queue before disabling RX on the device.
796  */
797 static inline void
798 unlink_vmdq(struct vhost_dev *vdev)
799 {
800         unsigned i = 0;
801         unsigned rx_count;
802         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
803
804         if (vdev->ready == DEVICE_RX) {
805                 /* Clear MAC and VLAN settings. */
806                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
807                 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
808                         vdev->mac_address.addr_bytes[i] = 0;
809
810                 vdev->vlan_tag = 0;
811
812                 /* Clear out the receive buffers. */
813                 rx_count = rte_eth_rx_burst(ports[0],
814                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
815
816                 while (rx_count) {
817                         for (i = 0; i < rx_count; i++)
818                                 rte_pktmbuf_free(pkts_burst[i]);
819
820                         rx_count = rte_eth_rx_burst(ports[0],
821                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
822                 }
823
824                 vdev->ready = DEVICE_MAC_LEARNING;
825         }
826 }
827
828 static inline void
829 free_pkts(struct rte_mbuf **pkts, uint16_t n)
830 {
831         while (n--)
832                 rte_pktmbuf_free(pkts[n]);
833 }
834
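/*
 * Poll the async channel for enqueue copies that have completed, free
 * the completed mbufs and decrement the in-flight packet counter.
 */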
835 static __rte_always_inline void
836 complete_async_pkts(struct vhost_dev *vdev)
837 {
838         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
839         uint16_t complete_count;
840
841         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
842                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
843         if (complete_count) {
844                 free_pkts(p_cpl, complete_count);
845                 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
846         }
847
848 }
849
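/*
 * Synchronously enqueue a single packet into the RX ring of another
 * vhost device (used for broadcast traffic between vhost devices).
 */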
850 static __rte_always_inline void
851 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
852             struct rte_mbuf *m)
853 {
854         uint16_t ret;
855
856         if (builtin_net_driver) {
857                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
858         } else {
859                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
860         }
861
862         if (enable_stats) {
863                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
864                                 __ATOMIC_SEQ_CST);
865                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
866                                 __ATOMIC_SEQ_CST);
867                 src_vdev->stats.tx_total++;
868                 src_vdev->stats.tx += ret;
869         }
870 }
871
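/*
 * Flush the per-core TX buffer of a vhost device into its RX ring,
 * using the builtin net driver, the async data path or the standard
 * enqueue API, and update the statistics.
 */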
872 static __rte_always_inline void
873 drain_vhost(struct vhost_dev *vdev)
874 {
875         uint16_t ret;
876         uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
877         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
878         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
879
880         if (builtin_net_driver) {
881                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
882         } else if (async_vhost_driver) {
883                 uint16_t enqueue_fail = 0;
884
885                 complete_async_pkts(vdev);
886                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
887                 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
888
889                 enqueue_fail = nr_xmit - ret;
890                 if (enqueue_fail)
891                         free_pkts(&m[ret], nr_xmit - ret);
892         } else {
893                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
894                                                 m, nr_xmit);
895         }
896
897         if (enable_stats) {
898                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
899                                 __ATOMIC_SEQ_CST);
900                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
901                                 __ATOMIC_SEQ_CST);
902         }
903
904         if (!async_vhost_driver)
905                 free_pkts(m, nr_xmit);
906 }
907
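/*
 * Walk all vhost devices and drain any per-core TX buffer that has not
 * been flushed within MBUF_TABLE_DRAIN_TSC cycles.
 */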
908 static __rte_always_inline void
909 drain_vhost_table(void)
910 {
911         uint16_t lcore_id = rte_lcore_id();
912         struct vhost_bufftable *vhost_txq;
913         struct vhost_dev *vdev;
914         uint64_t cur_tsc;
915
916         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
917                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
918                                                 + vdev->vid];
919
920                 cur_tsc = rte_rdtsc();
921                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
922                                 > MBUF_TABLE_DRAIN_TSC)) {
923                         RTE_LOG_DP(DEBUG, VHOST_DATA,
924                                 "Vhost TX queue drained after timeout with burst size %u\n",
925                                 vhost_txq->len);
926                         drain_vhost(vdev);
927                         vhost_txq->len = 0;
928                         vhost_txq->pre_tsc = cur_tsc;
929                 }
930         }
931 }
932
933 /*
934  * Check if the packet destination MAC address is for a local device. If so then put
935  * the packet on that device's RX queue. If not then return.
936  */
937 static __rte_always_inline int
938 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
939 {
940         struct rte_ether_hdr *pkt_hdr;
941         struct vhost_dev *dst_vdev;
942         struct vhost_bufftable *vhost_txq;
943         uint16_t lcore_id = rte_lcore_id();
944         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
945
946         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
947         if (!dst_vdev)
948                 return -1;
949
950         if (vdev->vid == dst_vdev->vid) {
951                 RTE_LOG_DP(DEBUG, VHOST_DATA,
952                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
953                         vdev->vid);
954                 return 0;
955         }
956
957         RTE_LOG_DP(DEBUG, VHOST_DATA,
958                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
959
960         if (unlikely(dst_vdev->remove)) {
961                 RTE_LOG_DP(DEBUG, VHOST_DATA,
962                         "(%d) device is marked for removal\n", dst_vdev->vid);
963                 return 0;
964         }
965
966         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
967         vhost_txq->m_table[vhost_txq->len++] = m;
968
969         if (enable_stats) {
970                 vdev->stats.tx_total++;
971                 vdev->stats.tx++;
972         }
973
974         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
975                 drain_vhost(dst_vdev);
976                 vhost_txq->len = 0;
977                 vhost_txq->pre_tsc = rte_rdtsc();
978         }
979         return 0;
980 }
981
982 /*
983  * Check if the destination MAC of a packet belongs to a local VM; if so,
984  * return its VLAN tag and the length offset.
985  */
986 static __rte_always_inline int
987 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
988         uint32_t *offset, uint16_t *vlan_tag)
989 {
990         struct vhost_dev *dst_vdev;
991         struct rte_ether_hdr *pkt_hdr =
992                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
993
994         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
995         if (!dst_vdev)
996                 return 0;
997
998         if (vdev->vid == dst_vdev->vid) {
999                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1000                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1001                         vdev->vid);
1002                 return -1;
1003         }
1004
1005         /*
1006          * HW VLAN strip reduces the packet length by the length of the
1007          * VLAN tag, so the packet length needs to be restored by
1008          * adding it back.
1009          */
1010         *offset  = VLAN_HLEN;
1011         *vlan_tag = vlan_tags[vdev->vid];
1012
1013         RTE_LOG_DP(DEBUG, VHOST_DATA,
1014                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1015                 vdev->vid, dst_vdev->vid, *vlan_tag);
1016
1017         return 0;
1018 }
1019
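/*
 * Set up the TSO/checksum offload flags and the TCP pseudo-header
 * checksum of an LRO packet before it is sent out on the physical port.
 */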
1020 static void virtio_tx_offload(struct rte_mbuf *m)
1021 {
1022         struct rte_net_hdr_lens hdr_lens;
1023         struct rte_ipv4_hdr *ipv4_hdr;
1024         struct rte_tcp_hdr *tcp_hdr;
1025         uint32_t ptype;
1026         void *l3_hdr;
1027
1028         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1029         m->l2_len = hdr_lens.l2_len;
1030         m->l3_len = hdr_lens.l3_len;
1031         m->l4_len = hdr_lens.l4_len;
1032
1033         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1034         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1035                 m->l2_len + m->l3_len);
1036
1037         m->ol_flags |= PKT_TX_TCP_SEG;
1038         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1039                 m->ol_flags |= PKT_TX_IPV4;
1040                 m->ol_flags |= PKT_TX_IP_CKSUM;
1041                 ipv4_hdr = l3_hdr;
1042                 ipv4_hdr->hdr_checksum = 0;
1043                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1044         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1045                 m->ol_flags |= PKT_TX_IPV6;
1046                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1047         }
1048 }
1049
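/*
 * Transmit the buffered packets on the physical port and free any
 * packets that could not be sent.
 */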
1050 static __rte_always_inline void
1051 do_drain_mbuf_table(struct mbuf_table *tx_q)
1052 {
1053         uint16_t count;
1054
1055         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1056                                  tx_q->m_table, tx_q->len);
1057         if (unlikely(count < tx_q->len))
1058                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1059
1060         tx_q->len = 0;
1061 }
1062
1063 /*
1064  * This function routes the TX packet to the correct interface. This
1065  * may be a local device or the physical port.
1066  */
1067 static __rte_always_inline void
1068 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1069 {
1070         struct mbuf_table *tx_q;
1071         unsigned offset = 0;
1072         const uint16_t lcore_id = rte_lcore_id();
1073         struct rte_ether_hdr *nh;
1074
1075
1076         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1077         if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1078                 struct vhost_dev *vdev2;
1079
1080                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1081                         if (vdev2 != vdev)
1082                                 sync_virtio_xmit(vdev2, vdev, m);
1083                 }
1084                 goto queue2nic;
1085         }
1086
1087         /* Check if the destination is a local VM. */
1088         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1089                 return;
1090
1091         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1092                 if (unlikely(find_local_dest(vdev, m, &offset,
1093                                              &vlan_tag) != 0)) {
1094                         rte_pktmbuf_free(m);
1095                         return;
1096                 }
1097         }
1098
1099         RTE_LOG_DP(DEBUG, VHOST_DATA,
1100                 "(%d) TX: MAC address is external\n", vdev->vid);
1101
1102 queue2nic:
1103
1104         /* Add the packet to the port TX queue. */
1105         tx_q = &lcore_tx_queue[lcore_id];
1106
1107         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1108         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1109                 /* Guest has inserted the vlan tag. */
1110                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1111                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1112                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1113                         (vh->vlan_tci != vlan_tag_be))
1114                         vh->vlan_tci = vlan_tag_be;
1115         } else {
1116                 m->ol_flags |= PKT_TX_VLAN_PKT;
1117
1118                 /*
1119                  * Find the right seg to adjust the data len when offset is
1120                  * bigger than tail room size.
1121                  */
1122                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1123                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1124                                 m->data_len += offset;
1125                         else {
1126                                 struct rte_mbuf *seg = m;
1127
1128                                 while ((seg->next != NULL) &&
1129                                         (offset > rte_pktmbuf_tailroom(seg)))
1130                                         seg = seg->next;
1131
1132                                 seg->data_len += offset;
1133                         }
1134                         m->pkt_len += offset;
1135                 }
1136
1137                 m->vlan_tci = vlan_tag;
1138         }
1139
1140         if (m->ol_flags & PKT_RX_LRO)
1141                 virtio_tx_offload(m);
1142
1143         tx_q->m_table[tx_q->len++] = m;
1144         if (enable_stats) {
1145                 vdev->stats.tx_total++;
1146                 vdev->stats.tx++;
1147         }
1148
1149         if (unlikely(tx_q->len == MAX_PKT_BURST))
1150                 do_drain_mbuf_table(tx_q);
1151 }
1152
1153
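/*
 * Drain this core's physical port TX buffer if it has not been flushed
 * within MBUF_TABLE_DRAIN_TSC cycles.
 */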
1154 static __rte_always_inline void
1155 drain_mbuf_table(struct mbuf_table *tx_q)
1156 {
1157         static uint64_t prev_tsc;
1158         uint64_t cur_tsc;
1159
1160         if (tx_q->len == 0)
1161                 return;
1162
1163         cur_tsc = rte_rdtsc();
1164         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1165                 prev_tsc = cur_tsc;
1166
1167                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1168                         "TX queue drained after timeout with burst size %u\n",
1169                         tx_q->len);
1170                 do_drain_mbuf_table(tx_q);
1171         }
1172 }
1173
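/*
 * Receive a burst of packets from the VMDq RX queue bound to this vhost
 * device and enqueue them into the guest RX ring, optionally retrying
 * while the ring is short of free entries.
 */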
1174 static __rte_always_inline void
1175 drain_eth_rx(struct vhost_dev *vdev)
1176 {
1177         uint16_t rx_count, enqueue_count;
1178         struct rte_mbuf *pkts[MAX_PKT_BURST];
1179
1180         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1181                                     pkts, MAX_PKT_BURST);
1182
1183         if (!rx_count)
1184                 return;
1185
1186         /*
1187          * When "enable_retry" is set, wait and retry when there are not
1188          * enough free slots in the queue to hold @rx_count packets,
1189          * to reduce packet loss.
1190          */
1191         if (enable_retry &&
1192             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1193                         VIRTIO_RXQ))) {
1194                 uint32_t retry;
1195
1196                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1197                         rte_delay_us(burst_rx_delay_time);
1198                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1199                                         VIRTIO_RXQ))
1200                                 break;
1201                 }
1202         }
1203
1204         if (builtin_net_driver) {
1205                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1206                                                 pkts, rx_count);
1207         } else if (async_vhost_driver) {
1208                 uint16_t enqueue_fail = 0;
1209
1210                 complete_async_pkts(vdev);
1211                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1212                                         VIRTIO_RXQ, pkts, rx_count);
1213                 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1214
1215                 enqueue_fail = rx_count - enqueue_count;
1216                 if (enqueue_fail)
1217                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1218
1219         } else {
1220                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1221                                                 pkts, rx_count);
1222         }
1223
1224         if (enable_stats) {
1225                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1226                                 __ATOMIC_SEQ_CST);
1227                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1228                                 __ATOMIC_SEQ_CST);
1229         }
1230
1231         if (!async_vhost_driver)
1232                 free_pkts(pkts, rx_count);
1233 }
1234
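/*
 * Dequeue a burst of packets from the guest TX ring. If the device is
 * still in MAC-learning state, the first packet is used to set up its
 * VMDq binding; each packet is then routed via virtio_tx_route().
 */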
1235 static __rte_always_inline void
1236 drain_virtio_tx(struct vhost_dev *vdev)
1237 {
1238         struct rte_mbuf *pkts[MAX_PKT_BURST];
1239         uint16_t count;
1240         uint16_t i;
1241
1242         if (builtin_net_driver) {
1243                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1244                                         pkts, MAX_PKT_BURST);
1245         } else {
1246                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1247                                         mbuf_pool, pkts, MAX_PKT_BURST);
1248         }
1249
1250         /* Set up VMDq for the first packet. */
1251         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1252                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1253                         free_pkts(pkts, count);
1254         }
1255
1256         for (i = 0; i < count; ++i)
1257                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1258 }
1259
1260 /*
1261  * Main function of vhost-switch. It basically does:
1262  *
1263  * for each vhost device {
1264  *    - drain_eth_rx()
1265  *
1266  *      Which drains the host eth Rx queue linked to the vhost device,
1267  *      and delivers all of them to the guest virtio Rx ring associated with
1268  *      this vhost device.
1269  *
1270  *    - drain_virtio_tx()
1271  *
1272  *      Which drains the guest virtio Tx queue and delivers all of them
1273  *      to the target, which could be another vhost device, or the
1274  *      physical eth dev. The route is done in function "virtio_tx_route".
1275  * }
1276  */
1277 static int
1278 switch_worker(void *arg __rte_unused)
1279 {
1280         unsigned i;
1281         unsigned lcore_id = rte_lcore_id();
1282         struct vhost_dev *vdev;
1283         struct mbuf_table *tx_q;
1284
1285         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1286
1287         tx_q = &lcore_tx_queue[lcore_id];
1288         for (i = 0; i < rte_lcore_count(); i++) {
1289                 if (lcore_ids[i] == lcore_id) {
1290                         tx_q->txq_id = i;
1291                         break;
1292                 }
1293         }
1294
1295         while(1) {
1296                 drain_mbuf_table(tx_q);
1297                 drain_vhost_table();
1298                 /*
1299                  * Inform the configuration core that we have exited the
1300                  * linked list and that no devices are in use if requested.
1301                  */
1302                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1303                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1304
1305                 /*
1306                  * Process vhost devices
1307                  */
1308                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1309                               lcore_vdev_entry) {
1310                         if (unlikely(vdev->remove)) {
1311                                 unlink_vmdq(vdev);
1312                                 vdev->ready = DEVICE_SAFE_REMOVE;
1313                                 continue;
1314                         }
1315
1316                         if (likely(vdev->ready == DEVICE_RX))
1317                                 drain_eth_rx(vdev);
1318
1319                         if (likely(!vdev->remove))
1320                                 drain_virtio_tx(vdev);
1321                 }
1322         }
1323
1324         return 0;
1325 }
1326
1327 /*
1328  * Remove a device from the specific data core linked list and from the
1329  * main linked list. Synchronization occurs through the use of the
1330  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1331  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1332  */
1333 static void
1334 destroy_device(int vid)
1335 {
1336         struct vhost_dev *vdev = NULL;
1337         int lcore;
1338         uint16_t i;
1339
1340         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1341                 if (vdev->vid == vid)
1342                         break;
1343         }
1344         if (!vdev)
1345                 return;
1346         /* Set the remove flag. */
1347         vdev->remove = 1;
1348         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1349                 rte_pause();
1350         }
1351
1352         for (i = 0; i < RTE_MAX_LCORE; i++)
1353                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1354
1355         if (builtin_net_driver)
1356                 vs_vhost_net_remove(vdev);
1357
1358         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1359                      lcore_vdev_entry);
1360         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1361
1362
1363         /* Set the dev_removal_flag on each lcore. */
1364         RTE_LCORE_FOREACH_WORKER(lcore)
1365                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1366
1367         /*
1368          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1369          * we can be sure that they can no longer access the device removed
1370          * from the linked lists and that the devices are no longer in use.
1371          */
1372         RTE_LCORE_FOREACH_WORKER(lcore) {
1373                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1374                         rte_pause();
1375         }
1376
1377         lcore_info[vdev->coreid].device_num--;
1378
1379         RTE_LOG(INFO, VHOST_DATA,
1380                 "(%d) device has been removed from data core\n",
1381                 vdev->vid);
1382
1383         if (async_vhost_driver) {
1384                 uint16_t n_pkt = 0;
1385                 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1386
1387                 while (vdev->pkts_inflight) {
1388                         n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1389                                                 m_cpl, vdev->pkts_inflight);
1390                         free_pkts(m_cpl, n_pkt);
1391                         __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1392                 }
1393
1394                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1395         }
1396
1397         rte_free(vdev);
1398 }
1399
1400 /*
1401  * A new device is added to a data core. First the device is added to the main linked list
1402  * and then allocated to a specific data core.
1403  */
1404 static int
1405 new_device(int vid)
1406 {
1407         int lcore, core_add = 0;
1408         uint16_t i;
1409         uint32_t device_num_min = num_devices;
1410         struct vhost_dev *vdev;
1411         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1412         if (vdev == NULL) {
1413                 RTE_LOG(INFO, VHOST_DATA,
1414                         "(%d) couldn't allocate memory for vhost dev\n",
1415                         vid);
1416                 return -1;
1417         }
1418         vdev->vid = vid;
1419
1420         for (i = 0; i < RTE_MAX_LCORE; i++) {
1421                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1422                         = rte_zmalloc("vhost bufftable",
1423                                 sizeof(struct vhost_bufftable),
1424                                 RTE_CACHE_LINE_SIZE);
1425
1426                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1427                         RTE_LOG(INFO, VHOST_DATA,
1428                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1429                         return -1;
1430                 }
1431         }
1432
1433         if (builtin_net_driver)
1434                 vs_vhost_net_setup(vdev);
1435
1436         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1437         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1438
1439         /* Reset the ready flag. */
1440         vdev->ready = DEVICE_MAC_LEARNING;
1441         vdev->remove = 0;
1442
1443         /* Find a suitable lcore to add the device. */
1444         RTE_LCORE_FOREACH_WORKER(lcore) {
1445                 if (lcore_info[lcore].device_num < device_num_min) {
1446                         device_num_min = lcore_info[lcore].device_num;
1447                         core_add = lcore;
1448                 }
1449         }
1450         vdev->coreid = core_add;
1451
1452         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1453                           lcore_vdev_entry);
1454         lcore_info[vdev->coreid].device_num++;
1455
1456         /* Disable notifications. */
1457         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1458         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1459
1460         RTE_LOG(INFO, VHOST_DATA,
1461                 "(%d) device has been added to data core %d\n",
1462                 vid, vdev->coreid);
1463
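        /*
         * For the async data path, bind the ioat DMA callbacks to this
         * device's RX virtqueue so that enqueue copies can be offloaded to a
         * DMA engine.
         */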
1464         if (async_vhost_driver) {
1465                 struct rte_vhost_async_config config = {0};
1466                 struct rte_vhost_async_channel_ops channel_ops;
1467
1468                 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1469                         channel_ops.transfer_data = ioat_transfer_data_cb;
1470                         channel_ops.check_completed_copies =
1471                                 ioat_check_completed_copies_cb;
1472
1473                         config.features = RTE_VHOST_ASYNC_INORDER;
1474
1475                         return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1476                                 config, &channel_ops);
1477                 }
1478         }
1479
1480         return 0;
1481 }
1482
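/*
 * Called when the guest enables or disables a virtqueue. When the async data
 * path is used and an RX queue is disabled, any packets still in flight must
 * be drained and freed before the state change completes.
 */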
1483 static int
1484 vring_state_changed(int vid, uint16_t queue_id, int enable)
1485 {
1486         struct vhost_dev *vdev = NULL;
1487
1488         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1489                 if (vdev->vid == vid)
1490                         break;
1491         }
1492         if (!vdev)
1493                 return -1;
1494
1495         if (queue_id != VIRTIO_RXQ)
1496                 return 0;
1497
1498         if (async_vhost_driver) {
1499                 if (!enable) {
1500                         uint16_t n_pkt = 0;
1501                         struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1502
1503                         while (vdev->pkts_inflight) {
1504                                 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1505                                                         m_cpl, vdev->pkts_inflight);
1506                                 free_pkts(m_cpl, n_pkt);
1507                                 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1508                         }
1509                 }
1510         }
1511
1512         return 0;
1513 }
1514
1515 /*
1516  * These callbacks allow devices to be added to the data core when their
1517  * configuration has been fully completed.
1518  */
1519 static const struct vhost_device_ops virtio_net_device_ops =
1520 {
1521         .new_device =  new_device,
1522         .destroy_device = destroy_device,
1523         .vring_state_changed = vring_state_changed,
1524 };
1525
1526 /*
1527  * This thread wakes up periodically to print statistics if the user has
1528  * enabled them.
1529  */
1530 static void *
1531 print_stats(__rte_unused void *arg)
1532 {
1533         struct vhost_dev *vdev;
1534         uint64_t tx_dropped, rx_dropped;
1535         uint64_t tx, tx_total, rx, rx_total;
1536         const char clr[] = { 27, '[', '2', 'J', '\0' };
1537         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1538
1539         while(1) {
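                /* enable_stats doubles as the stats refresh period in seconds. */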
1540                 sleep(enable_stats);
1541
1542                 /* Clear screen and move to top left */
1543                 printf("%s%s\n", clr, top_left);
1544                 printf("Device statistics =================================\n");
1545
1546                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1547                         tx_total   = vdev->stats.tx_total;
1548                         tx         = vdev->stats.tx;
1549                         tx_dropped = tx_total - tx;
1550
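                        /* RX counters are updated by other worker cores, so read them atomically. */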
1551                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1552                                 __ATOMIC_SEQ_CST);
1553                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1554                                 __ATOMIC_SEQ_CST);
1555                         rx_dropped = rx_total - rx;
1556
1557                         printf("Statistics for device %d\n"
1558                                 "-----------------------\n"
1559                                 "TX total:              %" PRIu64 "\n"
1560                                 "TX dropped:            %" PRIu64 "\n"
1561                                 "TX successful:         %" PRIu64 "\n"
1562                                 "RX total:              %" PRIu64 "\n"
1563                                 "RX dropped:            %" PRIu64 "\n"
1564                                 "RX successful:         %" PRIu64 "\n",
1565                                 vdev->vid,
1566                                 tx_total, tx_dropped, tx,
1567                                 rx_total, rx_dropped, rx);
1568                 }
1569
1570                 printf("===================================================\n");
1571
1572                 fflush(stdout);
1573         }
1574
1575         return NULL;
1576 }
1577
1578 static void
1579 unregister_drivers(int socket_num)
1580 {
1581         int i, ret;
1582
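        /* socket_files stores the socket paths back to back, PATH_MAX bytes apart. */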
1583         for (i = 0; i < socket_num; i++) {
1584                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1585                 if (ret != 0)
1586                         RTE_LOG(ERR, VHOST_CONFIG,
1587                                 "Failed to unregister vhost driver for %s.\n",
1588                                 socket_files + i * PATH_MAX);
1589         }
1590 }
1591
1592 /* When we receive a SIGINT, unregister the vhost driver. */
1593 static void
1594 sigint_handler(__rte_unused int signum)
1595 {
1596         /* Unregister vhost driver. */
1597         unregister_drivers(nb_sockets);
1598
1599         exit(0);
1600 }
1601
1602 /*
1603  * While creating an mbuf pool, one key thing is to figure out how
1604  * many mbuf entries are enough for our use. Here are some
1605  * guidelines:
1606  *
1607  * - Each RX queue reserves @nr_rx_desc mbufs at queue setup stage.
1608  *
1609  * - For each switch core (a CPU core that does the packet switching),
1610  *   we also need to make some reservation for receiving packets from
1611  *   the virtio Tx queue. How many is enough depends on the usage; it
1612  *   is normally a simple calculation like the following:
1613  *
1614  *       MAX_PKT_BURST * max packet size / mbuf size
1615  *
1616  *   So we definitely need to allocate more mbufs when TSO is enabled.
1617  *
1618  * - Similarly, each switch core should reserve @nr_rx_desc mbufs for
1619  *   receiving packets from the physical NIC device.
1620  *
1621  * - We also need to make sure that, for each switch core, we have
1622  *   allocated enough mbufs to fill up the mbuf cache.
1623  */
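/*
 * Illustrative sizing only, assuming the usual defaults of
 * mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE (2176 bytes),
 * RTE_PKTMBUF_HEADROOM = 128 and MAX_PKT_BURST = 32: a 1500-byte MTU gives
 *
 *     (1500 + 2176) * 32 / (2176 - 128) = ~57 mbufs
 *
 * per switch core for virtio TX traffic, plus the @nr_rx_desc descriptors
 * (1024 by default) reserved for the NIC RX ring, i.e. roughly 1081 mbufs
 * per switch core before the per-queue and per-port terms below are added.
 */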
1624 static void
1625 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1626         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1627 {
1628         uint32_t nr_mbufs;
1629         uint32_t nr_mbufs_per_core;
1630         uint32_t mtu = 1500;
1631
1632         if (mergeable)
1633                 mtu = 9000;
1634         if (enable_tso)
1635                 mtu = 64 * 1024;
1636
1637         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1638                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1639         nr_mbufs_per_core += nr_rx_desc;
1640         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1641
1642         nr_mbufs  = nr_queues * nr_rx_desc;
1643         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1644         nr_mbufs *= nr_port;
1645
1646         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1647                                             nr_mbuf_cache, 0, mbuf_size,
1648                                             rte_socket_id());
1649         if (mbuf_pool == NULL)
1650                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1651 }
1652
1653 /*
1654  * Main function, does initialisation and calls the per-lcore functions.
1655  */
1656 int
1657 main(int argc, char *argv[])
1658 {
1659         unsigned lcore_id, core_id = 0;
1660         unsigned nb_ports, valid_num_ports;
1661         int ret, i;
1662         uint16_t portid;
1663         static pthread_t tid;
1664         uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1665
1666         signal(SIGINT, sigint_handler);
1667
1668         /* init EAL */
1669         ret = rte_eal_init(argc, argv);
1670         if (ret < 0)
1671                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
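        /* rte_eal_init() consumed the leading EAL arguments; skip past them. */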
1672         argc -= ret;
1673         argv += ret;
1674
1675         /* parse app arguments */
1676         ret = us_vhost_parse_args(argc, argv);
1677         if (ret < 0)
1678                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1679
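        /*
         * Initialise the per-lcore device lists and record the IDs of the
         * enabled lcores.
         */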
1680         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1681                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1682
1683                 if (rte_lcore_is_enabled(lcore_id))
1684                         lcore_ids[core_id++] = lcore_id;
1685         }
1686
1687         if (rte_lcore_count() > RTE_MAX_LCORE)
1688                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1689
1690         /* Get the number of physical ports. */
1691         nb_ports = rte_eth_dev_count_avail();
1692
1693         /*
1694          * Update the global variable num_ports and the global array ports[],
1695          * and derive the number of valid ports from the system port count.
1696          */
1697         valid_num_ports = check_ports_num(nb_ports);
1698
1699         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1700                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1701                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1702                 return -1;
1703         }
1704
1705         /*
1706          * FIXME: here we allocate enough mbufs for @MAX_QUEUES queues, but
1707          * in practice we are never going to use that many queues. We
1708          * should probably only allocate for the queues we are actually
1709          * going to use.
1710          */
1711         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1712                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1713
1714         if (vm2vm_mode == VM2VM_HARDWARE) {
1715                 /* Enable VT loop back to let L2 switch to do it. */
1716                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1717                 RTE_LOG(DEBUG, VHOST_CONFIG,
1718                         "Enable loop back for L2 switch in vmdq.\n");
1719         }
1720
1721         /* initialize all ports */
1722         RTE_ETH_FOREACH_DEV(portid) {
1723                 /* skip ports that are not enabled */
1724                 if ((enabled_port_mask & (1 << portid)) == 0) {
1725                         RTE_LOG(INFO, VHOST_PORT,
1726                                 "Skipping disabled port %d\n", portid);
1727                         continue;
1728                 }
1729                 if (port_init(portid) != 0)
1730                         rte_exit(EXIT_FAILURE,
1731                                 "Cannot initialize network ports\n");
1732         }
1733
1734         /* Enable stats if the user option is set. */
1735         if (enable_stats) {
1736                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1737                                         print_stats, NULL);
1738                 if (ret < 0)
1739                         rte_exit(EXIT_FAILURE,
1740                                 "Cannot create print-stats thread\n");
1741         }
1742
1743         /* Launch all data cores. */
1744         RTE_LCORE_FOREACH_WORKER(lcore_id)
1745                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1746
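        /*
         * In client mode the vhost library connects to an existing vhost-user
         * socket instead of creating and listening on one itself.
         */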
1747         if (client_mode)
1748                 flags |= RTE_VHOST_USER_CLIENT;
1749
1750         /* Register vhost user driver to handle vhost messages. */
1751         for (i = 0; i < nb_sockets; i++) {
1752                 char *file = socket_files + i * PATH_MAX;
1753
1754                 if (async_vhost_driver)
1755                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1756
1757                 ret = rte_vhost_driver_register(file, flags);
1758                 if (ret != 0) {
1759                         unregister_drivers(i);
1760                         rte_exit(EXIT_FAILURE,
1761                                 "vhost driver register failure.\n");
1762                 }
1763
1764                 if (builtin_net_driver)
1765                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1766
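                /*
                 * Only advertise features the configured data path can handle:
                 * mergeable RX buffers, checksum and TSO offloads are stripped
                 * below when the corresponding options are disabled.
                 */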
1767                 if (mergeable == 0) {
1768                         rte_vhost_driver_disable_features(file,
1769                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1770                 }
1771
1772                 if (enable_tx_csum == 0) {
1773                         rte_vhost_driver_disable_features(file,
1774                                 1ULL << VIRTIO_NET_F_CSUM);
1775                 }
1776
1777                 if (enable_tso == 0) {
1778                         rte_vhost_driver_disable_features(file,
1779                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1780                         rte_vhost_driver_disable_features(file,
1781                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1782                         rte_vhost_driver_disable_features(file,
1783                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1784                         rte_vhost_driver_disable_features(file,
1785                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1786                 }
1787
1788                 if (promiscuous) {
1789                         rte_vhost_driver_enable_features(file,
1790                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1791                 }
1792
1793                 ret = rte_vhost_driver_callback_register(file,
1794                         &virtio_net_device_ops);
1795                 if (ret != 0) {
1796                         rte_exit(EXIT_FAILURE,
1797                                 "failed to register vhost driver callbacks.\n");
1798                 }
1799
1800                 if (rte_vhost_driver_start(file) < 0) {
1801                         rte_exit(EXIT_FAILURE,
1802                                 "failed to start vhost driver.\n");
1803                 }
1804         }
1805
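        /* Wait for all worker cores to return before cleaning up. */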
1806         RTE_LCORE_FOREACH_WORKER(lcore_id)
1807                 rte_eal_wait_lcore(lcore_id);
1808
1809         /* clean up the EAL */
1810         rte_eal_cleanup();
1811
1812         return 0;
1813 }