1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "ioat.h"
29 #include "main.h"
30
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37
38 #define MBUF_CACHE_SIZE 128
39 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
40
41 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
42
43 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
45
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47
48 /* State of virtio device. */
49 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_RX                       1
51 #define DEVICE_SAFE_REMOVE      2
52
53 /* Configurable number of RX/TX ring descriptors */
54 #define RTE_TEST_RX_DESC_DEFAULT 1024
55 #define RTE_TEST_TX_DESC_DEFAULT 512
56
57 #define INVALID_PORT_ID 0xFF
58
59 /* mask of enabled ports */
60 static uint32_t enabled_port_mask = 0;
61
62 /* Promiscuous mode */
63 static uint32_t promiscuous;
64
65 /* number of devices/queues to support*/
66 static uint32_t num_queues = 0;
67 static uint32_t num_devices;
68
69 static struct rte_mempool *mbuf_pool;
70 static int mergeable;
71
72 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
73 typedef enum {
74         VM2VM_DISABLED = 0,
75         VM2VM_SOFTWARE = 1,
76         VM2VM_HARDWARE = 2,
77         VM2VM_LAST
78 } vm2vm_type;
79 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
80
81 /* Enable stats. */
82 static uint32_t enable_stats = 0;
83 /* Enable retries on RX. */
84 static uint32_t enable_retry = 1;
85
86 /* Disable TX checksum offload */
87 static uint32_t enable_tx_csum;
88
89 /* Disable TSO offload */
90 static uint32_t enable_tso;
91
92 static int client_mode;
93
94 static int builtin_net_driver;
95
96 static int async_vhost_driver;
97
98 static char *dma_type;
99
100 /* Specify timeout (in microseconds) between retries on RX. */
101 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
102 /* Specify the number of retries on RX. */
103 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
104
105 /* Socket file paths. Can be set by user */
106 static char *socket_files;
107 static int nb_sockets;
108
109 /* empty vmdq configuration structure. Filled in programmatically */
110 static struct rte_eth_conf vmdq_conf_default = {
111         .rxmode = {
112                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
113                 .split_hdr_size = 0,
114                 /*
115                  * VLAN strip is necessary for 1G NICs such as the I350;
116                  * it fixes a bug where IPv4 forwarding in the guest cannot
117                  * forward packets from one virtio device to another.
118                  */
119                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
120         },
121
122         .txmode = {
123                 .mq_mode = ETH_MQ_TX_NONE,
124                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
125                              DEV_TX_OFFLOAD_TCP_CKSUM |
126                              DEV_TX_OFFLOAD_VLAN_INSERT |
127                              DEV_TX_OFFLOAD_MULTI_SEGS |
128                              DEV_TX_OFFLOAD_TCP_TSO),
129         },
130         .rx_adv_conf = {
131                 /*
132                  * should be overridden separately in code with
133                  * appropriate values
134                  */
135                 .vmdq_rx_conf = {
136                         .nb_queue_pools = ETH_8_POOLS,
137                         .enable_default_pool = 0,
138                         .default_pool = 0,
139                         .nb_pool_maps = 0,
140                         .pool_map = {{0, 0},},
141                 },
142         },
143 };
144
145
146 static unsigned lcore_ids[RTE_MAX_LCORE];
147 static uint16_t ports[RTE_MAX_ETHPORTS];
148 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
149 static uint16_t num_pf_queues, num_vmdq_queues;
150 static uint16_t vmdq_pool_base, vmdq_queue_base;
151 static uint16_t queues_per_pool;
152
153 const uint16_t vlan_tags[] = {
154         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
155         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
156         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
157         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
158         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
159         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
160         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
161         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
162 };
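/*
 * One VLAN tag per potential vhost device: vlan_tags[vid] is used both when
 * registering the device's MAC with its VMDQ pool (link_vmdq()) and when
 * tagging its packets on the way out to the physical port (virtio_tx_route()).
 */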
163
164 /* ethernet addresses of ports */
165 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
166
167 static struct vhost_dev_tailq_list vhost_dev_list =
168         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
169
170 static struct lcore_info lcore_info[RTE_MAX_LCORE];
171
172 /* Used for queueing bursts of TX packets. */
173 struct mbuf_table {
174         unsigned len;
175         unsigned txq_id;
176         struct rte_mbuf *m_table[MAX_PKT_BURST];
177 };
178
179 struct vhost_bufftable {
180         uint32_t len;
181         uint64_t pre_tsc;
182         struct rte_mbuf *m_table[MAX_PKT_BURST];
183 };
184
185 /* TX queue for each data core. */
186 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
187
188 /*
189  * Vhost TX buffer for each data core.
190  * Every data core maintains a TX buffer for every vhost device,
191  * which is used to batch packet enqueues for higher performance.
192  */
193 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
194
195 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
196                                  / US_PER_S * BURST_TX_DRAIN_US)
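/*
 * Illustrative arithmetic (assuming a 2 GHz TSC, which is not guaranteed):
 * (2e9 + 1e6 - 1) / 1e6 * 100 ~= 200,000 cycles, i.e. the TX mbuf table is
 * drained roughly every 100 us regardless of the actual TSC frequency.
 */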
197 #define VLAN_HLEN       4
198
199 static inline int
200 open_dma(const char *value)
201 {
202         if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
203                 return open_ioat(value);
204
205         return -1;
206 }
207
208 /*
209  * Builds up the correct configuration for VMDQ VLAN pool map
210  * according to the pool & queue limits.
211  */
212 static inline int
213 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
214 {
215         struct rte_eth_vmdq_rx_conf conf;
216         struct rte_eth_vmdq_rx_conf *def_conf =
217                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
218         unsigned i;
219
220         memset(&conf, 0, sizeof(conf));
221         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
222         conf.nb_pool_maps = num_devices;
223         conf.enable_loop_back = def_conf->enable_loop_back;
224         conf.rx_mode = def_conf->rx_mode;
225
226         for (i = 0; i < conf.nb_pool_maps; i++) {
227                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
228                 conf.pool_map[i].pools = (1UL << i);
229         }
230
231         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
232         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
233                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
234         return 0;
235 }
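/*
 * Illustrative result of the mapping above: with num_devices == 2, VLAN 1000
 * is mapped to pool bitmask 0x1 (device 0) and VLAN 1001 to pool bitmask 0x2
 * (device 1), so each VMDQ pool only receives traffic carrying its device's
 * VLAN tag.
 */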
236
237 /*
238  * Initialises a given port using global settings, with the rx buffers
239  * coming from the global mbuf_pool.
240  */
241 static inline int
242 port_init(uint16_t port)
243 {
244         struct rte_eth_dev_info dev_info;
245         struct rte_eth_conf port_conf;
246         struct rte_eth_rxconf *rxconf;
247         struct rte_eth_txconf *txconf;
248         int16_t rx_rings, tx_rings;
249         uint16_t rx_ring_size, tx_ring_size;
250         int retval;
251         uint16_t q;
252
253         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
254         retval = rte_eth_dev_info_get(port, &dev_info);
255         if (retval != 0) {
256                 RTE_LOG(ERR, VHOST_PORT,
257                         "Error during getting device (port %u) info: %s\n",
258                         port, strerror(-retval));
259
260                 return retval;
261         }
262
263         rxconf = &dev_info.default_rxconf;
264         txconf = &dev_info.default_txconf;
265         rxconf->rx_drop_en = 1;
266
267         /*configure the number of supported virtio devices based on VMDQ limits */
268         num_devices = dev_info.max_vmdq_pools;
269
270         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
271         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
272
273         tx_rings = (uint16_t)rte_lcore_count();
274
275         /* Get port configuration. */
276         retval = get_eth_conf(&port_conf, num_devices);
277         if (retval < 0)
278                 return retval;
279         /* NIC queues are divided into pf queues and vmdq queues.  */
280         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
281         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
282         num_vmdq_queues = num_devices * queues_per_pool;
283         num_queues = num_pf_queues + num_vmdq_queues;
284         vmdq_queue_base = dev_info.vmdq_queue_base;
285         vmdq_pool_base  = dev_info.vmdq_pool_base;
286         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
287                 num_pf_queues, num_devices, queues_per_pool);
288
289         if (!rte_eth_dev_is_valid_port(port))
290                 return -1;
291
292         rx_rings = (uint16_t)dev_info.max_rx_queues;
293         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
294                 port_conf.txmode.offloads |=
295                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
296         /* Configure ethernet device. */
297         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
298         if (retval != 0) {
299                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
300                         port, strerror(-retval));
301                 return retval;
302         }
303
304         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
305                 &tx_ring_size);
306         if (retval != 0) {
307                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
308                         "for port %u: %s.\n", port, strerror(-retval));
309                 return retval;
310         }
311         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
312                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
313                         "for Rx queues on port %u.\n", port);
314                 return -1;
315         }
316
317         /* Setup the queues. */
318         rxconf->offloads = port_conf.rxmode.offloads;
319         for (q = 0; q < rx_rings; q ++) {
320                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
321                                                 rte_eth_dev_socket_id(port),
322                                                 rxconf,
323                                                 mbuf_pool);
324                 if (retval < 0) {
325                         RTE_LOG(ERR, VHOST_PORT,
326                                 "Failed to setup rx queue %u of port %u: %s.\n",
327                                 q, port, strerror(-retval));
328                         return retval;
329                 }
330         }
331         txconf->offloads = port_conf.txmode.offloads;
332         for (q = 0; q < tx_rings; q ++) {
333                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
334                                                 rte_eth_dev_socket_id(port),
335                                                 txconf);
336                 if (retval < 0) {
337                         RTE_LOG(ERR, VHOST_PORT,
338                                 "Failed to setup tx queue %u of port %u: %s.\n",
339                                 q, port, strerror(-retval));
340                         return retval;
341                 }
342         }
343
344         /* Start the device. */
345         retval  = rte_eth_dev_start(port);
346         if (retval < 0) {
347                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
348                         port, strerror(-retval));
349                 return retval;
350         }
351
352         if (promiscuous) {
353                 retval = rte_eth_promiscuous_enable(port);
354                 if (retval != 0) {
355                         RTE_LOG(ERR, VHOST_PORT,
356                                 "Failed to enable promiscuous mode on port %u: %s\n",
357                                 port, rte_strerror(-retval));
358                         return retval;
359                 }
360         }
361
362         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
363         if (retval < 0) {
364                 RTE_LOG(ERR, VHOST_PORT,
365                         "Failed to get MAC address on port %u: %s\n",
366                         port, rte_strerror(-retval));
367                 return retval;
368         }
369
370         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
371         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
372                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
373                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
374
375         return 0;
376 }
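/*
 * Note: RTE_ETHER_ADDR_BYTES() (rte_ether.h) expands to the six addr_bytes of
 * an rte_ether_addr, matching the six "%02" PRIx8 conversions in the MAC log
 * message above.
 */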
377
378 /*
379  * Set socket file path.
380  */
381 static int
382 us_vhost_parse_socket_path(const char *q_arg)
383 {
384         char *old;
385
386         /* parse number string */
387         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
388                 return -1;
389
390         old = socket_files;
391         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
392         if (socket_files == NULL) {
393                 free(old);
394                 return -1;
395         }
396
397         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
398         nb_sockets++;
399
400         return 0;
401 }
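/*
 * Layout note: socket_files is a single flat buffer of nb_sockets fixed-size
 * slots of PATH_MAX bytes each; the path for socket i lives at
 * socket_files + i * PATH_MAX.
 */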
402
403 /*
404  * Parse the portmask provided at run time.
405  */
406 static int
407 parse_portmask(const char *portmask)
408 {
409         char *end = NULL;
410         unsigned long pm;
411
412         errno = 0;
413
414         /* parse hexadecimal string */
415         pm = strtoul(portmask, &end, 16);
416         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
417                 return 0;
418
419         return pm;
420
421 }
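/*
 * Example (illustrative): "-p 0x1" enables port 0 only and "-p 0x3" enables
 * ports 0 and 1; note that this sample later enforces at most MAX_SUP_PORTS
 * (1) enabled port.
 */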
422
423 /*
424  * Parse num options at run time.
425  */
426 static int
427 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
428 {
429         char *end = NULL;
430         unsigned long num;
431
432         errno = 0;
433
434         /* parse unsigned int string */
435         num = strtoul(q_arg, &end, 10);
436         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
437                 return -1;
438
439         if (num > max_valid_value)
440                 return -1;
441
442         return num;
443
444 }
445
446 /*
447  * Display usage
448  */
449 static void
450 us_vhost_usage(const char *prgname)
451 {
452         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
453         "               --vm2vm [0|1|2]\n"
454         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
455         "               --socket-file <path>\n"
456         "               --nb-devices ND\n"
457         "               -p PORTMASK: Set mask for ports to be used by application\n"
458         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
459         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
460         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
461         "               --rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
462         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
463         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
464         "               --socket-file: The path of the socket file.\n"
465         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
466         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
467         "               --client register a vhost-user socket as client mode.\n"
468         "               --dma-type register dma type for your vhost async driver, e.g. \"ioat\" (the only type currently supported).\n"
469         "               --dmas register dma channels for specific vhost devices.\n",
470                prgname);
471 }
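/*
 * Illustrative invocation (binary name, EAL options and socket path are
 * placeholders, not prescriptive):
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *                --mergeable 1 --stats 2
 */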
472
473 enum {
474 #define OPT_VM2VM               "vm2vm"
475         OPT_VM2VM_NUM = 256,
476 #define OPT_RX_RETRY            "rx-retry"
477         OPT_RX_RETRY_NUM,
478 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
479         OPT_RX_RETRY_DELAY_NUM,
480 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
481         OPT_RX_RETRY_NUMB_NUM,
482 #define OPT_MERGEABLE           "mergeable"
483         OPT_MERGEABLE_NUM,
484 #define OPT_STATS               "stats"
485         OPT_STATS_NUM,
486 #define OPT_SOCKET_FILE         "socket-file"
487         OPT_SOCKET_FILE_NUM,
488 #define OPT_TX_CSUM             "tx-csum"
489         OPT_TX_CSUM_NUM,
490 #define OPT_TSO                 "tso"
491         OPT_TSO_NUM,
492 #define OPT_CLIENT              "client"
493         OPT_CLIENT_NUM,
494 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
495         OPT_BUILTIN_NET_DRIVER_NUM,
496 #define OPT_DMA_TYPE            "dma-type"
497         OPT_DMA_TYPE_NUM,
498 #define OPT_DMAS                "dmas"
499         OPT_DMAS_NUM,
500 };
501
502 /*
503  * Parse the arguments given in the command line of the application.
504  */
505 static int
506 us_vhost_parse_args(int argc, char **argv)
507 {
508         int opt, ret;
509         int option_index;
510         unsigned i;
511         const char *prgname = argv[0];
512         static struct option long_option[] = {
513                 {OPT_VM2VM, required_argument,
514                                 NULL, OPT_VM2VM_NUM},
515                 {OPT_RX_RETRY, required_argument,
516                                 NULL, OPT_RX_RETRY_NUM},
517                 {OPT_RX_RETRY_DELAY, required_argument,
518                                 NULL, OPT_RX_RETRY_DELAY_NUM},
519                 {OPT_RX_RETRY_NUMB, required_argument,
520                                 NULL, OPT_RX_RETRY_NUMB_NUM},
521                 {OPT_MERGEABLE, required_argument,
522                                 NULL, OPT_MERGEABLE_NUM},
523                 {OPT_STATS, required_argument,
524                                 NULL, OPT_STATS_NUM},
525                 {OPT_SOCKET_FILE, required_argument,
526                                 NULL, OPT_SOCKET_FILE_NUM},
527                 {OPT_TX_CSUM, required_argument,
528                                 NULL, OPT_TX_CSUM_NUM},
529                 {OPT_TSO, required_argument,
530                                 NULL, OPT_TSO_NUM},
531                 {OPT_CLIENT, no_argument,
532                                 NULL, OPT_CLIENT_NUM},
533                 {OPT_BUILTIN_NET_DRIVER, no_argument,
534                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
535                 {OPT_DMA_TYPE, required_argument,
536                                 NULL, OPT_DMA_TYPE_NUM},
537                 {OPT_DMAS, required_argument,
538                                 NULL, OPT_DMAS_NUM},
539                 {NULL, 0, 0, 0},
540         };
541
542         /* Parse command line */
543         while ((opt = getopt_long(argc, argv, "p:P",
544                         long_option, &option_index)) != EOF) {
545                 switch (opt) {
546                 /* Portmask */
547                 case 'p':
548                         enabled_port_mask = parse_portmask(optarg);
549                         if (enabled_port_mask == 0) {
550                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
551                                 us_vhost_usage(prgname);
552                                 return -1;
553                         }
554                         break;
555
556                 case 'P':
557                         promiscuous = 1;
558                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
559                                 ETH_VMDQ_ACCEPT_BROADCAST |
560                                 ETH_VMDQ_ACCEPT_MULTICAST;
561                         break;
562
563                 case OPT_VM2VM_NUM:
564                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
565                         if (ret == -1) {
566                                 RTE_LOG(INFO, VHOST_CONFIG,
567                                         "Invalid argument for "
568                                         "vm2vm [0|1|2]\n");
569                                 us_vhost_usage(prgname);
570                                 return -1;
571                         }
572                         vm2vm_mode = (vm2vm_type)ret;
573                         break;
574
575                 case OPT_RX_RETRY_NUM:
576                         ret = parse_num_opt(optarg, 1);
577                         if (ret == -1) {
578                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
579                                 us_vhost_usage(prgname);
580                                 return -1;
581                         }
582                         enable_retry = ret;
583                         break;
584
585                 case OPT_TX_CSUM_NUM:
586                         ret = parse_num_opt(optarg, 1);
587                         if (ret == -1) {
588                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
589                                 us_vhost_usage(prgname);
590                                 return -1;
591                         }
592                         enable_tx_csum = ret;
593                         break;
594
595                 case OPT_TSO_NUM:
596                         ret = parse_num_opt(optarg, 1);
597                         if (ret == -1) {
598                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
599                                 us_vhost_usage(prgname);
600                                 return -1;
601                         }
602                         enable_tso = ret;
603                         break;
604
605                 case OPT_RX_RETRY_DELAY_NUM:
606                         ret = parse_num_opt(optarg, INT32_MAX);
607                         if (ret == -1) {
608                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
609                                 us_vhost_usage(prgname);
610                                 return -1;
611                         }
612                         burst_rx_delay_time = ret;
613                         break;
614
615                 case OPT_RX_RETRY_NUMB_NUM:
616                         ret = parse_num_opt(optarg, INT32_MAX);
617                         if (ret == -1) {
618                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
619                                 us_vhost_usage(prgname);
620                                 return -1;
621                         }
622                         burst_rx_retry_num = ret;
623                         break;
624
625                 case OPT_MERGEABLE_NUM:
626                         ret = parse_num_opt(optarg, 1);
627                         if (ret == -1) {
628                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
629                                 us_vhost_usage(prgname);
630                                 return -1;
631                         }
632                         mergeable = !!ret;
633                         if (ret) {
634                                 vmdq_conf_default.rxmode.offloads |=
635                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
636                                 vmdq_conf_default.rxmode.max_rx_pkt_len
637                                         = JUMBO_FRAME_MAX_SIZE;
638                         }
639                         break;
640
641                 case OPT_STATS_NUM:
642                         ret = parse_num_opt(optarg, INT32_MAX);
643                         if (ret == -1) {
644                                 RTE_LOG(INFO, VHOST_CONFIG,
645                                         "Invalid argument for stats [0..N]\n");
646                                 us_vhost_usage(prgname);
647                                 return -1;
648                         }
649                         enable_stats = ret;
650                         break;
651
652                 /* Set socket file path. */
653                 case OPT_SOCKET_FILE_NUM:
654                         if (us_vhost_parse_socket_path(optarg) == -1) {
655                                 RTE_LOG(INFO, VHOST_CONFIG,
656                                 "Invalid argument for socket name (Max %d characters)\n",
657                                 PATH_MAX);
658                                 us_vhost_usage(prgname);
659                                 return -1;
660                         }
661                         break;
662
663                 case OPT_DMA_TYPE_NUM:
664                         dma_type = optarg;
665                         break;
666
667                 case OPT_DMAS_NUM:
668                         if (open_dma(optarg) == -1) {
669                                 RTE_LOG(INFO, VHOST_CONFIG,
670                                         "Wrong DMA args\n");
671                                 us_vhost_usage(prgname);
672                                 return -1;
673                         }
674                         async_vhost_driver = 1;
675                         break;
676
677                 case OPT_CLIENT_NUM:
678                         client_mode = 1;
679                         break;
680
681                 case OPT_BUILTIN_NET_DRIVER_NUM:
682                         builtin_net_driver = 1;
683                         break;
684
685                 /* Invalid option - print options. */
686                 default:
687                         us_vhost_usage(prgname);
688                         return -1;
689                 }
690         }
691
692         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
693                 if (enabled_port_mask & (1 << i))
694                         ports[num_ports++] = i;
695         }
696
697         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
698                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
699                         "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
700                 return -1;
701         }
702
703         return 0;
704 }
705
706 /*
707  * Update the global variable num_ports and the ports[] array according to the
708  * number of ports in the system, and return the number of valid ports.
709  */
710 static unsigned check_ports_num(unsigned nb_ports)
711 {
712         unsigned valid_num_ports = num_ports;
713         unsigned portid;
714
715         if (num_ports > nb_ports) {
716                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
717                         num_ports, nb_ports);
718                 num_ports = nb_ports;
719         }
720
721         for (portid = 0; portid < num_ports; portid ++) {
722                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
723                         RTE_LOG(INFO, VHOST_PORT,
724                                 "\nSpecified port ID(%u) is not valid\n",
725                                 ports[portid]);
726                         ports[portid] = INVALID_PORT_ID;
727                         valid_num_ports--;
728                 }
729         }
730         return valid_num_ports;
731 }
732
733 static __rte_always_inline struct vhost_dev *
734 find_vhost_dev(struct rte_ether_addr *mac)
735 {
736         struct vhost_dev *vdev;
737
738         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
739                 if (vdev->ready == DEVICE_RX &&
740                     rte_is_same_ether_addr(mac, &vdev->mac_address))
741                         return vdev;
742         }
743
744         return NULL;
745 }
746
747 /*
748  * This function learns the MAC address of the device and registers it, along
749  * with a vlan tag, with a VMDQ pool.
750  */
751 static int
752 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
753 {
754         struct rte_ether_hdr *pkt_hdr;
755         int i, ret;
756
757         /* Learn MAC address of guest device from packet */
758         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
759
760         if (find_vhost_dev(&pkt_hdr->s_addr)) {
761                 RTE_LOG(ERR, VHOST_DATA,
762                         "(%d) device is using a registered MAC!\n",
763                         vdev->vid);
764                 return -1;
765         }
766
767         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
768                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
769
770         /* vlan_tag currently uses the device_id. */
771         vdev->vlan_tag = vlan_tags[vdev->vid];
772
773         /* Print out VMDQ registration info. */
774         RTE_LOG(INFO, VHOST_DATA,
775                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
776                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
777                 vdev->vlan_tag);
778
779         /* Register the MAC address. */
780         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
781                                 (uint32_t)vdev->vid + vmdq_pool_base);
782         if (ret)
783                 RTE_LOG(ERR, VHOST_DATA,
784                         "(%d) failed to add device MAC address to VMDQ\n",
785                         vdev->vid);
786
787         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
788
789         /* Set device as ready for RX. */
790         vdev->ready = DEVICE_RX;
791
792         return 0;
793 }
794
795 /*
796  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
797  * queue before disabling RX on the device.
798  */
799 static inline void
800 unlink_vmdq(struct vhost_dev *vdev)
801 {
802         unsigned i = 0;
803         unsigned rx_count;
804         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
805
806         if (vdev->ready == DEVICE_RX) {
807                 /*clear MAC and VLAN settings*/
808                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
809                 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
810                         vdev->mac_address.addr_bytes[i] = 0;
811
812                 vdev->vlan_tag = 0;
813
814                 /*Clear out the receive buffers*/
815                 rx_count = rte_eth_rx_burst(ports[0],
816                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
817
818                 while (rx_count) {
819                         for (i = 0; i < rx_count; i++)
820                                 rte_pktmbuf_free(pkts_burst[i]);
821
822                         rx_count = rte_eth_rx_burst(ports[0],
823                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
824                 }
825
826                 vdev->ready = DEVICE_MAC_LEARNING;
827         }
828 }
829
830 static inline void
831 free_pkts(struct rte_mbuf **pkts, uint16_t n)
832 {
833         while (n--)
834                 rte_pktmbuf_free(pkts[n]);
835 }
836
837 static __rte_always_inline void
838 complete_async_pkts(struct vhost_dev *vdev)
839 {
840         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
841         uint16_t complete_count;
842
843         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
844                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
845         if (complete_count) {
846                 free_pkts(p_cpl, complete_count);
847                 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
848         }
849
850 }
851
852 static __rte_always_inline void
853 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
854             struct rte_mbuf *m)
855 {
856         uint16_t ret;
857
858         if (builtin_net_driver) {
859                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
860         } else {
861                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
862         }
863
864         if (enable_stats) {
865                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
866                                 __ATOMIC_SEQ_CST);
867                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
868                                 __ATOMIC_SEQ_CST);
869                 src_vdev->stats.tx_total++;
870                 src_vdev->stats.tx += ret;
871         }
872 }
873
874 static __rte_always_inline void
875 drain_vhost(struct vhost_dev *vdev)
876 {
877         uint16_t ret;
878         uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
879         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
880         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
881
882         if (builtin_net_driver) {
883                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
884         } else if (async_vhost_driver) {
885                 uint32_t cpu_cpl_nr = 0;
886                 uint16_t enqueue_fail = 0;
887                 struct rte_mbuf *m_cpu_cpl[nr_xmit];
888
889                 complete_async_pkts(vdev);
890                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
891                                         m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
892                 __atomic_add_fetch(&vdev->pkts_inflight, ret - cpu_cpl_nr, __ATOMIC_SEQ_CST);
893
894                 if (cpu_cpl_nr)
895                         free_pkts(m_cpu_cpl, cpu_cpl_nr);
896
897                 enqueue_fail = nr_xmit - ret;
898                 if (enqueue_fail)
899                         free_pkts(&m[ret], nr_xmit - ret);
900         } else {
901                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
902                                                 m, nr_xmit);
903         }
904
905         if (enable_stats) {
906                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
907                                 __ATOMIC_SEQ_CST);
908                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
909                                 __ATOMIC_SEQ_CST);
910         }
911
912         if (!async_vhost_driver)
913                 free_pkts(m, nr_xmit);
914 }
915
916 static __rte_always_inline void
917 drain_vhost_table(void)
918 {
919         uint16_t lcore_id = rte_lcore_id();
920         struct vhost_bufftable *vhost_txq;
921         struct vhost_dev *vdev;
922         uint64_t cur_tsc;
923
924         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
925                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
926                                                 + vdev->vid];
927
928                 cur_tsc = rte_rdtsc();
929                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
930                                 > MBUF_TABLE_DRAIN_TSC)) {
931                         RTE_LOG_DP(DEBUG, VHOST_DATA,
932                                 "Vhost TX queue drained after timeout with burst size %u\n",
933                                 vhost_txq->len);
934                         drain_vhost(vdev);
935                         vhost_txq->len = 0;
936                         vhost_txq->pre_tsc = cur_tsc;
937                 }
938         }
939 }
940
941 /*
942  * Check if the packet destination MAC address is for a local device. If so then put
943  * the packet on that device's RX queue. If not then return.
944  */
945 static __rte_always_inline int
946 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
947 {
948         struct rte_ether_hdr *pkt_hdr;
949         struct vhost_dev *dst_vdev;
950         struct vhost_bufftable *vhost_txq;
951         uint16_t lcore_id = rte_lcore_id();
952         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
953
954         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
955         if (!dst_vdev)
956                 return -1;
957
958         if (vdev->vid == dst_vdev->vid) {
959                 RTE_LOG_DP(DEBUG, VHOST_DATA,
960                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
961                         vdev->vid);
962                 return 0;
963         }
964
965         RTE_LOG_DP(DEBUG, VHOST_DATA,
966                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
967
968         if (unlikely(dst_vdev->remove)) {
969                 RTE_LOG_DP(DEBUG, VHOST_DATA,
970                         "(%d) device is marked for removal\n", dst_vdev->vid);
971                 return 0;
972         }
973
974         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
975         vhost_txq->m_table[vhost_txq->len++] = m;
976
977         if (enable_stats) {
978                 vdev->stats.tx_total++;
979                 vdev->stats.tx++;
980         }
981
982         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
983                 drain_vhost(dst_vdev);
984                 vhost_txq->len = 0;
985                 vhost_txq->pre_tsc = rte_rdtsc();
986         }
987         return 0;
988 }
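/*
 * Packets accepted by virtio_tx_local() are only buffered in the per-lcore,
 * per-device vhost_txbuff table; they are actually enqueued either when the
 * buffer reaches MAX_PKT_BURST (handled above) or when drain_vhost_table()
 * sees the MBUF_TABLE_DRAIN_TSC timeout expire.
 */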
989
990 /*
991  * Check if the destination MAC of a packet belongs to a local VM,
992  * and if it does, get its vlan tag and length offset.
993  */
994 static __rte_always_inline int
995 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
996         uint32_t *offset, uint16_t *vlan_tag)
997 {
998         struct vhost_dev *dst_vdev;
999         struct rte_ether_hdr *pkt_hdr =
1000                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1001
1002         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
1003         if (!dst_vdev)
1004                 return 0;
1005
1006         if (vdev->vid == dst_vdev->vid) {
1007                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1008                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1009                         vdev->vid);
1010                 return -1;
1011         }
1012
1013         /*
1014          * HW VLAN strip reduces the packet length by the
1015          * length of the VLAN tag, so the packet length needs
1016          * to be restored by adding it back.
1017          */
1018         *offset  = VLAN_HLEN;
1019         *vlan_tag = vlan_tags[vdev->vid];
1020
1021         RTE_LOG_DP(DEBUG, VHOST_DATA,
1022                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1023                 vdev->vid, dst_vdev->vid, *vlan_tag);
1024
1025         return 0;
1026 }
1027
1028 static void virtio_tx_offload(struct rte_mbuf *m)
1029 {
1030         struct rte_net_hdr_lens hdr_lens;
1031         struct rte_ipv4_hdr *ipv4_hdr;
1032         struct rte_tcp_hdr *tcp_hdr;
1033         uint32_t ptype;
1034         void *l3_hdr;
1035
1036         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1037         m->l2_len = hdr_lens.l2_len;
1038         m->l3_len = hdr_lens.l3_len;
1039         m->l4_len = hdr_lens.l4_len;
1040
1041         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1042         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1043                 m->l2_len + m->l3_len);
1044
1045         m->ol_flags |= PKT_TX_TCP_SEG;
1046         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1047                 m->ol_flags |= PKT_TX_IPV4;
1048                 m->ol_flags |= PKT_TX_IP_CKSUM;
1049                 ipv4_hdr = l3_hdr;
1050                 ipv4_hdr->hdr_checksum = 0;
1051                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1052         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1053                 m->ol_flags |= PKT_TX_IPV6;
1054                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1055         }
1056 }
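/*
 * Note: with PKT_TX_TCP_SEG the PMD expects the TCP checksum field to be
 * pre-filled with the pseudo-header checksum (rte_ipv4_phdr_cksum() /
 * rte_ipv6_phdr_cksum() above); the hardware computes the final checksum
 * for each segment it generates.
 */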
1057
1058 static __rte_always_inline void
1059 do_drain_mbuf_table(struct mbuf_table *tx_q)
1060 {
1061         uint16_t count;
1062
1063         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1064                                  tx_q->m_table, tx_q->len);
1065         if (unlikely(count < tx_q->len))
1066                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1067
1068         tx_q->len = 0;
1069 }
1070
1071 /*
1072  * This function routes the TX packet to the correct interface. This
1073  * may be a local device or the physical port.
1074  */
1075 static __rte_always_inline void
1076 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1077 {
1078         struct mbuf_table *tx_q;
1079         unsigned offset = 0;
1080         const uint16_t lcore_id = rte_lcore_id();
1081         struct rte_ether_hdr *nh;
1082
1083
1084         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1085         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1086                 struct vhost_dev *vdev2;
1087
1088                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1089                         if (vdev2 != vdev)
1090                                 sync_virtio_xmit(vdev2, vdev, m);
1091                 }
1092                 goto queue2nic;
1093         }
1094
1095         /*check if destination is local VM*/
1096         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1097                 return;
1098
1099         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1100                 if (unlikely(find_local_dest(vdev, m, &offset,
1101                                              &vlan_tag) != 0)) {
1102                         rte_pktmbuf_free(m);
1103                         return;
1104                 }
1105         }
1106
1107         RTE_LOG_DP(DEBUG, VHOST_DATA,
1108                 "(%d) TX: MAC address is external\n", vdev->vid);
1109
1110 queue2nic:
1111
1112         /*Add packet to the port tx queue*/
1113         tx_q = &lcore_tx_queue[lcore_id];
1114
1115         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1116         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1117                 /* Guest has inserted the vlan tag. */
1118                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1119                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1120                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1121                         (vh->vlan_tci != vlan_tag_be))
1122                         vh->vlan_tci = vlan_tag_be;
1123         } else {
1124                 m->ol_flags |= PKT_TX_VLAN_PKT;
1125
1126                 /*
1127                  * Find the right seg to adjust the data len when offset is
1128                  * bigger than tail room size.
1129                  */
1130                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1131                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1132                                 m->data_len += offset;
1133                         else {
1134                                 struct rte_mbuf *seg = m;
1135
1136                                 while ((seg->next != NULL) &&
1137                                         (offset > rte_pktmbuf_tailroom(seg)))
1138                                         seg = seg->next;
1139
1140                                 seg->data_len += offset;
1141                         }
1142                         m->pkt_len += offset;
1143                 }
1144
1145                 m->vlan_tci = vlan_tag;
1146         }
1147
1148         if (m->ol_flags & PKT_RX_LRO)
1149                 virtio_tx_offload(m);
1150
1151         tx_q->m_table[tx_q->len++] = m;
1152         if (enable_stats) {
1153                 vdev->stats.tx_total++;
1154                 vdev->stats.tx++;
1155         }
1156
1157         if (unlikely(tx_q->len == MAX_PKT_BURST))
1158                 do_drain_mbuf_table(tx_q);
1159 }
1160
1161
1162 static __rte_always_inline void
1163 drain_mbuf_table(struct mbuf_table *tx_q)
1164 {
1165         static uint64_t prev_tsc;
1166         uint64_t cur_tsc;
1167
1168         if (tx_q->len == 0)
1169                 return;
1170
1171         cur_tsc = rte_rdtsc();
1172         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1173                 prev_tsc = cur_tsc;
1174
1175                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1176                         "TX queue drained after timeout with burst size %u\n",
1177                         tx_q->len);
1178                 do_drain_mbuf_table(tx_q);
1179         }
1180 }
1181
1182 static __rte_always_inline void
1183 drain_eth_rx(struct vhost_dev *vdev)
1184 {
1185         uint16_t rx_count, enqueue_count;
1186         struct rte_mbuf *pkts[MAX_PKT_BURST];
1187
1188         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1189                                     pkts, MAX_PKT_BURST);
1190
1191         if (!rx_count)
1192                 return;
1193
1194         /*
1195          * When "enable_retry" is set, wait and retry when there are
1196          * not enough free slots in the queue to hold @rx_count packets,
1197          * to reduce packet loss.
1198          */
1199         if (enable_retry &&
1200             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1201                         VIRTIO_RXQ))) {
1202                 uint32_t retry;
1203
1204                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1205                         rte_delay_us(burst_rx_delay_time);
1206                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1207                                         VIRTIO_RXQ))
1208                                 break;
1209                 }
1210         }
1211
1212         if (builtin_net_driver) {
1213                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1214                                                 pkts, rx_count);
1215         } else if (async_vhost_driver) {
1216                 uint32_t cpu_cpl_nr = 0;
1217                 uint16_t enqueue_fail = 0;
1218                 struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1219
1220                 complete_async_pkts(vdev);
1221                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1222                                         VIRTIO_RXQ, pkts, rx_count,
1223                                         m_cpu_cpl, &cpu_cpl_nr);
1224                 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count - cpu_cpl_nr,
1225                                         __ATOMIC_SEQ_CST);
1226
1227                 if (cpu_cpl_nr)
1228                         free_pkts(m_cpu_cpl, cpu_cpl_nr);
1229
1230                 enqueue_fail = rx_count - enqueue_count;
1231                 if (enqueue_fail)
1232                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1233
1234         } else {
1235                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1236                                                 pkts, rx_count);
1237         }
1238
1239         if (enable_stats) {
1240                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1241                                 __ATOMIC_SEQ_CST);
1242                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1243                                 __ATOMIC_SEQ_CST);
1244         }
1245
1246         if (!async_vhost_driver)
1247                 free_pkts(pkts, rx_count);
1248 }
1249
1250 static __rte_always_inline void
1251 drain_virtio_tx(struct vhost_dev *vdev)
1252 {
1253         struct rte_mbuf *pkts[MAX_PKT_BURST];
1254         uint16_t count;
1255         uint16_t i;
1256
1257         if (builtin_net_driver) {
1258                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1259                                         pkts, MAX_PKT_BURST);
1260         } else {
1261                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1262                                         mbuf_pool, pkts, MAX_PKT_BURST);
1263         }
1264
1265         /* setup VMDq for the first packet */
1266         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1267                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1268                         free_pkts(pkts, count);
1269         }
1270
1271         for (i = 0; i < count; ++i)
1272                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1273 }
1274
1275 /*
1276  * Main function of vhost-switch. It basically does:
1277  *
1278  * for each vhost device {
1279  *    - drain_eth_rx()
1280  *
1281  *      Which drains the host eth Rx queue linked to the vhost device,
1282  *      and delivers all of them to the guest virtio Rx ring associated with
1283  *      this vhost device.
1284  *
1285  *    - drain_virtio_tx()
1286  *
1287  *      Which drains the guest virtio Tx queue and delivers all of the
1288  *      packets to the target, which could be another vhost device or the
1289  *      physical eth dev. The routing is done in virtio_tx_route().
1290  * }
1291  */
1292 static int
1293 switch_worker(void *arg __rte_unused)
1294 {
1295         unsigned i;
1296         unsigned lcore_id = rte_lcore_id();
1297         struct vhost_dev *vdev;
1298         struct mbuf_table *tx_q;
1299
1300         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1301
1302         tx_q = &lcore_tx_queue[lcore_id];
1303         for (i = 0; i < rte_lcore_count(); i++) {
1304                 if (lcore_ids[i] == lcore_id) {
1305                         tx_q->txq_id = i;
1306                         break;
1307                 }
1308         }
1309
1310         while(1) {
1311                 drain_mbuf_table(tx_q);
1312                 drain_vhost_table();
1313                 /*
1314                  * If requested, inform the configuration core that we have
1315                  * exited the linked list and that no devices are in use.
1316                  */
1317                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1318                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1319
1320                 /*
1321                  * Process vhost devices
1322                  */
1323                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1324                               lcore_vdev_entry) {
1325                         if (unlikely(vdev->remove)) {
1326                                 unlink_vmdq(vdev);
1327                                 vdev->ready = DEVICE_SAFE_REMOVE;
1328                                 continue;
1329                         }
1330
1331                         if (likely(vdev->ready == DEVICE_RX))
1332                                 drain_eth_rx(vdev);
1333
1334                         if (likely(!vdev->remove))
1335                                 drain_virtio_tx(vdev);
1336                 }
1337         }
1338
1339         return 0;
1340 }
1341
1342 /*
1343  * Remove a device from the specific data core linked list and from the
1344  * main linked list. Synchonization  occurs through the use of the
1345  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1346  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1347  */
1348 static void
1349 destroy_device(int vid)
1350 {
1351         struct vhost_dev *vdev = NULL;
1352         int lcore;
1353         uint16_t i;
1354
1355         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1356                 if (vdev->vid == vid)
1357                         break;
1358         }
1359         if (!vdev)
1360                 return;
1361         /*set the remove flag. */
1362         vdev->remove = 1;
1363         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1364                 rte_pause();
1365         }
1366
1367         for (i = 0; i < RTE_MAX_LCORE; i++)
1368                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1369
1370         if (builtin_net_driver)
1371                 vs_vhost_net_remove(vdev);
1372
1373         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1374                      lcore_vdev_entry);
1375         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1376
1377
1378         /* Set the dev_removal_flag on each lcore. */
1379         RTE_LCORE_FOREACH_WORKER(lcore)
1380                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1381
1382         /*
1383          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1384          * we can be sure that they can no longer access the device removed
1385          * from the linked lists and that the devices are no longer in use.
1386          */
1387         RTE_LCORE_FOREACH_WORKER(lcore) {
1388                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1389                         rte_pause();
1390         }
1391
1392         lcore_info[vdev->coreid].device_num--;
1393
1394         RTE_LOG(INFO, VHOST_DATA,
1395                 "(%d) device has been removed from data core\n",
1396                 vdev->vid);
1397
1398         if (async_vhost_driver) {
1399                 uint16_t n_pkt = 0;
1400                 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1401
1402                 while (vdev->pkts_inflight) {
1403                         n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1404                                                 m_cpl, vdev->pkts_inflight);
1405                         free_pkts(m_cpl, n_pkt);
1406                         __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1407                 }
1408
1409                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1410         }
1411
1412         rte_free(vdev);
1413 }
1414
1415 /*
1416  * A new device is added to a data core. First the device is added to the main linked list
1417  * and then allocated to a specific data core.
1418  */
1419 static int
1420 new_device(int vid)
1421 {
1422         int lcore, core_add = 0;
1423         uint16_t i;
1424         uint32_t device_num_min = num_devices;
1425         struct vhost_dev *vdev;
1426         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1427         if (vdev == NULL) {
1428                 RTE_LOG(INFO, VHOST_DATA,
1429                         "(%d) couldn't allocate memory for vhost dev\n",
1430                         vid);
1431                 return -1;
1432         }
1433         vdev->vid = vid;
1434
1435         for (i = 0; i < RTE_MAX_LCORE; i++) {
1436                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1437                         = rte_zmalloc("vhost bufftable",
1438                                 sizeof(struct vhost_bufftable),
1439                                 RTE_CACHE_LINE_SIZE);
1440
1441                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1442                         RTE_LOG(INFO, VHOST_DATA,
1443                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1444                         return -1;
1445                 }
1446         }
1447
1448         if (builtin_net_driver)
1449                 vs_vhost_net_setup(vdev);
1450
1451         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1452         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1453
1454         /*reset ready flag*/
1455         vdev->ready = DEVICE_MAC_LEARNING;
1456         vdev->remove = 0;
1457
1458         /* Find a suitable lcore to add the device. */
1459         RTE_LCORE_FOREACH_WORKER(lcore) {
1460                 if (lcore_info[lcore].device_num < device_num_min) {
1461                         device_num_min = lcore_info[lcore].device_num;
1462                         core_add = lcore;
1463                 }
1464         }
1465         vdev->coreid = core_add;
1466
1467         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1468                           lcore_vdev_entry);
1469         lcore_info[vdev->coreid].device_num++;
1470
1471         /* Disable notifications. */
1472         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1473         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1474
1475         RTE_LOG(INFO, VHOST_DATA,
1476                 "(%d) device has been added to data core %d\n",
1477                 vid, vdev->coreid);
1478
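        /*
         * For the async data path, bind the device's RX queue to the IOAT
         * callbacks so enqueue copies can be offloaded to the DMA engine.
         * Only the "ioat" dma_type is handled here.
         */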
1479         if (async_vhost_driver) {
1480                 struct rte_vhost_async_config config = {0};
1481                 struct rte_vhost_async_channel_ops channel_ops;
1482
1483                 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1484                         channel_ops.transfer_data = ioat_transfer_data_cb;
1485                         channel_ops.check_completed_copies =
1486                                 ioat_check_completed_copies_cb;
1487
1488                         config.features = RTE_VHOST_ASYNC_INORDER;
1489                         config.async_threshold = 256;
1490
1491                         return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1492                                 config, &channel_ops);
1493                 }
1494         }
1495
1496         return 0;
1497 }
1498
1499 static int
1500 vring_state_changed(int vid, uint16_t queue_id, int enable)
1501 {
1502         struct vhost_dev *vdev = NULL;
1503
1504         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1505                 if (vdev->vid == vid)
1506                         break;
1507         }
1508         if (!vdev)
1509                 return -1;
1510
1511         if (queue_id != VIRTIO_RXQ)
1512                 return 0;
1513
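        /*
         * When the RX vring is disabled, drain any in-flight async copies
         * for it so no completions are left pending when the ring is
         * re-enabled.
         */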
1514         if (async_vhost_driver) {
1515                 if (!enable) {
1516                         uint16_t n_pkt = 0;
1517                         struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1518
1519                         while (vdev->pkts_inflight) {
1520                                 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1521                                                         m_cpl, vdev->pkts_inflight);
1522                                 free_pkts(m_cpl, n_pkt);
1523                                 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1524                         }
1525                 }
1526         }
1527
1528         return 0;
1529 }
1530
1531 /*
1532  * These callbacks allow devices to be added to the data core when configuration
1533  * has been fully completed.
1534  */
1535 static const struct vhost_device_ops virtio_net_device_ops =
1536 {
1537         .new_device =  new_device,
1538         .destroy_device = destroy_device,
1539         .vring_state_changed = vring_state_changed,
1540 };
1541
1542 /*
1543  * This thread wakes up periodically to print stats if the user has
1544  * enabled them.
1545  */
1546 static void *
1547 print_stats(__rte_unused void *arg)
1548 {
1549         struct vhost_dev *vdev;
1550         uint64_t tx_dropped, rx_dropped;
1551         uint64_t tx, tx_total, rx, rx_total;
1552         const char clr[] = { 27, '[', '2', 'J', '\0' };
1553         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1554
1555         while(1) {
1556                 sleep(enable_stats);
1557
1558                 /* Clear screen and move to top left */
1559                 printf("%s%s\n", clr, top_left);
1560                 printf("Device statistics =================================\n");
1561
1562                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1563                         tx_total   = vdev->stats.tx_total;
1564                         tx         = vdev->stats.tx;
1565                         tx_dropped = tx_total - tx;
1566
1567                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1568                                 __ATOMIC_SEQ_CST);
1569                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1570                                 __ATOMIC_SEQ_CST);
1571                         rx_dropped = rx_total - rx;
1572
1573                         printf("Statistics for device %d\n"
1574                                 "-----------------------\n"
1575                                 "TX total:              %" PRIu64 "\n"
1576                                 "TX dropped:            %" PRIu64 "\n"
1577                                 "TX successful:         %" PRIu64 "\n"
1578                                 "RX total:              %" PRIu64 "\n"
1579                                 "RX dropped:            %" PRIu64 "\n"
1580                                 "RX successful:         %" PRIu64 "\n",
1581                                 vdev->vid,
1582                                 tx_total, tx_dropped, tx,
1583                                 rx_total, rx_dropped, rx);
1584                 }
1585
1586                 printf("===================================================\n");
1587
1588                 fflush(stdout);
1589         }
1590
1591         return NULL;
1592 }
1593
1594 static void
1595 unregister_drivers(int socket_num)
1596 {
1597         int i, ret;
1598
1599         for (i = 0; i < socket_num; i++) {
1600                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1601                 if (ret != 0)
1602                         RTE_LOG(ERR, VHOST_CONFIG,
1603                                 "Failed to unregister vhost driver for %s.\n",
1604                                 socket_files + i * PATH_MAX);
1605         }
1606 }
1607
1608 /* When we receive an INT signal, unregister the vhost driver */
1609 static void
1610 sigint_handler(__rte_unused int signum)
1611 {
1612         /* Unregister vhost driver. */
1613         unregister_drivers(nb_sockets);
1614
1615         exit(0);
1616 }
1617
1618 /*
1619  * While creating an mbuf pool, one key thing is to figure out how
1620  * many mbuf entries are enough for our use. FYI, here are some
1621  * guidelines:
1622  *
1623  * - Each rx queue reserves @nr_rx_desc mbufs at the queue setup stage
1624  *
1625  * - For each switch core (a CPU core that does the packet switching), we
1626  *   also need to reserve some mbufs for receiving the packets from the
1627  *   virtio Tx queue. How many are enough depends on the usage. It's
1628  *   normally a simple calculation like the following:
1629  *
1630  *       MAX_PKT_BURST * max packet size / mbuf size
1631  *
1632  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1633  *
1634  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1635  *   mbufs for receiving the packets from the physical NIC device.
1636  *
1637  * - We also need to make sure that, for each switch core, we have
1638  *   allocated enough mbufs to fill up the mbuf cache.
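 *
 *   As a rough, illustrative calculation (assuming the default
 *   MBUF_DATA_SIZE of 2176 bytes, 128 bytes of headroom and a
 *   MAX_PKT_BURST of 32): with mergeable buffers and TSO off
 *   (mtu = 1500), each switch core needs about
 *   (1500 + 2176) * 32 / (2176 - 128) ~= 57 mbufs for draining the
 *   virtio Tx queues, plus @nr_rx_desc (1024 by default) for the
 *   physical NIC queue it polls.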
1639  */
1640 static void
1641 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1642         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1643 {
1644         uint32_t nr_mbufs;
1645         uint32_t nr_mbufs_per_core;
1646         uint32_t mtu = 1500;
1647
1648         if (mergeable)
1649                 mtu = 9000;
1650         if (enable_tso)
1651                 mtu = 64 * 1024;
1652
1653         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1654                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1655         nr_mbufs_per_core += nr_rx_desc;
1656         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1657
1658         nr_mbufs  = nr_queues * nr_rx_desc;
1659         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1660         nr_mbufs *= nr_port;
1661
1662         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1663                                             nr_mbuf_cache, 0, mbuf_size,
1664                                             rte_socket_id());
1665         if (mbuf_pool == NULL)
1666                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1667 }
1668
1669 /*
1670  * Main function, does initialisation and calls the per-lcore functions.
1671  */
1672 int
1673 main(int argc, char *argv[])
1674 {
1675         unsigned lcore_id, core_id = 0;
1676         unsigned nb_ports, valid_num_ports;
1677         int ret, i;
1678         uint16_t portid;
1679         static pthread_t tid;
1680         uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1681
1682         signal(SIGINT, sigint_handler);
1683
1684         /* init EAL */
1685         ret = rte_eal_init(argc, argv);
1686         if (ret < 0)
1687                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1688         argc -= ret;
1689         argv += ret;
1690
1691         /* parse app arguments */
1692         ret = us_vhost_parse_args(argc, argv);
1693         if (ret < 0)
1694                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1695
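        /*
         * Initialize the per-lcore device lists and record which worker
         * lcores are enabled.
         */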
1696         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1697                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1698
1699                 if (rte_lcore_is_enabled(lcore_id))
1700                         lcore_ids[core_id++] = lcore_id;
1701         }
1702
1703         if (rte_lcore_count() > RTE_MAX_LCORE)
1704                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1705
1706         /* Get the number of physical ports. */
1707         nb_ports = rte_eth_dev_count_avail();
1708
1709         /*
1710          * Update the global var NUM_PORTS and global array PORTS
1711          * and get the value of VALID_NUM_PORTS according to the number of system ports
1712          */
1713         valid_num_ports = check_ports_num(nb_ports);
1714
1715         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1716                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1717                         " but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1718                 return -1;
1719         }
1720
1721         /*
1722          * FIXME: here we are trying to allocate mbufs big enough for
1723          * @MAX_QUEUES, but the truth is we're never going to use that
1724          * many queues here. We should probably only allocate for the
1725          * queues we are actually going to use.
1726          */
1727         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1728                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1729
1730         if (vm2vm_mode == VM2VM_HARDWARE) {
1731                 /* Enable VT loop back to let L2 switch to do it. */
1732                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1733                 RTE_LOG(DEBUG, VHOST_CONFIG,
1734                         "Enable loop back for L2 switch in vmdq.\n");
1735         }
1736
1737         /* initialize all ports */
1738         RTE_ETH_FOREACH_DEV(portid) {
1739                 /* skip ports that are not enabled */
1740                 if ((enabled_port_mask & (1 << portid)) == 0) {
1741                         RTE_LOG(INFO, VHOST_PORT,
1742                                 "Skipping disabled port %d\n", portid);
1743                         continue;
1744                 }
1745                 if (port_init(portid) != 0)
1746                         rte_exit(EXIT_FAILURE,
1747                                 "Cannot initialize network ports\n");
1748         }
1749
1750         /* Enable stats if the user option is set. */
1751         if (enable_stats) {
1752                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1753                                         print_stats, NULL);
1754                 if (ret < 0)
1755                         rte_exit(EXIT_FAILURE,
1756                                 "Cannot create print-stats thread\n");
1757         }
1758
1759         /* Launch all data cores. */
1760         RTE_LCORE_FOREACH_WORKER(lcore_id)
1761                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1762
1763         if (client_mode)
1764                 flags |= RTE_VHOST_USER_CLIENT;
1765
1766         /* Register vhost user driver to handle vhost messages. */
1767         for (i = 0; i < nb_sockets; i++) {
1768                 char *file = socket_files + i * PATH_MAX;
1769
1770                 if (async_vhost_driver)
1771                         flags |= RTE_VHOST_USER_ASYNC_COPY;
1772
1773                 ret = rte_vhost_driver_register(file, flags);
1774                 if (ret != 0) {
1775                         unregister_drivers(i);
1776                         rte_exit(EXIT_FAILURE,
1777                                 "vhost driver register failure.\n");
1778                 }
1779
1780                 if (builtin_net_driver)
1781                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1782
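                /*
                 * Strip the features the user disabled on the command line
                 * before the vhost driver starts negotiating with the guest.
                 */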
1783                 if (mergeable == 0) {
1784                         rte_vhost_driver_disable_features(file,
1785                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1786                 }
1787
1788                 if (enable_tx_csum == 0) {
1789                         rte_vhost_driver_disable_features(file,
1790                                 1ULL << VIRTIO_NET_F_CSUM);
1791                 }
1792
1793                 if (enable_tso == 0) {
1794                         rte_vhost_driver_disable_features(file,
1795                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1796                         rte_vhost_driver_disable_features(file,
1797                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1798                         rte_vhost_driver_disable_features(file,
1799                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1800                         rte_vhost_driver_disable_features(file,
1801                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1802                 }
1803
1804                 if (promiscuous) {
1805                         rte_vhost_driver_enable_features(file,
1806                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1807                 }
1808
1809                 ret = rte_vhost_driver_callback_register(file,
1810                         &virtio_net_device_ops);
1811                 if (ret != 0) {
1812                         rte_exit(EXIT_FAILURE,
1813                                 "failed to register vhost driver callbacks.\n");
1814                 }
1815
1816                 if (rte_vhost_driver_start(file) < 0) {
1817                         rte_exit(EXIT_FAILURE,
1818                                 "failed to start vhost driver.\n");
1819                 }
1820         }
1821
1822         RTE_LCORE_FOREACH_WORKER(lcore_id)
1823                 rte_eal_wait_lcore(lcore_id);
1824
1825         /* clean up the EAL */
1826         rte_eal_cleanup();
1827
1828         return 0;
1829 }