net: add macro for VLAN header length
examples/vhost/main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "ioat.h"
29 #include "main.h"
30
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37
38 #define MBUF_CACHE_SIZE 128
39 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
40
41 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
42
43 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
45
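/* Maximum jumbo frame size (0x2600 = 9728 bytes) and the MTU derived from it. */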
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
48
49 /* State of virtio device. */
50 #define DEVICE_MAC_LEARNING 0
51 #define DEVICE_RX                       1
52 #define DEVICE_SAFE_REMOVE      2
53
54 /* Configurable number of RX/TX ring descriptors */
55 #define RTE_TEST_RX_DESC_DEFAULT 1024
56 #define RTE_TEST_TX_DESC_DEFAULT 512
57
58 #define INVALID_PORT_ID 0xFF
59
60 /* mask of enabled ports */
61 static uint32_t enabled_port_mask = 0;
62
63 /* Promiscuous mode */
64 static uint32_t promiscuous;
65
66 /* number of devices/queues to support */
67 static uint32_t num_queues = 0;
68 static uint32_t num_devices;
69
70 static struct rte_mempool *mbuf_pool;
71 static int mergeable;
72
73 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
74 typedef enum {
75         VM2VM_DISABLED = 0,
76         VM2VM_SOFTWARE = 1,
77         VM2VM_HARDWARE = 2,
78         VM2VM_LAST
79 } vm2vm_type;
80 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
81
82 /* Enable stats. */
83 static uint32_t enable_stats = 0;
84 /* Enable retries on RX. */
85 static uint32_t enable_retry = 1;
86
87 /* Disable TX checksum offload */
88 static uint32_t enable_tx_csum;
89
90 /* Disable TSO offload */
91 static uint32_t enable_tso;
92
93 static int client_mode;
94
95 static int builtin_net_driver;
96
97 static int async_vhost_driver;
98
99 static char *dma_type;
100
101 /* Specify timeout (in microseconds) between retries on RX. */
102 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
103 /* Specify the number of retries on RX. */
104 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
105
106 /* Socket file paths. Can be set by user */
107 static char *socket_files;
108 static int nb_sockets;
109
110 /* Empty VMDQ configuration structure. Filled in programmatically. */
111 static struct rte_eth_conf vmdq_conf_default = {
112         .rxmode = {
113                 .mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
114                 .split_hdr_size = 0,
115                 /*
116                  * VLAN strip is necessary for 1G NICs such as the I350;
117                  * it fixes a bug where IPv4 forwarding in the guest could not
118                  * forward packets from one virtio device to another.
119                  */
120                 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
121         },
122
123         .txmode = {
124                 .mq_mode = RTE_ETH_MQ_TX_NONE,
125                 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
126                              RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
127                              RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
128                              RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
129                              RTE_ETH_TX_OFFLOAD_TCP_TSO),
130         },
131         .rx_adv_conf = {
132                 /*
133                  * should be overridden separately in code with
134                  * appropriate values
135                  */
136                 .vmdq_rx_conf = {
137                         .nb_queue_pools = RTE_ETH_8_POOLS,
138                         .enable_default_pool = 0,
139                         .default_pool = 0,
140                         .nb_pool_maps = 0,
141                         .pool_map = {{0, 0},},
142                 },
143         },
144 };
145
146
147 static unsigned lcore_ids[RTE_MAX_LCORE];
148 static uint16_t ports[RTE_MAX_ETHPORTS];
149 static unsigned num_ports = 0; /**< The number of ports specified in command line */
150 static uint16_t num_pf_queues, num_vmdq_queues;
151 static uint16_t vmdq_pool_base, vmdq_queue_base;
152 static uint16_t queues_per_pool;
153
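/*
 * VLAN tags used for the VMDQ pool map: pool i is mapped to vlan_tags[i],
 * and each vhost device is assigned vlan_tags[vid].
 */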
154 const uint16_t vlan_tags[] = {
155         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
156         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
157         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
158         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
159         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
160         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
161         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
162         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
163 };
164
165 /* ethernet addresses of ports */
166 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
167
168 static struct vhost_dev_tailq_list vhost_dev_list =
169         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
170
171 static struct lcore_info lcore_info[RTE_MAX_LCORE];
172
173 /* Used for queueing bursts of TX packets. */
174 struct mbuf_table {
175         unsigned len;
176         unsigned txq_id;
177         struct rte_mbuf *m_table[MAX_PKT_BURST];
178 };
179
180 struct vhost_bufftable {
181         uint32_t len;
182         uint64_t pre_tsc;
183         struct rte_mbuf *m_table[MAX_PKT_BURST];
184 };
185
186 /* TX queue for each data core. */
187 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
188
189 /*
190  * Vhost TX buffer for each data core.
191  * Every data core maintains a TX buffer for every vhost device,
192  * which is used for batch pkts enqueue for higher performance.
193  */
194 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
195
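/* TX drain period (BURST_TX_DRAIN_US) converted to TSC cycles, rounded up. */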
196 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
197                                  / US_PER_S * BURST_TX_DRAIN_US)
198
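/*
 * Open the DMA channels described by @value. Only the "ioat" dma-type is
 * handled here; any other type is rejected.
 */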
199 static inline int
200 open_dma(const char *value)
201 {
202         if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
203                 return open_ioat(value);
204
205         return -1;
206 }
207
208 /*
209  * Builds up the correct configuration for VMDQ VLAN pool map
210  * according to the pool & queue limits.
211  */
212 static inline int
213 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
214 {
215         struct rte_eth_vmdq_rx_conf conf;
216         struct rte_eth_vmdq_rx_conf *def_conf =
217                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
218         unsigned i;
219
220         memset(&conf, 0, sizeof(conf));
221         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
222         conf.nb_pool_maps = num_devices;
223         conf.enable_loop_back = def_conf->enable_loop_back;
224         conf.rx_mode = def_conf->rx_mode;
225
226         for (i = 0; i < conf.nb_pool_maps; i++) {
227                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
228                 conf.pool_map[i].pools = (1UL << i);
229         }
230
231         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
232         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
233                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
234         return 0;
235 }
236
237 /*
238  * Initialises a given port using global settings and with the rx buffers
239  * coming from the mbuf_pool passed as parameter
240  */
241 static inline int
242 port_init(uint16_t port)
243 {
244         struct rte_eth_dev_info dev_info;
245         struct rte_eth_conf port_conf;
246         struct rte_eth_rxconf *rxconf;
247         struct rte_eth_txconf *txconf;
248         int16_t rx_rings, tx_rings;
249         uint16_t rx_ring_size, tx_ring_size;
250         int retval;
251         uint16_t q;
252
253         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
254         retval = rte_eth_dev_info_get(port, &dev_info);
255         if (retval != 0) {
256                 RTE_LOG(ERR, VHOST_PORT,
257                         "Error during getting device (port %u) info: %s\n",
258                         port, strerror(-retval));
259
260                 return retval;
261         }
262
263         rxconf = &dev_info.default_rxconf;
264         txconf = &dev_info.default_txconf;
265         rxconf->rx_drop_en = 1;
266
267         /* Configure the number of supported virtio devices based on VMDQ limits. */
268         num_devices = dev_info.max_vmdq_pools;
269
270         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
271         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
272
273         tx_rings = (uint16_t)rte_lcore_count();
274
275         if (mergeable) {
276                 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
277                         vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
278                 else
279                         vmdq_conf_default.rxmode.mtu = MAX_MTU;
280         }
281
282         /* Get port configuration. */
283         retval = get_eth_conf(&port_conf, num_devices);
284         if (retval < 0)
285                 return retval;
286         /* NIC queues are divided into pf queues and vmdq queues.  */
287         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
288         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
289         num_vmdq_queues = num_devices * queues_per_pool;
290         num_queues = num_pf_queues + num_vmdq_queues;
291         vmdq_queue_base = dev_info.vmdq_queue_base;
292         vmdq_pool_base  = dev_info.vmdq_pool_base;
293         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
294                 num_pf_queues, num_devices, queues_per_pool);
295
296         if (!rte_eth_dev_is_valid_port(port))
297                 return -1;
298
299         rx_rings = (uint16_t)dev_info.max_rx_queues;
300         if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
301                 port_conf.txmode.offloads |=
302                         RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
303         /* Configure ethernet device. */
304         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
305         if (retval != 0) {
306                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
307                         port, strerror(-retval));
308                 return retval;
309         }
310
311         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
312                 &tx_ring_size);
313         if (retval != 0) {
314                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
315                         "for port %u: %s.\n", port, strerror(-retval));
316                 return retval;
317         }
318         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
319                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
320                         "for Rx queues on port %u.\n", port);
321                 return -1;
322         }
323
324         /* Setup the queues. */
325         rxconf->offloads = port_conf.rxmode.offloads;
326         for (q = 0; q < rx_rings; q ++) {
327                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
328                                                 rte_eth_dev_socket_id(port),
329                                                 rxconf,
330                                                 mbuf_pool);
331                 if (retval < 0) {
332                         RTE_LOG(ERR, VHOST_PORT,
333                                 "Failed to setup rx queue %u of port %u: %s.\n",
334                                 q, port, strerror(-retval));
335                         return retval;
336                 }
337         }
338         txconf->offloads = port_conf.txmode.offloads;
339         for (q = 0; q < tx_rings; q ++) {
340                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
341                                                 rte_eth_dev_socket_id(port),
342                                                 txconf);
343                 if (retval < 0) {
344                         RTE_LOG(ERR, VHOST_PORT,
345                                 "Failed to setup tx queue %u of port %u: %s.\n",
346                                 q, port, strerror(-retval));
347                         return retval;
348                 }
349         }
350
351         /* Start the device. */
352         retval  = rte_eth_dev_start(port);
353         if (retval < 0) {
354                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
355                         port, strerror(-retval));
356                 return retval;
357         }
358
359         if (promiscuous) {
360                 retval = rte_eth_promiscuous_enable(port);
361                 if (retval != 0) {
362                         RTE_LOG(ERR, VHOST_PORT,
363                                 "Failed to enable promiscuous mode on port %u: %s\n",
364                                 port, rte_strerror(-retval));
365                         return retval;
366                 }
367         }
368
369         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
370         if (retval < 0) {
371                 RTE_LOG(ERR, VHOST_PORT,
372                         "Failed to get MAC address on port %u: %s\n",
373                         port, rte_strerror(-retval));
374                 return retval;
375         }
376
377         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
378         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
379                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
380                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
381
382         return 0;
383 }
384
385 /*
386  * Set socket file path.
387  */
388 static int
389 us_vhost_parse_socket_path(const char *q_arg)
390 {
391         char *old;
392
393         /* parse number string */
394         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
395                 return -1;
396
397         old = socket_files;
398         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
399         if (socket_files == NULL) {
400                 free(old);
401                 return -1;
402         }
403
404         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
405         nb_sockets++;
406
407         return 0;
408 }
409
410 /*
411  * Parse the portmask provided at run time.
412  */
413 static int
414 parse_portmask(const char *portmask)
415 {
416         char *end = NULL;
417         unsigned long pm;
418
419         errno = 0;
420
421         /* parse hexadecimal string */
422         pm = strtoul(portmask, &end, 16);
423         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
424                 return 0;
425
426         return pm;
427
428 }
429
430 /*
431  * Parse num options at run time.
432  */
433 static int
434 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
435 {
436         char *end = NULL;
437         unsigned long num;
438
439         errno = 0;
440
441         /* parse unsigned int string */
442         num = strtoul(q_arg, &end, 10);
443         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
444                 return -1;
445
446         if (num > max_valid_value)
447                 return -1;
448
449         return num;
450
451 }
452
453 /*
454  * Display usage
455  */
456 static void
457 us_vhost_usage(const char *prgname)
458 {
459         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
460         "               --vm2vm [0|1|2]\n"
461         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
462         "               --socket-file <path>\n"
463         "               --nb-devices ND\n"
464         "               -p PORTMASK: Set mask for ports to be used by application\n"
465         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
466         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
467         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on rx are enabled\n"
468         "               --rx-retry-num [0-N]: the number of retries on rx. Takes effect only if retries on rx are enabled\n"
469         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
470         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
471         "               --socket-file: The path of the socket file.\n"
472         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
473         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
474         "               --client register a vhost-user socket as client mode.\n"
475         "               --dma-type register the DMA type for your vhost async driver. Only \"ioat\" is supported for now.\n"
476         "               --dmas register DMA channels for specific vhost devices.\n",
477                prgname);
478 }
479
480 enum {
481 #define OPT_VM2VM               "vm2vm"
482         OPT_VM2VM_NUM = 256,
483 #define OPT_RX_RETRY            "rx-retry"
484         OPT_RX_RETRY_NUM,
485 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
486         OPT_RX_RETRY_DELAY_NUM,
487 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
488         OPT_RX_RETRY_NUMB_NUM,
489 #define OPT_MERGEABLE           "mergeable"
490         OPT_MERGEABLE_NUM,
491 #define OPT_STATS               "stats"
492         OPT_STATS_NUM,
493 #define OPT_SOCKET_FILE         "socket-file"
494         OPT_SOCKET_FILE_NUM,
495 #define OPT_TX_CSUM             "tx-csum"
496         OPT_TX_CSUM_NUM,
497 #define OPT_TSO                 "tso"
498         OPT_TSO_NUM,
499 #define OPT_CLIENT              "client"
500         OPT_CLIENT_NUM,
501 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
502         OPT_BUILTIN_NET_DRIVER_NUM,
503 #define OPT_DMA_TYPE            "dma-type"
504         OPT_DMA_TYPE_NUM,
505 #define OPT_DMAS                "dmas"
506         OPT_DMAS_NUM,
507 };
508
509 /*
510  * Parse the arguments given in the command line of the application.
511  */
512 static int
513 us_vhost_parse_args(int argc, char **argv)
514 {
515         int opt, ret;
516         int option_index;
517         unsigned i;
518         const char *prgname = argv[0];
519         static struct option long_option[] = {
520                 {OPT_VM2VM, required_argument,
521                                 NULL, OPT_VM2VM_NUM},
522                 {OPT_RX_RETRY, required_argument,
523                                 NULL, OPT_RX_RETRY_NUM},
524                 {OPT_RX_RETRY_DELAY, required_argument,
525                                 NULL, OPT_RX_RETRY_DELAY_NUM},
526                 {OPT_RX_RETRY_NUMB, required_argument,
527                                 NULL, OPT_RX_RETRY_NUMB_NUM},
528                 {OPT_MERGEABLE, required_argument,
529                                 NULL, OPT_MERGEABLE_NUM},
530                 {OPT_STATS, required_argument,
531                                 NULL, OPT_STATS_NUM},
532                 {OPT_SOCKET_FILE, required_argument,
533                                 NULL, OPT_SOCKET_FILE_NUM},
534                 {OPT_TX_CSUM, required_argument,
535                                 NULL, OPT_TX_CSUM_NUM},
536                 {OPT_TSO, required_argument,
537                                 NULL, OPT_TSO_NUM},
538                 {OPT_CLIENT, no_argument,
539                                 NULL, OPT_CLIENT_NUM},
540                 {OPT_BUILTIN_NET_DRIVER, no_argument,
541                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
542                 {OPT_DMA_TYPE, required_argument,
543                                 NULL, OPT_DMA_TYPE_NUM},
544                 {OPT_DMAS, required_argument,
545                                 NULL, OPT_DMAS_NUM},
546                 {NULL, 0, 0, 0},
547         };
548
549         /* Parse command line */
550         while ((opt = getopt_long(argc, argv, "p:P",
551                         long_option, &option_index)) != EOF) {
552                 switch (opt) {
553                 /* Portmask */
554                 case 'p':
555                         enabled_port_mask = parse_portmask(optarg);
556                         if (enabled_port_mask == 0) {
557                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
558                                 us_vhost_usage(prgname);
559                                 return -1;
560                         }
561                         break;
562
563                 case 'P':
564                         promiscuous = 1;
565                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
566                                 RTE_ETH_VMDQ_ACCEPT_BROADCAST |
567                                 RTE_ETH_VMDQ_ACCEPT_MULTICAST;
568                         break;
569
570                 case OPT_VM2VM_NUM:
571                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
572                         if (ret == -1) {
573                                 RTE_LOG(INFO, VHOST_CONFIG,
574                                         "Invalid argument for "
575                                         "vm2vm [0|1|2]\n");
576                                 us_vhost_usage(prgname);
577                                 return -1;
578                         }
579                         vm2vm_mode = (vm2vm_type)ret;
580                         break;
581
582                 case OPT_RX_RETRY_NUM:
583                         ret = parse_num_opt(optarg, 1);
584                         if (ret == -1) {
585                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
586                                 us_vhost_usage(prgname);
587                                 return -1;
588                         }
589                         enable_retry = ret;
590                         break;
591
592                 case OPT_TX_CSUM_NUM:
593                         ret = parse_num_opt(optarg, 1);
594                         if (ret == -1) {
595                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
596                                 us_vhost_usage(prgname);
597                                 return -1;
598                         }
599                         enable_tx_csum = ret;
600                         break;
601
602                 case OPT_TSO_NUM:
603                         ret = parse_num_opt(optarg, 1);
604                         if (ret == -1) {
605                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
606                                 us_vhost_usage(prgname);
607                                 return -1;
608                         }
609                         enable_tso = ret;
610                         break;
611
612                 case OPT_RX_RETRY_DELAY_NUM:
613                         ret = parse_num_opt(optarg, INT32_MAX);
614                         if (ret == -1) {
615                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
616                                 us_vhost_usage(prgname);
617                                 return -1;
618                         }
619                         burst_rx_delay_time = ret;
620                         break;
621
622                 case OPT_RX_RETRY_NUMB_NUM:
623                         ret = parse_num_opt(optarg, INT32_MAX);
624                         if (ret == -1) {
625                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
626                                 us_vhost_usage(prgname);
627                                 return -1;
628                         }
629                         burst_rx_retry_num = ret;
630                         break;
631
632                 case OPT_MERGEABLE_NUM:
633                         ret = parse_num_opt(optarg, 1);
634                         if (ret == -1) {
635                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
636                                 us_vhost_usage(prgname);
637                                 return -1;
638                         }
639                         mergeable = !!ret;
640                         break;
641
642                 case OPT_STATS_NUM:
643                         ret = parse_num_opt(optarg, INT32_MAX);
644                         if (ret == -1) {
645                                 RTE_LOG(INFO, VHOST_CONFIG,
646                                         "Invalid argument for stats [0..N]\n");
647                                 us_vhost_usage(prgname);
648                                 return -1;
649                         }
650                         enable_stats = ret;
651                         break;
652
653                 /* Set socket file path. */
654                 case OPT_SOCKET_FILE_NUM:
655                         if (us_vhost_parse_socket_path(optarg) == -1) {
656                                 RTE_LOG(INFO, VHOST_CONFIG,
657                                 "Invalid argument for socket name (Max %d characters)\n",
658                                 PATH_MAX);
659                                 us_vhost_usage(prgname);
660                                 return -1;
661                         }
662                         break;
663
664                 case OPT_DMA_TYPE_NUM:
665                         dma_type = optarg;
666                         break;
667
668                 case OPT_DMAS_NUM:
669                         if (open_dma(optarg) == -1) {
670                                 RTE_LOG(INFO, VHOST_CONFIG,
671                                         "Wrong DMA args\n");
672                                 us_vhost_usage(prgname);
673                                 return -1;
674                         }
675                         async_vhost_driver = 1;
676                         break;
677
678                 case OPT_CLIENT_NUM:
679                         client_mode = 1;
680                         break;
681
682                 case OPT_BUILTIN_NET_DRIVER_NUM:
683                         builtin_net_driver = 1;
684                         break;
685
686                 /* Invalid option - print options. */
687                 default:
688                         us_vhost_usage(prgname);
689                         return -1;
690                 }
691         }
692
693         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
694                 if (enabled_port_mask & (1 << i))
695                         ports[num_ports++] = i;
696         }
697
698         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
699                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
700                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
701                 return -1;
702         }
703
704         return 0;
705 }
706
707 /*
708  * Update the global variable num_ports and the ports[] array according to the
709  * number of ports in the system, and return the number of valid ports.
710  */
711 static unsigned check_ports_num(unsigned nb_ports)
712 {
713         unsigned valid_num_ports = num_ports;
714         unsigned portid;
715
716         if (num_ports > nb_ports) {
717                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
718                         num_ports, nb_ports);
719                 num_ports = nb_ports;
720         }
721
722         for (portid = 0; portid < num_ports; portid ++) {
723                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
724                         RTE_LOG(INFO, VHOST_PORT,
725                                 "\nSpecified port ID(%u) is not valid\n",
726                                 ports[portid]);
727                         ports[portid] = INVALID_PORT_ID;
728                         valid_num_ports--;
729                 }
730         }
731         return valid_num_ports;
732 }
733
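/* Find the vhost device that owns @mac and is ready for RX, or NULL. */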
734 static __rte_always_inline struct vhost_dev *
735 find_vhost_dev(struct rte_ether_addr *mac)
736 {
737         struct vhost_dev *vdev;
738
739         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
740                 if (vdev->ready == DEVICE_RX &&
741                     rte_is_same_ether_addr(mac, &vdev->mac_address))
742                         return vdev;
743         }
744
745         return NULL;
746 }
747
748 /*
749  * This function learns the MAC address of the device and registers it, along
750  * with a VLAN tag, with the VMDQ.
751  */
752 static int
753 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
754 {
755         struct rte_ether_hdr *pkt_hdr;
756         int i, ret;
757
758         /* Learn MAC address of guest device from packet */
759         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
760
761         if (find_vhost_dev(&pkt_hdr->src_addr)) {
762                 RTE_LOG(ERR, VHOST_DATA,
763                         "(%d) device is using a registered MAC!\n",
764                         vdev->vid);
765                 return -1;
766         }
767
768         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
769                 vdev->mac_address.addr_bytes[i] =
770                         pkt_hdr->src_addr.addr_bytes[i];
771
772         /* vlan_tag currently uses the device_id. */
773         vdev->vlan_tag = vlan_tags[vdev->vid];
774
775         /* Print out VMDQ registration info. */
776         RTE_LOG(INFO, VHOST_DATA,
777                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
778                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
779                 vdev->vlan_tag);
780
781         /* Register the MAC address. */
782         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
783                                 (uint32_t)vdev->vid + vmdq_pool_base);
784         if (ret)
785                 RTE_LOG(ERR, VHOST_DATA,
786                         "(%d) failed to add device MAC address to VMDQ\n",
787                         vdev->vid);
788
789         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
790
791         /* Set device as ready for RX. */
792         vdev->ready = DEVICE_RX;
793
794         return 0;
795 }
796
797 /*
798  * Removes the MAC address and VLAN tag from the VMDQ. Ensures that nothing is adding
799  * buffers to the RX queue before disabling RX on the device.
800  */
801 static inline void
802 unlink_vmdq(struct vhost_dev *vdev)
803 {
804         unsigned i = 0;
805         unsigned rx_count;
806         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
807
808         if (vdev->ready == DEVICE_RX) {
809                 /*clear MAC and VLAN settings*/
810                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
811                 for (i = 0; i < 6; i++)
812                         vdev->mac_address.addr_bytes[i] = 0;
813
814                 vdev->vlan_tag = 0;
815
816                 /*Clear out the receive buffers*/
817                 rx_count = rte_eth_rx_burst(ports[0],
818                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
819
820                 while (rx_count) {
821                         for (i = 0; i < rx_count; i++)
822                                 rte_pktmbuf_free(pkts_burst[i]);
823
824                         rx_count = rte_eth_rx_burst(ports[0],
825                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
826                 }
827
828                 vdev->ready = DEVICE_MAC_LEARNING;
829         }
830 }
831
832 static inline void
833 free_pkts(struct rte_mbuf **pkts, uint16_t n)
834 {
835         while (n--)
836                 rte_pktmbuf_free(pkts[n]);
837 }
838
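/*
 * Poll for completed async enqueue copies, free the completed mbufs and
 * decrement the device's in-flight packet counter.
 */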
839 static __rte_always_inline void
840 complete_async_pkts(struct vhost_dev *vdev)
841 {
842         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
843         uint16_t complete_count;
844
845         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
846                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
847         if (complete_count) {
848                 free_pkts(p_cpl, complete_count);
849                 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
850         }
851
852 }
853
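/* Synchronously enqueue one packet from @src_vdev into @dst_vdev's RX ring. */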
854 static __rte_always_inline void
855 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
856             struct rte_mbuf *m)
857 {
858         uint16_t ret;
859
860         if (builtin_net_driver) {
861                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
862         } else {
863                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
864         }
865
866         if (enable_stats) {
867                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
868                                 __ATOMIC_SEQ_CST);
869                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
870                                 __ATOMIC_SEQ_CST);
871                 src_vdev->stats.tx_total++;
872                 src_vdev->stats.tx += ret;
873         }
874 }
875
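/*
 * Flush this lcore's batched TX buffer for @vdev into the device's RX ring,
 * via the builtin net driver, the async data path or the sync data path.
 */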
876 static __rte_always_inline void
877 drain_vhost(struct vhost_dev *vdev)
878 {
879         uint16_t ret;
880         uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
881         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
882         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
883
884         if (builtin_net_driver) {
885                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
886         } else if (async_vhost_driver) {
887                 uint16_t enqueue_fail = 0;
888
889                 complete_async_pkts(vdev);
890                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
891                 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
892
893                 enqueue_fail = nr_xmit - ret;
894                 if (enqueue_fail)
895                         free_pkts(&m[ret], nr_xmit - ret);
896         } else {
897                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
898                                                 m, nr_xmit);
899         }
900
901         if (enable_stats) {
902                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
903                                 __ATOMIC_SEQ_CST);
904                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
905                                 __ATOMIC_SEQ_CST);
906         }
907
908         if (!async_vhost_driver)
909                 free_pkts(m, nr_xmit);
910 }
911
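/* Drain every per-device TX buffer on this lcore that has timed out. */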
912 static __rte_always_inline void
913 drain_vhost_table(void)
914 {
915         uint16_t lcore_id = rte_lcore_id();
916         struct vhost_bufftable *vhost_txq;
917         struct vhost_dev *vdev;
918         uint64_t cur_tsc;
919
920         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
921                 if (unlikely(vdev->remove == 1))
922                         continue;
923
924                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
925                                                 + vdev->vid];
926
927                 cur_tsc = rte_rdtsc();
928                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
929                                 > MBUF_TABLE_DRAIN_TSC)) {
930                         RTE_LOG_DP(DEBUG, VHOST_DATA,
931                                 "Vhost TX queue drained after timeout with burst size %u\n",
932                                 vhost_txq->len);
933                         drain_vhost(vdev);
934                         vhost_txq->len = 0;
935                         vhost_txq->pre_tsc = cur_tsc;
936                 }
937         }
938 }
939
940 /*
941  * Check if the packet destination MAC address is for a local device. If so, put
942  * the packet on that device's RX queue. If not, return.
943  */
944 static __rte_always_inline int
945 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
946 {
947         struct rte_ether_hdr *pkt_hdr;
948         struct vhost_dev *dst_vdev;
949         struct vhost_bufftable *vhost_txq;
950         uint16_t lcore_id = rte_lcore_id();
951         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
952
953         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
954         if (!dst_vdev)
955                 return -1;
956
957         if (vdev->vid == dst_vdev->vid) {
958                 RTE_LOG_DP(DEBUG, VHOST_DATA,
959                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
960                         vdev->vid);
961                 return 0;
962         }
963
964         RTE_LOG_DP(DEBUG, VHOST_DATA,
965                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
966
967         if (unlikely(dst_vdev->remove)) {
968                 RTE_LOG_DP(DEBUG, VHOST_DATA,
969                         "(%d) device is marked for removal\n", dst_vdev->vid);
970                 return 0;
971         }
972
973         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
974         vhost_txq->m_table[vhost_txq->len++] = m;
975
976         if (enable_stats) {
977                 vdev->stats.tx_total++;
978                 vdev->stats.tx++;
979         }
980
981         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
982                 drain_vhost(dst_vdev);
983                 vhost_txq->len = 0;
984                 vhost_txq->pre_tsc = rte_rdtsc();
985         }
986         return 0;
987 }
988
989 /*
990  * Check if the destination MAC of a packet belongs to a local VM;
991  * if so, get its VLAN tag and the length offset.
992  */
993 static __rte_always_inline int
994 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
995         uint32_t *offset, uint16_t *vlan_tag)
996 {
997         struct vhost_dev *dst_vdev;
998         struct rte_ether_hdr *pkt_hdr =
999                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1000
1001         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1002         if (!dst_vdev)
1003                 return 0;
1004
1005         if (vdev->vid == dst_vdev->vid) {
1006                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1007                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1008                         vdev->vid);
1009                 return -1;
1010         }
1011
1012         /*
1013          * HW VLAN strip reduces the packet length by the length
1014          * of the VLAN tag, so the packet length needs to be
1015          * restored by adding it back.
1016          */
1017         *offset  = RTE_VLAN_HLEN;
1018         *vlan_tag = vlan_tags[vdev->vid];
1019
1020         RTE_LOG_DP(DEBUG, VHOST_DATA,
1021                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1022                 vdev->vid, dst_vdev->vid, *vlan_tag);
1023
1024         return 0;
1025 }
1026
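/*
 * Prepare TSO offload for a packet received with LRO: parse the header
 * lengths, set the TX offload flags and fill in the pseudo-header checksum.
 */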
1027 static void virtio_tx_offload(struct rte_mbuf *m)
1028 {
1029         struct rte_net_hdr_lens hdr_lens;
1030         struct rte_ipv4_hdr *ipv4_hdr;
1031         struct rte_tcp_hdr *tcp_hdr;
1032         uint32_t ptype;
1033         void *l3_hdr;
1034
1035         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1036         m->l2_len = hdr_lens.l2_len;
1037         m->l3_len = hdr_lens.l3_len;
1038         m->l4_len = hdr_lens.l4_len;
1039
1040         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1041         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1042                 m->l2_len + m->l3_len);
1043
1044         m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1045         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1046                 m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1047                 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1048                 ipv4_hdr = l3_hdr;
1049                 ipv4_hdr->hdr_checksum = 0;
1050                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1051         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1052                 m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1053                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1054         }
1055 }
1056
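/* Transmit the queued packets on the physical port and free any that fail. */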
1057 static __rte_always_inline void
1058 do_drain_mbuf_table(struct mbuf_table *tx_q)
1059 {
1060         uint16_t count;
1061
1062         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1063                                  tx_q->m_table, tx_q->len);
1064         if (unlikely(count < tx_q->len))
1065                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1066
1067         tx_q->len = 0;
1068 }
1069
1070 /*
1071  * This function routes the TX packet to the correct interface. This
1072  * may be a local device or the physical port.
1073  */
1074 static __rte_always_inline void
1075 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1076 {
1077         struct mbuf_table *tx_q;
1078         unsigned offset = 0;
1079         const uint16_t lcore_id = rte_lcore_id();
1080         struct rte_ether_hdr *nh;
1081
1082
1083         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1084         if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1085                 struct vhost_dev *vdev2;
1086
1087                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1088                         if (vdev2 != vdev)
1089                                 sync_virtio_xmit(vdev2, vdev, m);
1090                 }
1091                 goto queue2nic;
1092         }
1093
1094         /*check if destination is local VM*/
1095         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1096                 return;
1097
1098         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1099                 if (unlikely(find_local_dest(vdev, m, &offset,
1100                                              &vlan_tag) != 0)) {
1101                         rte_pktmbuf_free(m);
1102                         return;
1103                 }
1104         }
1105
1106         RTE_LOG_DP(DEBUG, VHOST_DATA,
1107                 "(%d) TX: MAC address is external\n", vdev->vid);
1108
1109 queue2nic:
1110
1111         /*Add packet to the port tx queue*/
1112         tx_q = &lcore_tx_queue[lcore_id];
1113
1114         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1115         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1116                 /* Guest has inserted the vlan tag. */
1117                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1118                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1119                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1120                         (vh->vlan_tci != vlan_tag_be))
1121                         vh->vlan_tci = vlan_tag_be;
1122         } else {
1123                 m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1124
1125                 /*
1126                  * Find the right seg to adjust the data len when offset is
1127                  * bigger than tail room size.
1128                  */
1129                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1130                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1131                                 m->data_len += offset;
1132                         else {
1133                                 struct rte_mbuf *seg = m;
1134
1135                                 while ((seg->next != NULL) &&
1136                                         (offset > rte_pktmbuf_tailroom(seg)))
1137                                         seg = seg->next;
1138
1139                                 seg->data_len += offset;
1140                         }
1141                         m->pkt_len += offset;
1142                 }
1143
1144                 m->vlan_tci = vlan_tag;
1145         }
1146
1147         if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1148                 virtio_tx_offload(m);
1149
1150         tx_q->m_table[tx_q->len++] = m;
1151         if (enable_stats) {
1152                 vdev->stats.tx_total++;
1153                 vdev->stats.tx++;
1154         }
1155
1156         if (unlikely(tx_q->len == MAX_PKT_BURST))
1157                 do_drain_mbuf_table(tx_q);
1158 }
1159
1160
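/* Drain the physical port TX queue if it has not been flushed for ~100us. */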
1161 static __rte_always_inline void
1162 drain_mbuf_table(struct mbuf_table *tx_q)
1163 {
1164         static uint64_t prev_tsc;
1165         uint64_t cur_tsc;
1166
1167         if (tx_q->len == 0)
1168                 return;
1169
1170         cur_tsc = rte_rdtsc();
1171         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1172                 prev_tsc = cur_tsc;
1173
1174                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1175                         "TX queue drained after timeout with burst size %u\n",
1176                         tx_q->len);
1177                 do_drain_mbuf_table(tx_q);
1178         }
1179 }
1180
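/*
 * Receive a burst from the VMDQ queue bound to @vdev and enqueue the packets
 * into the guest's virtio RX ring, optionally retrying when the ring is full.
 */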
1181 static __rte_always_inline void
1182 drain_eth_rx(struct vhost_dev *vdev)
1183 {
1184         uint16_t rx_count, enqueue_count;
1185         struct rte_mbuf *pkts[MAX_PKT_BURST];
1186
1187         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1188                                     pkts, MAX_PKT_BURST);
1189
1190         if (!rx_count)
1191                 return;
1192
1193         /*
1194          * When "enable_retry" is set, wait and retry when there are
1195          * not enough free slots in the queue to hold @rx_count packets,
1196          * to reduce packet loss.
1197          */
1198         if (enable_retry &&
1199             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1200                         VIRTIO_RXQ))) {
1201                 uint32_t retry;
1202
1203                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1204                         rte_delay_us(burst_rx_delay_time);
1205                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1206                                         VIRTIO_RXQ))
1207                                 break;
1208                 }
1209         }
1210
1211         if (builtin_net_driver) {
1212                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1213                                                 pkts, rx_count);
1214         } else if (async_vhost_driver) {
1215                 uint16_t enqueue_fail = 0;
1216
1217                 complete_async_pkts(vdev);
1218                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1219                                         VIRTIO_RXQ, pkts, rx_count);
1220                 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1221
1222                 enqueue_fail = rx_count - enqueue_count;
1223                 if (enqueue_fail)
1224                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1225
1226         } else {
1227                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1228                                                 pkts, rx_count);
1229         }
1230
1231         if (enable_stats) {
1232                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1233                                 __ATOMIC_SEQ_CST);
1234                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1235                                 __ATOMIC_SEQ_CST);
1236         }
1237
1238         if (!async_vhost_driver)
1239                 free_pkts(pkts, rx_count);
1240 }
1241
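/*
 * Dequeue a burst from the guest's virtio TX ring and route each packet to
 * another vhost device or to the physical port.
 */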
1242 static __rte_always_inline void
1243 drain_virtio_tx(struct vhost_dev *vdev)
1244 {
1245         struct rte_mbuf *pkts[MAX_PKT_BURST];
1246         uint16_t count;
1247         uint16_t i;
1248
1249         if (builtin_net_driver) {
1250                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1251                                         pkts, MAX_PKT_BURST);
1252         } else {
1253                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1254                                         mbuf_pool, pkts, MAX_PKT_BURST);
1255         }
1256
1257         /* setup VMDq for the first packet */
1258         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1259                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1260                         free_pkts(pkts, count);
1261         }
1262
1263         for (i = 0; i < count; ++i)
1264                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1265 }
1266
1267 /*
1268  * Main function of vhost-switch. It basically does:
1269  *
1270  * for each vhost device {
1271  *    - drain_eth_rx()
1272  *
1273  *      Which drains the host eth Rx queue linked to the vhost device
1274  *      and delivers all of the packets to the guest virtio Rx ring
1275  *      associated with this vhost device.
1276  *
1277  *    - drain_virtio_tx()
1278  *
1279  *      Which drains the guest virtio Tx queue and delivers all of the
1280  *      packets to the target, which could be another vhost device or the
1281  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1282  * }
1283  */
1284 static int
1285 switch_worker(void *arg __rte_unused)
1286 {
1287         unsigned i;
1288         unsigned lcore_id = rte_lcore_id();
1289         struct vhost_dev *vdev;
1290         struct mbuf_table *tx_q;
1291
1292         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1293
1294         tx_q = &lcore_tx_queue[lcore_id];
1295         for (i = 0; i < rte_lcore_count(); i++) {
1296                 if (lcore_ids[i] == lcore_id) {
1297                         tx_q->txq_id = i;
1298                         break;
1299                 }
1300         }
1301
1302         while(1) {
1303                 drain_mbuf_table(tx_q);
1304                 drain_vhost_table();
1305                 /*
1306                  * Inform the configuration core that we have exited the
1307                  * linked list and that no devices are in use if requested.
1308                  */
1309                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1310                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1311
1312                 /*
1313                  * Process vhost devices
1314                  */
1315                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1316                               lcore_vdev_entry) {
1317                         if (unlikely(vdev->remove)) {
1318                                 unlink_vmdq(vdev);
1319                                 vdev->ready = DEVICE_SAFE_REMOVE;
1320                                 continue;
1321                         }
1322
1323                         if (likely(vdev->ready == DEVICE_RX))
1324                                 drain_eth_rx(vdev);
1325
1326                         if (likely(!vdev->remove))
1327                                 drain_virtio_tx(vdev);
1328                 }
1329         }
1330
1331         return 0;
1332 }
1333
1334 /*
1335  * Remove a device from the specific data core linked list and from the
1336  * main linked list. Synchronization occurs through the use of the
1337  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1338  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
1339  */
1340 static void
1341 destroy_device(int vid)
1342 {
1343         struct vhost_dev *vdev = NULL;
1344         int lcore;
1345         uint16_t i;
1346
1347         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1348                 if (vdev->vid == vid)
1349                         break;
1350         }
1351         if (!vdev)
1352                 return;
1353         /*set the remove flag. */
1354         vdev->remove = 1;
1355         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1356                 rte_pause();
1357         }
1358
1359         for (i = 0; i < RTE_MAX_LCORE; i++)
1360                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1361
1362         if (builtin_net_driver)
1363                 vs_vhost_net_remove(vdev);
1364
1365         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1366                      lcore_vdev_entry);
1367         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1368
1369
1370         /* Set the dev_removal_flag on each lcore. */
1371         RTE_LCORE_FOREACH_WORKER(lcore)
1372                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1373
1374         /*
1375          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1376          * we can be sure that they can no longer access the device removed
1377          * from the linked lists and that the devices are no longer in use.
1378          */
1379         RTE_LCORE_FOREACH_WORKER(lcore) {
1380                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1381                         rte_pause();
1382         }
1383
1384         lcore_info[vdev->coreid].device_num--;
1385
1386         RTE_LOG(INFO, VHOST_DATA,
1387                 "(%d) device has been removed from data core\n",
1388                 vdev->vid);
1389
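        /* Drain any in-flight async copies before unregistering the async channel. */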
1390         if (async_vhost_driver) {
1391                 uint16_t n_pkt = 0;
1392                 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1393
1394                 while (vdev->pkts_inflight) {
1395                         n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1396                                                 m_cpl, vdev->pkts_inflight);
1397                         free_pkts(m_cpl, n_pkt);
1398                         __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1399                 }
1400
1401                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1402         }
1403
1404         rte_free(vdev);
1405 }
1406
1407 /*
1408  * A new device is added to a data core. First the device is added to the main linked list
1409  * and then allocated to a specific data core.
1410  */
1411 static int
1412 new_device(int vid)
1413 {
1414         int lcore, core_add = 0;
1415         uint16_t i;
1416         uint32_t device_num_min = num_devices;
1417         struct vhost_dev *vdev;
1418         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1419         if (vdev == NULL) {
1420                 RTE_LOG(INFO, VHOST_DATA,
1421                         "(%d) couldn't allocate memory for vhost dev\n",
1422                         vid);
1423                 return -1;
1424         }
1425         vdev->vid = vid;
1426
1427         for (i = 0; i < RTE_MAX_LCORE; i++) {
1428                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1429                         = rte_zmalloc("vhost bufftable",
1430                                 sizeof(struct vhost_bufftable),
1431                                 RTE_CACHE_LINE_SIZE);
1432
1433                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1434                         RTE_LOG(INFO, VHOST_DATA,
1435                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1436                         return -1;
1437                 }
1438         }
1439
1440         if (builtin_net_driver)
1441                 vs_vhost_net_setup(vdev);
1442
1443         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1444         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1445
1446         /*reset ready flag*/
1447         vdev->ready = DEVICE_MAC_LEARNING;
1448         vdev->remove = 0;
1449
1450         /* Find a suitable lcore to add the device. */
1451         RTE_LCORE_FOREACH_WORKER(lcore) {
1452                 if (lcore_info[lcore].device_num < device_num_min) {
1453                         device_num_min = lcore_info[lcore].device_num;
1454                         core_add = lcore;
1455                 }
1456         }
1457         vdev->coreid = core_add;
1458
1459         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1460                           lcore_vdev_entry);
1461         lcore_info[vdev->coreid].device_num++;
1462
1463         /* Disable guest notifications: the worker cores poll the virtqueues. */
1464         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1465         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1466
1467         RTE_LOG(INFO, VHOST_DATA,
1468                 "(%d) device has been added to data core %d\n",
1469                 vid, vdev->coreid);
1470
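        /*
         * For the async data path, register an async channel on the Rx
         * queue with the ioat DMA callbacks so enqueue copies can be
         * offloaded to the DMA engine.
         */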
1471         if (async_vhost_driver) {
1472                 struct rte_vhost_async_config config = {0};
1473                 struct rte_vhost_async_channel_ops channel_ops;
1474
1475                 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1476                         channel_ops.transfer_data = ioat_transfer_data_cb;
1477                         channel_ops.check_completed_copies =
1478                                 ioat_check_completed_copies_cb;
1479
1480                         config.features = RTE_VHOST_ASYNC_INORDER;
1481
1482                         return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1483                                 config, &channel_ops);
1484                 }
1485         }
1486
1487         return 0;
1488 }
1489
1490 static int
1491 vring_state_changed(int vid, uint16_t queue_id, int enable)
1492 {
1493         struct vhost_dev *vdev = NULL;
1494
1495         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1496                 if (vdev->vid == vid)
1497                         break;
1498         }
1499         if (!vdev)
1500                 return -1;
1501
1502         if (queue_id != VIRTIO_RXQ)
1503                 return 0;
1504
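        /*
         * When the Rx vring is disabled, drain and free any async copies
         * that are still in flight for that queue.
         */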
1505         if (async_vhost_driver) {
1506                 if (!enable) {
1507                         uint16_t n_pkt = 0;
1508                         struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1509
1510                         while (vdev->pkts_inflight) {
1511                                 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1512                                                         m_cpl, vdev->pkts_inflight);
1513                                 free_pkts(m_cpl, n_pkt);
1514                                 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1515                         }
1516                 }
1517         }
1518
1519         return 0;
1520 }
1521
1522 /*
1523  * These callbacks allow devices to be added to the data core once
1524  * configuration has fully completed.
1525  */
1526 static const struct rte_vhost_device_ops virtio_net_device_ops =
1527 {
1528         .new_device =  new_device,
1529         .destroy_device = destroy_device,
1530         .vring_state_changed = vring_state_changed,
1531 };
1532
1533 /*
1534  * This thread wakes up periodically to print statistics if the user has
1535  * enabled them.
1536  */
1537 static void *
1538 print_stats(__rte_unused void *arg)
1539 {
1540         struct vhost_dev *vdev;
1541         uint64_t tx_dropped, rx_dropped;
1542         uint64_t tx, tx_total, rx, rx_total;
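        /* ANSI escape sequences: clear the screen and move the cursor to the top-left corner. */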
1543         const char clr[] = { 27, '[', '2', 'J', '\0' };
1544         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1545
1546         while (1) {
1547                 sleep(enable_stats);
1548
1549                 /* Clear screen and move to top left */
1550                 printf("%s%s\n", clr, top_left);
1551                 printf("Device statistics =================================\n");
1552
1553                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1554                         tx_total   = vdev->stats.tx_total;
1555                         tx         = vdev->stats.tx;
1556                         tx_dropped = tx_total - tx;
1557
1558                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1559                                 __ATOMIC_SEQ_CST);
1560                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1561                                 __ATOMIC_SEQ_CST);
1562                         rx_dropped = rx_total - rx;
1563
1564                         printf("Statistics for device %d\n"
1565                                 "-----------------------\n"
1566                                 "TX total:              %" PRIu64 "\n"
1567                                 "TX dropped:            %" PRIu64 "\n"
1568                                 "TX successful:         %" PRIu64 "\n"
1569                                 "RX total:              %" PRIu64 "\n"
1570                                 "RX dropped:            %" PRIu64 "\n"
1571                                 "RX successful:         %" PRIu64 "\n",
1572                                 vdev->vid,
1573                                 tx_total, tx_dropped, tx,
1574                                 rx_total, rx_dropped, rx);
1575                 }
1576
1577                 printf("===================================================\n");
1578
1579                 fflush(stdout);
1580         }
1581
1582         return NULL;
1583 }
1584
1585 static void
1586 unregister_drivers(int socket_num)
1587 {
1588         int i, ret;
1589
1590         for (i = 0; i < socket_num; i++) {
1591                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1592                 if (ret != 0)
1593                         RTE_LOG(ERR, VHOST_CONFIG,
1594                                 "Fail to unregister vhost driver for %s.\n",
1595                                 socket_files + i * PATH_MAX);
1596         }
1597 }
1598
1599 /* When we receive an INT signal, unregister the vhost driver. */
1600 static void
1601 sigint_handler(__rte_unused int signum)
1602 {
1603         /* Unregister vhost driver. */
1604         unregister_drivers(nb_sockets);
1605
1606         exit(0);
1607 }
1608
1609 /*
1610  * While creating an mbuf pool, one key thing is to figure out how
1611  * many mbuf entries are enough for our use. Here are some
1612  * guidelines:
1613  *
1614  * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1615  *
1616  * - For each switch core (a CPU core that does the packet switching), we
1617  *   also need to reserve some mbufs for receiving the packets from the
1618  *   virtio Tx queue. How many are enough depends on the usage; it is
1619  *   normally a simple calculation like the following:
1620  *
1621  *       MAX_PKT_BURST * max packet size / mbuf size
1622  *
1623  *   So we definitely need to allocate more mbufs when TSO is enabled.
1624  *
1625  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1626  *   mbufs for receiving the packets from the physical NIC device.
1627  *
1628  * - We also need to make sure that, for each switch core, enough mbufs
1629  *   are allocated to fill the mbuf cache. (See the worked example below.)
1630  */
1631 static void
1632 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1633         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1634 {
1635         uint32_t nr_mbufs;
1636         uint32_t nr_mbufs_per_core;
1637         uint32_t mtu = 1500;
1638
1639         if (mergeable)
1640                 mtu = 9000;
1641         if (enable_tso)
1642                 mtu = 64 * 1024;
1643
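        /*
         * Estimate the worst case for one burst pulled from a virtio Tx
         * queue: every packet is mtu-sized and is split across mbufs whose
         * data room is (mbuf_size - RTE_PKTMBUF_HEADROOM) bytes.
         */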
1644         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1645                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1646         nr_mbufs_per_core += nr_rx_desc;
1647         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1648
1649         nr_mbufs  = nr_queues * nr_rx_desc;
1650         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1651         nr_mbufs *= nr_port;
1652
1653         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1654                                             nr_mbuf_cache, 0, mbuf_size,
1655                                             rte_socket_id());
1656         if (mbuf_pool == NULL)
1657                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1658 }
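
/*
 * Worked example for the pool sizing above (illustrative only; it assumes
 * the common defaults of RTE_PKTMBUF_HEADROOM = 128, so MBUF_DATA_SIZE is a
 * 2176-byte buffer with 2048 bytes of data room, and a MAX_PKT_BURST of 32),
 * with mergeable buffers and TSO disabled (mtu = 1500) and the arguments
 * passed from main() below:
 *
 *   per switch core: (1500 + 2176) * 32 / (2176 - 128) = 57 mbufs
 *                    (integer division), plus 1024 Rx descriptors = 1081
 *   pool size:       128 queues * 1024 descriptors = 131072 mbufs, plus
 *                    1081 per switch core, multiplied by the port count
 *
 * With TSO enabled (mtu = 64KB) the per-core term alone grows to
 * (65536 + 2176) * 32 / 2048 = 1058 mbufs before adding the descriptors.
 */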
1659
1660 /*
1661  * Main function, does initialisation and calls the per-lcore functions.
1662  */
1663 int
1664 main(int argc, char *argv[])
1665 {
1666         unsigned lcore_id, core_id = 0;
1667         unsigned nb_ports, valid_num_ports;
1668         int ret, i;
1669         uint16_t portid;
1670         static pthread_t tid;
1671         uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1672
1673         signal(SIGINT, sigint_handler);
1674
1675         /* init EAL */
1676         ret = rte_eal_init(argc, argv);
1677         if (ret < 0)
1678                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1679         argc -= ret;
1680         argv += ret;
1681
1682         /* parse app arguments */
1683         ret = us_vhost_parse_args(argc, argv);
1684         if (ret < 0)
1685                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1686
1687         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1688                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1689
1690                 if (rte_lcore_is_enabled(lcore_id))
1691                         lcore_ids[core_id++] = lcore_id;
1692         }
1693
1694         if (rte_lcore_count() > RTE_MAX_LCORE)
1695                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1696
1697         /* Get the number of physical ports. */
1698         nb_ports = rte_eth_dev_count_avail();
1699
1700         /*
1701          * Update the global port count and port array, and derive
1702          * valid_num_ports from the number of ports present in the system.
1703          */
1704         valid_num_ports = check_ports_num(nb_ports);
1705
1706         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1707                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1708                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1709                 return -1;
1710         }
1711
1712         /*
1713          * FIXME: here we allocate enough mbufs for @MAX_QUEUES, but in
1714          * truth we are never going to use that many queues here. We
1715          * should probably only allocate for the queues we are actually
1716          * going to use.
1717          */
1718         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1719                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1720
1721         if (vm2vm_mode == VM2VM_HARDWARE) {
1722                 /* Enable VT loop back to let L2 switch to do it. */
1723                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1724                 RTE_LOG(DEBUG, VHOST_CONFIG,
1725                         "Enable loop back for L2 switch in vmdq.\n");
1726         }
1727
1728         /* initialize all ports */
1729         RTE_ETH_FOREACH_DEV(portid) {
1730                 /* skip ports that are not enabled */
1731                 if ((enabled_port_mask & (1 << portid)) == 0) {
1732                         RTE_LOG(INFO, VHOST_PORT,
1733                                 "Skipping disabled port %d\n", portid);
1734                         continue;
1735                 }
1736                 if (port_init(portid) != 0)
1737                         rte_exit(EXIT_FAILURE,
1738                                 "Cannot initialize network ports\n");
1739         }
1740
1741         /* Enable stats if the user option is set. */
1742         if (enable_stats) {
1743                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1744                                         print_stats, NULL);
1745                 if (ret < 0)
1746                         rte_exit(EXIT_FAILURE,
1747                                 "Cannot create print-stats thread\n");
1748         }
1749
1750         /* Launch all data cores. */
1751         RTE_LCORE_FOREACH_WORKER(lcore_id)
1752                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1753
1754         if (client_mode)
1755                 flags |= RTE_VHOST_USER_CLIENT;
1756
1757         /* Register vhost user driver to handle vhost messages. */
1758         for (i = 0; i < nb_sockets; i++) {
1759                 char *file = socket_files + i * PATH_MAX;
1760
1761                 if (async_vhost_driver)
1762                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1763
1764                 ret = rte_vhost_driver_register(file, flags);
1765                 if (ret != 0) {
1766                         unregister_drivers(i);
1767                         rte_exit(EXIT_FAILURE,
1768                                 "vhost driver register failure.\n");
1769                 }
1770
1771                 if (builtin_net_driver)
1772                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1773
1774                 if (mergeable == 0) {
1775                         rte_vhost_driver_disable_features(file,
1776                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1777                 }
1778
1779                 if (enable_tx_csum == 0) {
1780                         rte_vhost_driver_disable_features(file,
1781                                 1ULL << VIRTIO_NET_F_CSUM);
1782                 }
1783
1784                 if (enable_tso == 0) {
1785                         rte_vhost_driver_disable_features(file,
1786                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1787                         rte_vhost_driver_disable_features(file,
1788                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1789                         rte_vhost_driver_disable_features(file,
1790                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1791                         rte_vhost_driver_disable_features(file,
1792                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1793                 }
1794
1795                 if (promiscuous) {
1796                         rte_vhost_driver_enable_features(file,
1797                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1798                 }
1799
1800                 ret = rte_vhost_driver_callback_register(file,
1801                         &virtio_net_device_ops);
1802                 if (ret != 0) {
1803                         rte_exit(EXIT_FAILURE,
1804                                 "failed to register vhost driver callbacks.\n");
1805                 }
1806
1807                 if (rte_vhost_driver_start(file) < 0) {
1808                         rte_exit(EXIT_FAILURE,
1809                                 "failed to start vhost driver.\n");
1810                 }
1811         }
1812
1813         RTE_LCORE_FOREACH_WORKER(lcore_id)
1814                 rte_eal_wait_lcore(lcore_id);
1815
1816         /* clean up the EAL */
1817         rte_eal_cleanup();
1818
1819         return 0;
1820 }