examples/vhost/main.c (dpdk.git, at commit "mbuf: add namespace to offload flags")
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "ioat.h"
29 #include "main.h"
30
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37
38 #define MBUF_CACHE_SIZE 128
39 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
40
41 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
42
43 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
45
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
48
49 /* State of virtio device. */
50 #define DEVICE_MAC_LEARNING 0
51 #define DEVICE_RX                       1
52 #define DEVICE_SAFE_REMOVE      2
53
54 /* Configurable number of RX/TX ring descriptors */
55 #define RTE_TEST_RX_DESC_DEFAULT 1024
56 #define RTE_TEST_TX_DESC_DEFAULT 512
57
58 #define INVALID_PORT_ID 0xFF
59
60 /* mask of enabled ports */
61 static uint32_t enabled_port_mask = 0;
62
63 /* Promiscuous mode */
64 static uint32_t promiscuous;
65
66 /* number of devices/queues to support*/
67 static uint32_t num_queues = 0;
68 static uint32_t num_devices;
69
70 static struct rte_mempool *mbuf_pool;
71 static int mergeable;
72
73 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
74 typedef enum {
75         VM2VM_DISABLED = 0,
76         VM2VM_SOFTWARE = 1,
77         VM2VM_HARDWARE = 2,
78         VM2VM_LAST
79 } vm2vm_type;
80 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
81
82 /* Enable stats. */
83 static uint32_t enable_stats = 0;
84 /* Enable retries on RX. */
85 static uint32_t enable_retry = 1;
86
87 /* Disable TX checksum offload */
88 static uint32_t enable_tx_csum;
89
90 /* Disable TSO offload */
91 static uint32_t enable_tso;
92
93 static int client_mode;
94
95 static int builtin_net_driver;
96
97 static int async_vhost_driver;
98
99 static char *dma_type;
100
101 /* Specify the timeout (in microseconds) between retries on RX. */
102 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
103 /* Specify the number of retries on RX. */
104 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
105
106 /* Socket file paths. Can be set by user */
107 static char *socket_files;
108 static int nb_sockets;
109
110 /* Empty VMDQ configuration structure. Filled in programmatically. */
111 static struct rte_eth_conf vmdq_conf_default = {
112         .rxmode = {
113                 .mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
114                 .split_hdr_size = 0,
115                 /*
116                  * VLAN strip is necessary for 1G NICs such as the I350;
117                  * it fixes a bug where IPv4 forwarding in the guest could not
118                  * forward packets from one virtio dev to another virtio dev.
119                  */
120                 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
121         },
122
123         .txmode = {
124                 .mq_mode = RTE_ETH_MQ_TX_NONE,
125                 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
126                              RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
127                              RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
128                              RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
129                              RTE_ETH_TX_OFFLOAD_TCP_TSO),
130         },
131         .rx_adv_conf = {
132                 /*
133                  * should be overridden separately in code with
134                  * appropriate values
135                  */
136                 .vmdq_rx_conf = {
137                         .nb_queue_pools = RTE_ETH_8_POOLS,
138                         .enable_default_pool = 0,
139                         .default_pool = 0,
140                         .nb_pool_maps = 0,
141                         .pool_map = {{0, 0},},
142                 },
143         },
144 };
145
146
147 static unsigned lcore_ids[RTE_MAX_LCORE];
148 static uint16_t ports[RTE_MAX_ETHPORTS];
149 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
150 static uint16_t num_pf_queues, num_vmdq_queues;
151 static uint16_t vmdq_pool_base, vmdq_queue_base;
152 static uint16_t queues_per_pool;
153
154 const uint16_t vlan_tags[] = {
155         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
156         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
157         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
158         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
159         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
160         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
161         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
162         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
163 };
164
165 /* ethernet addresses of ports */
166 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
167
168 static struct vhost_dev_tailq_list vhost_dev_list =
169         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
170
171 static struct lcore_info lcore_info[RTE_MAX_LCORE];
172
173 /* Used for queueing bursts of TX packets. */
174 struct mbuf_table {
175         unsigned len;
176         unsigned txq_id;
177         struct rte_mbuf *m_table[MAX_PKT_BURST];
178 };
179
180 struct vhost_bufftable {
181         uint32_t len;
182         uint64_t pre_tsc;
183         struct rte_mbuf *m_table[MAX_PKT_BURST];
184 };
185
186 /* TX queue for each data core. */
187 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
188
189 /*
190  * Vhost TX buffer for each data core.
191  * Every data core maintains a TX buffer for every vhost device,
192  * which is used to batch packet enqueues for higher performance.
193  */
194 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
195
196 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
197                                  / US_PER_S * BURST_TX_DRAIN_US)
198 #define VLAN_HLEN       4
199
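/*
 * Open the DMA channels described by the --dmas argument.
 * Only the "ioat" DMA type is supported; -1 is returned otherwise.
 */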
200 static inline int
201 open_dma(const char *value)
202 {
203         if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
204                 return open_ioat(value);
205
206         return -1;
207 }
208
209 /*
210  * Builds up the correct configuration for VMDQ VLAN pool map
211  * according to the pool & queue limits.
212  */
213 static inline int
214 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
215 {
216         struct rte_eth_vmdq_rx_conf conf;
217         struct rte_eth_vmdq_rx_conf *def_conf =
218                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
219         unsigned i;
220
221         memset(&conf, 0, sizeof(conf));
222         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
223         conf.nb_pool_maps = num_devices;
224         conf.enable_loop_back = def_conf->enable_loop_back;
225         conf.rx_mode = def_conf->rx_mode;
226
227         for (i = 0; i < conf.nb_pool_maps; i++) {
228                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
229                 conf.pool_map[i].pools = (1UL << i);
230         }
231
232         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
233         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
234                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
235         return 0;
236 }
237
238 /*
239  * Initialises a given port using global settings, with the Rx buffers
240  * coming from the global mbuf_pool.
241  */
242 static inline int
243 port_init(uint16_t port)
244 {
245         struct rte_eth_dev_info dev_info;
246         struct rte_eth_conf port_conf;
247         struct rte_eth_rxconf *rxconf;
248         struct rte_eth_txconf *txconf;
249         int16_t rx_rings, tx_rings;
250         uint16_t rx_ring_size, tx_ring_size;
251         int retval;
252         uint16_t q;
253
254         /* The max pool number from dev_info will be used to validate the pool number specified on the command line. */
255         retval = rte_eth_dev_info_get(port, &dev_info);
256         if (retval != 0) {
257                 RTE_LOG(ERR, VHOST_PORT,
258                         "Error during getting device (port %u) info: %s\n",
259                         port, strerror(-retval));
260
261                 return retval;
262         }
263
264         rxconf = &dev_info.default_rxconf;
265         txconf = &dev_info.default_txconf;
266         rxconf->rx_drop_en = 1;
267
268         /* Configure the number of supported virtio devices based on VMDQ limits. */
269         num_devices = dev_info.max_vmdq_pools;
270
271         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
272         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
273
274         tx_rings = (uint16_t)rte_lcore_count();
275
276         /* Get port configuration. */
277         retval = get_eth_conf(&port_conf, num_devices);
278         if (retval < 0)
279                 return retval;
280         /* NIC queues are divided into pf queues and vmdq queues.  */
281         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
282         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
283         num_vmdq_queues = num_devices * queues_per_pool;
284         num_queues = num_pf_queues + num_vmdq_queues;
285         vmdq_queue_base = dev_info.vmdq_queue_base;
286         vmdq_pool_base  = dev_info.vmdq_pool_base;
287         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
288                 num_pf_queues, num_devices, queues_per_pool);
289
290         if (!rte_eth_dev_is_valid_port(port))
291                 return -1;
292
293         rx_rings = (uint16_t)dev_info.max_rx_queues;
294         if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
295                 port_conf.txmode.offloads |=
296                         RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
297         /* Configure ethernet device. */
298         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
299         if (retval != 0) {
300                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
301                         port, strerror(-retval));
302                 return retval;
303         }
304
305         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
306                 &tx_ring_size);
307         if (retval != 0) {
308                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
309                         "for port %u: %s.\n", port, strerror(-retval));
310                 return retval;
311         }
312         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
313                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
314                         "for Rx queues on port %u.\n", port);
315                 return -1;
316         }
317
318         /* Setup the queues. */
319         rxconf->offloads = port_conf.rxmode.offloads;
320         for (q = 0; q < rx_rings; q ++) {
321                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
322                                                 rte_eth_dev_socket_id(port),
323                                                 rxconf,
324                                                 mbuf_pool);
325                 if (retval < 0) {
326                         RTE_LOG(ERR, VHOST_PORT,
327                                 "Failed to setup rx queue %u of port %u: %s.\n",
328                                 q, port, strerror(-retval));
329                         return retval;
330                 }
331         }
332         txconf->offloads = port_conf.txmode.offloads;
333         for (q = 0; q < tx_rings; q ++) {
334                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
335                                                 rte_eth_dev_socket_id(port),
336                                                 txconf);
337                 if (retval < 0) {
338                         RTE_LOG(ERR, VHOST_PORT,
339                                 "Failed to setup tx queue %u of port %u: %s.\n",
340                                 q, port, strerror(-retval));
341                         return retval;
342                 }
343         }
344
345         /* Start the device. */
346         retval  = rte_eth_dev_start(port);
347         if (retval < 0) {
348                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
349                         port, strerror(-retval));
350                 return retval;
351         }
352
353         if (promiscuous) {
354                 retval = rte_eth_promiscuous_enable(port);
355                 if (retval != 0) {
356                         RTE_LOG(ERR, VHOST_PORT,
357                                 "Failed to enable promiscuous mode on port %u: %s\n",
358                                 port, rte_strerror(-retval));
359                         return retval;
360                 }
361         }
362
363         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
364         if (retval < 0) {
365                 RTE_LOG(ERR, VHOST_PORT,
366                         "Failed to get MAC address on port %u: %s\n",
367                         port, rte_strerror(-retval));
368                 return retval;
369         }
370
371         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
372         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
373                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
374                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
375
376         return 0;
377 }
378
379 /*
380  * Set socket file path.
381  */
382 static int
383 us_vhost_parse_socket_path(const char *q_arg)
384 {
385         char *old;
386
387         /* Reject socket paths that do not fit within PATH_MAX. */
388         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
389                 return -1;
390
391         old = socket_files;
392         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
393         if (socket_files == NULL) {
394                 free(old);
395                 return -1;
396         }
397
398         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
399         nb_sockets++;
400
401         return 0;
402 }
403
404 /*
405  * Parse the portmask provided at run time.
406  */
407 static int
408 parse_portmask(const char *portmask)
409 {
410         char *end = NULL;
411         unsigned long pm;
412
413         errno = 0;
414
415         /* parse hexadecimal string */
416         pm = strtoul(portmask, &end, 16);
417         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
418                 return 0;
419
420         return pm;
421
422 }
423
424 /*
425  * Parse num options at run time.
426  */
427 static int
428 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
429 {
430         char *end = NULL;
431         unsigned long num;
432
433         errno = 0;
434
435         /* parse unsigned int string */
436         num = strtoul(q_arg, &end, 10);
437         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
438                 return -1;
439
440         if (num > max_valid_value)
441                 return -1;
442
443         return num;
444
445 }
446
447 /*
448  * Display usage
449  */
450 static void
451 us_vhost_usage(const char *prgname)
452 {
453         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
454         "               --vm2vm [0|1|2]\n"
455         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
456         "               --socket-file <path>\n"
457         "               --nb-devices ND\n"
458         "               -p PORTMASK: Set mask for ports to be used by application\n"
459         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
460         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
461         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on rx are enabled\n"
462         "               --rx-retry-num [0-N]: the number of retries on rx. Takes effect only if retries on rx are enabled\n"
463         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
464         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
465         "               --socket-file: The path of the socket file.\n"
466         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
467         "               --tso [0|1] disable/enable TCP segmentation offload.\n"
468         "               --client register a vhost-user socket in client mode.\n"
469         "               --dma-type register the DMA type for the vhost async driver. Only \"ioat\" is supported for now.\n"
470         "               --dmas register a DMA channel for a specific vhost device.\n"
471                prgname);
472 }
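/*
 * Illustrative invocation of this sample (binary name, EAL options and the
 * socket path are examples only and may differ per build/environment):
 *
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 --stats 2
 */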
473
474 enum {
475 #define OPT_VM2VM               "vm2vm"
476         OPT_VM2VM_NUM = 256,
477 #define OPT_RX_RETRY            "rx-retry"
478         OPT_RX_RETRY_NUM,
479 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
480         OPT_RX_RETRY_DELAY_NUM,
481 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
482         OPT_RX_RETRY_NUMB_NUM,
483 #define OPT_MERGEABLE           "mergeable"
484         OPT_MERGEABLE_NUM,
485 #define OPT_STATS               "stats"
486         OPT_STATS_NUM,
487 #define OPT_SOCKET_FILE         "socket-file"
488         OPT_SOCKET_FILE_NUM,
489 #define OPT_TX_CSUM             "tx-csum"
490         OPT_TX_CSUM_NUM,
491 #define OPT_TSO                 "tso"
492         OPT_TSO_NUM,
493 #define OPT_CLIENT              "client"
494         OPT_CLIENT_NUM,
495 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
496         OPT_BUILTIN_NET_DRIVER_NUM,
497 #define OPT_DMA_TYPE            "dma-type"
498         OPT_DMA_TYPE_NUM,
499 #define OPT_DMAS                "dmas"
500         OPT_DMAS_NUM,
501 };
502
503 /*
504  * Parse the arguments given in the command line of the application.
505  */
506 static int
507 us_vhost_parse_args(int argc, char **argv)
508 {
509         int opt, ret;
510         int option_index;
511         unsigned i;
512         const char *prgname = argv[0];
513         static struct option long_option[] = {
514                 {OPT_VM2VM, required_argument,
515                                 NULL, OPT_VM2VM_NUM},
516                 {OPT_RX_RETRY, required_argument,
517                                 NULL, OPT_RX_RETRY_NUM},
518                 {OPT_RX_RETRY_DELAY, required_argument,
519                                 NULL, OPT_RX_RETRY_DELAY_NUM},
520                 {OPT_RX_RETRY_NUMB, required_argument,
521                                 NULL, OPT_RX_RETRY_NUMB_NUM},
522                 {OPT_MERGEABLE, required_argument,
523                                 NULL, OPT_MERGEABLE_NUM},
524                 {OPT_STATS, required_argument,
525                                 NULL, OPT_STATS_NUM},
526                 {OPT_SOCKET_FILE, required_argument,
527                                 NULL, OPT_SOCKET_FILE_NUM},
528                 {OPT_TX_CSUM, required_argument,
529                                 NULL, OPT_TX_CSUM_NUM},
530                 {OPT_TSO, required_argument,
531                                 NULL, OPT_TSO_NUM},
532                 {OPT_CLIENT, no_argument,
533                                 NULL, OPT_CLIENT_NUM},
534                 {OPT_BUILTIN_NET_DRIVER, no_argument,
535                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
536                 {OPT_DMA_TYPE, required_argument,
537                                 NULL, OPT_DMA_TYPE_NUM},
538                 {OPT_DMAS, required_argument,
539                                 NULL, OPT_DMAS_NUM},
540                 {NULL, 0, 0, 0},
541         };
542
543         /* Parse command line */
544         while ((opt = getopt_long(argc, argv, "p:P",
545                         long_option, &option_index)) != EOF) {
546                 switch (opt) {
547                 /* Portmask */
548                 case 'p':
549                         enabled_port_mask = parse_portmask(optarg);
550                         if (enabled_port_mask == 0) {
551                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
552                                 us_vhost_usage(prgname);
553                                 return -1;
554                         }
555                         break;
556
557                 case 'P':
558                         promiscuous = 1;
559                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
560                                 RTE_ETH_VMDQ_ACCEPT_BROADCAST |
561                                 RTE_ETH_VMDQ_ACCEPT_MULTICAST;
562                         break;
563
564                 case OPT_VM2VM_NUM:
565                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
566                         if (ret == -1) {
567                                 RTE_LOG(INFO, VHOST_CONFIG,
568                                         "Invalid argument for "
569                                         "vm2vm [0|1|2]\n");
570                                 us_vhost_usage(prgname);
571                                 return -1;
572                         }
573                         vm2vm_mode = (vm2vm_type)ret;
574                         break;
575
576                 case OPT_RX_RETRY_NUM:
577                         ret = parse_num_opt(optarg, 1);
578                         if (ret == -1) {
579                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
580                                 us_vhost_usage(prgname);
581                                 return -1;
582                         }
583                         enable_retry = ret;
584                         break;
585
586                 case OPT_TX_CSUM_NUM:
587                         ret = parse_num_opt(optarg, 1);
588                         if (ret == -1) {
589                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
590                                 us_vhost_usage(prgname);
591                                 return -1;
592                         }
593                         enable_tx_csum = ret;
594                         break;
595
596                 case OPT_TSO_NUM:
597                         ret = parse_num_opt(optarg, 1);
598                         if (ret == -1) {
599                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
600                                 us_vhost_usage(prgname);
601                                 return -1;
602                         }
603                         enable_tso = ret;
604                         break;
605
606                 case OPT_RX_RETRY_DELAY_NUM:
607                         ret = parse_num_opt(optarg, INT32_MAX);
608                         if (ret == -1) {
609                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
610                                 us_vhost_usage(prgname);
611                                 return -1;
612                         }
613                         burst_rx_delay_time = ret;
614                         break;
615
616                 case OPT_RX_RETRY_NUMB_NUM:
617                         ret = parse_num_opt(optarg, INT32_MAX);
618                         if (ret == -1) {
619                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
620                                 us_vhost_usage(prgname);
621                                 return -1;
622                         }
623                         burst_rx_retry_num = ret;
624                         break;
625
626                 case OPT_MERGEABLE_NUM:
627                         ret = parse_num_opt(optarg, 1);
628                         if (ret == -1) {
629                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
630                                 us_vhost_usage(prgname);
631                                 return -1;
632                         }
633                         mergeable = !!ret;
634                         if (ret)
635                                 vmdq_conf_default.rxmode.mtu = MAX_MTU;
636                         break;
637
638                 case OPT_STATS_NUM:
639                         ret = parse_num_opt(optarg, INT32_MAX);
640                         if (ret == -1) {
641                                 RTE_LOG(INFO, VHOST_CONFIG,
642                                         "Invalid argument for stats [0..N]\n");
643                                 us_vhost_usage(prgname);
644                                 return -1;
645                         }
646                         enable_stats = ret;
647                         break;
648
649                 /* Set socket file path. */
650                 case OPT_SOCKET_FILE_NUM:
651                         if (us_vhost_parse_socket_path(optarg) == -1) {
652                                 RTE_LOG(INFO, VHOST_CONFIG,
653                                 "Invalid argument for socket name (Max %d characters)\n",
654                                 PATH_MAX);
655                                 us_vhost_usage(prgname);
656                                 return -1;
657                         }
658                         break;
659
660                 case OPT_DMA_TYPE_NUM:
661                         dma_type = optarg;
662                         break;
663
664                 case OPT_DMAS_NUM:
665                         if (open_dma(optarg) == -1) {
666                                 RTE_LOG(INFO, VHOST_CONFIG,
667                                         "Wrong DMA args\n");
668                                 us_vhost_usage(prgname);
669                                 return -1;
670                         }
671                         async_vhost_driver = 1;
672                         break;
673
674                 case OPT_CLIENT_NUM:
675                         client_mode = 1;
676                         break;
677
678                 case OPT_BUILTIN_NET_DRIVER_NUM:
679                         builtin_net_driver = 1;
680                         break;
681
682                 /* Invalid option - print options. */
683                 default:
684                         us_vhost_usage(prgname);
685                         return -1;
686                 }
687         }
688
689         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
690                 if (enabled_port_mask & (1 << i))
691                         ports[num_ports++] = i;
692         }
693
694         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
695                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
696                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
697                 return -1;
698         }
699
700         return 0;
701 }
702
703 /*
704  * Update the global variable num_ports and the ports array according to the
705  * number of ports in the system, and return the number of valid ports.
706  */
707 static unsigned check_ports_num(unsigned nb_ports)
708 {
709         unsigned valid_num_ports = num_ports;
710         unsigned portid;
711
712         if (num_ports > nb_ports) {
713                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
714                         num_ports, nb_ports);
715                 num_ports = nb_ports;
716         }
717
718         for (portid = 0; portid < num_ports; portid ++) {
719                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
720                         RTE_LOG(INFO, VHOST_PORT,
721                                 "\nSpecified port ID(%u) is not valid\n",
722                                 ports[portid]);
723                         ports[portid] = INVALID_PORT_ID;
724                         valid_num_ports--;
725                 }
726         }
727         return valid_num_ports;
728 }
729
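/*
 * Find a vhost device that is ready for RX and matches the given MAC address.
 */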
730 static __rte_always_inline struct vhost_dev *
731 find_vhost_dev(struct rte_ether_addr *mac)
732 {
733         struct vhost_dev *vdev;
734
735         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
736                 if (vdev->ready == DEVICE_RX &&
737                     rte_is_same_ether_addr(mac, &vdev->mac_address))
738                         return vdev;
739         }
740
741         return NULL;
742 }
743
744 /*
745  * This function learns the MAC address of the device and registers it, along
746  * with a VLAN tag, with the VMDQ pool.
747  */
748 static int
749 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
750 {
751         struct rte_ether_hdr *pkt_hdr;
752         int i, ret;
753
754         /* Learn MAC address of guest device from packet */
755         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
756
757         if (find_vhost_dev(&pkt_hdr->src_addr)) {
758                 RTE_LOG(ERR, VHOST_DATA,
759                         "(%d) device is using a registered MAC!\n",
760                         vdev->vid);
761                 return -1;
762         }
763
764         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
765                 vdev->mac_address.addr_bytes[i] =
766                         pkt_hdr->src_addr.addr_bytes[i];
767
768         /* vlan_tag currently uses the device_id. */
769         vdev->vlan_tag = vlan_tags[vdev->vid];
770
771         /* Print out VMDQ registration info. */
772         RTE_LOG(INFO, VHOST_DATA,
773                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
774                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
775                 vdev->vlan_tag);
776
777         /* Register the MAC address. */
778         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
779                                 (uint32_t)vdev->vid + vmdq_pool_base);
780         if (ret)
781                 RTE_LOG(ERR, VHOST_DATA,
782                         "(%d) failed to add device MAC address to VMDQ\n",
783                         vdev->vid);
784
785         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
786
787         /* Set device as ready for RX. */
788         vdev->ready = DEVICE_RX;
789
790         return 0;
791 }
792
793 /*
794  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
795  * queue before disabling RX on the device.
796  */
797 static inline void
798 unlink_vmdq(struct vhost_dev *vdev)
799 {
800         unsigned i = 0;
801         unsigned rx_count;
802         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
803
804         if (vdev->ready == DEVICE_RX) {
805                 /*clear MAC and VLAN settings*/
806                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
807                 for (i = 0; i < 6; i++)
808                         vdev->mac_address.addr_bytes[i] = 0;
809
810                 vdev->vlan_tag = 0;
811
812                 /*Clear out the receive buffers*/
813                 rx_count = rte_eth_rx_burst(ports[0],
814                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
815
816                 while (rx_count) {
817                         for (i = 0; i < rx_count; i++)
818                                 rte_pktmbuf_free(pkts_burst[i]);
819
820                         rx_count = rte_eth_rx_burst(ports[0],
821                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
822                 }
823
824                 vdev->ready = DEVICE_MAC_LEARNING;
825         }
826 }
827
828 static inline void
829 free_pkts(struct rte_mbuf **pkts, uint16_t n)
830 {
831         while (n--)
832                 rte_pktmbuf_free(pkts[n]);
833 }
834
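/*
 * Poll the async channel for completed enqueue copies, free the completed
 * mbufs and update the in-flight packet counter.
 */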
835 static __rte_always_inline void
836 complete_async_pkts(struct vhost_dev *vdev)
837 {
838         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
839         uint16_t complete_count;
840
841         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
842                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
843         if (complete_count) {
844                 free_pkts(p_cpl, complete_count);
845                 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
846         }
847
848 }
849
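/*
 * Synchronously enqueue a single packet to the Rx ring of another vhost
 * device (VM2VM path) and update the stats if enabled.
 */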
850 static __rte_always_inline void
851 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
852             struct rte_mbuf *m)
853 {
854         uint16_t ret;
855
856         if (builtin_net_driver) {
857                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
858         } else {
859                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
860         }
861
862         if (enable_stats) {
863                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
864                                 __ATOMIC_SEQ_CST);
865                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
866                                 __ATOMIC_SEQ_CST);
867                 src_vdev->stats.tx_total++;
868                 src_vdev->stats.tx += ret;
869         }
870 }
871
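/*
 * Flush the per-core TX buffer of a vhost device into its virtio Rx ring,
 * using the builtin, async or sync enqueue path as configured.
 */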
872 static __rte_always_inline void
873 drain_vhost(struct vhost_dev *vdev)
874 {
875         uint16_t ret;
876         uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
877         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
878         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
879
880         if (builtin_net_driver) {
881                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
882         } else if (async_vhost_driver) {
883                 uint16_t enqueue_fail = 0;
884
885                 complete_async_pkts(vdev);
886                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
887                 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
888
889                 enqueue_fail = nr_xmit - ret;
890                 if (enqueue_fail)
891                         free_pkts(&m[ret], nr_xmit - ret);
892         } else {
893                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
894                                                 m, nr_xmit);
895         }
896
897         if (enable_stats) {
898                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
899                                 __ATOMIC_SEQ_CST);
900                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
901                                 __ATOMIC_SEQ_CST);
902         }
903
904         if (!async_vhost_driver)
905                 free_pkts(m, nr_xmit);
906 }
907
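/*
 * Flush any per-device vhost TX buffer on this core that has not been
 * drained within the drain timeout.
 */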
908 static __rte_always_inline void
909 drain_vhost_table(void)
910 {
911         uint16_t lcore_id = rte_lcore_id();
912         struct vhost_bufftable *vhost_txq;
913         struct vhost_dev *vdev;
914         uint64_t cur_tsc;
915
916         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
917                 if (unlikely(vdev->remove == 1))
918                         continue;
919
920                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
921                                                 + vdev->vid];
922
923                 cur_tsc = rte_rdtsc();
924                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
925                                 > MBUF_TABLE_DRAIN_TSC)) {
926                         RTE_LOG_DP(DEBUG, VHOST_DATA,
927                                 "Vhost TX queue drained after timeout with burst size %u\n",
928                                 vhost_txq->len);
929                         drain_vhost(vdev);
930                         vhost_txq->len = 0;
931                         vhost_txq->pre_tsc = cur_tsc;
932                 }
933         }
934 }
935
936 /*
937  * Check if the packet destination MAC address is for a local device. If so then put
938  * the packet on that device's RX queue. If not then return.
939  */
940 static __rte_always_inline int
941 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
942 {
943         struct rte_ether_hdr *pkt_hdr;
944         struct vhost_dev *dst_vdev;
945         struct vhost_bufftable *vhost_txq;
946         uint16_t lcore_id = rte_lcore_id();
947         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
948
949         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
950         if (!dst_vdev)
951                 return -1;
952
953         if (vdev->vid == dst_vdev->vid) {
954                 RTE_LOG_DP(DEBUG, VHOST_DATA,
955                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
956                         vdev->vid);
957                 return 0;
958         }
959
960         RTE_LOG_DP(DEBUG, VHOST_DATA,
961                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
962
963         if (unlikely(dst_vdev->remove)) {
964                 RTE_LOG_DP(DEBUG, VHOST_DATA,
965                         "(%d) device is marked for removal\n", dst_vdev->vid);
966                 return 0;
967         }
968
969         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
970         vhost_txq->m_table[vhost_txq->len++] = m;
971
972         if (enable_stats) {
973                 vdev->stats.tx_total++;
974                 vdev->stats.tx++;
975         }
976
977         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
978                 drain_vhost(dst_vdev);
979                 vhost_txq->len = 0;
980                 vhost_txq->pre_tsc = rte_rdtsc();
981         }
982         return 0;
983 }
984
985 /*
986  * Check if the destination MAC of a packet belongs to a local VM,
987  * and if so, get its VLAN tag and offset.
988  */
989 static __rte_always_inline int
990 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
991         uint32_t *offset, uint16_t *vlan_tag)
992 {
993         struct vhost_dev *dst_vdev;
994         struct rte_ether_hdr *pkt_hdr =
995                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
996
997         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
998         if (!dst_vdev)
999                 return 0;
1000
1001         if (vdev->vid == dst_vdev->vid) {
1002                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1003                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1004                         vdev->vid);
1005                 return -1;
1006         }
1007
1008         /*
1009          * HW VLAN strip will reduce the packet length by the
1010          * length of the VLAN tag, so we need to restore the
1011          * packet length by adding it back.
1012          */
1013         *offset  = VLAN_HLEN;
1014         *vlan_tag = vlan_tags[vdev->vid];
1015
1016         RTE_LOG_DP(DEBUG, VHOST_DATA,
1017                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1018                 vdev->vid, dst_vdev->vid, *vlan_tag);
1019
1020         return 0;
1021 }
1022
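/*
 * Parse the packet headers and set up the TSO/checksum offload metadata
 * (header lengths, ol_flags and pseudo-header checksum) for a packet that
 * is about to be sent to the physical port.
 */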
1023 static void virtio_tx_offload(struct rte_mbuf *m)
1024 {
1025         struct rte_net_hdr_lens hdr_lens;
1026         struct rte_ipv4_hdr *ipv4_hdr;
1027         struct rte_tcp_hdr *tcp_hdr;
1028         uint32_t ptype;
1029         void *l3_hdr;
1030
1031         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1032         m->l2_len = hdr_lens.l2_len;
1033         m->l3_len = hdr_lens.l3_len;
1034         m->l4_len = hdr_lens.l4_len;
1035
1036         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1037         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1038                 m->l2_len + m->l3_len);
1039
1040         m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1041         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1042                 m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1043                 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1044                 ipv4_hdr = l3_hdr;
1045                 ipv4_hdr->hdr_checksum = 0;
1046                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1047         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1048                 m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1049                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1050         }
1051 }
1052
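/*
 * Transmit the queued packets on the physical port and free any packets
 * that could not be sent.
 */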
1053 static __rte_always_inline void
1054 do_drain_mbuf_table(struct mbuf_table *tx_q)
1055 {
1056         uint16_t count;
1057
1058         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1059                                  tx_q->m_table, tx_q->len);
1060         if (unlikely(count < tx_q->len))
1061                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1062
1063         tx_q->len = 0;
1064 }
1065
1066 /*
1067  * This function routes the TX packet to the correct interface. This
1068  * may be a local device or the physical port.
1069  */
1070 static __rte_always_inline void
1071 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1072 {
1073         struct mbuf_table *tx_q;
1074         unsigned offset = 0;
1075         const uint16_t lcore_id = rte_lcore_id();
1076         struct rte_ether_hdr *nh;
1077
1078
1079         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1080         if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1081                 struct vhost_dev *vdev2;
1082
1083                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1084                         if (vdev2 != vdev)
1085                                 sync_virtio_xmit(vdev2, vdev, m);
1086                 }
1087                 goto queue2nic;
1088         }
1089
1090         /*check if destination is local VM*/
1091         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1092                 return;
1093
1094         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1095                 if (unlikely(find_local_dest(vdev, m, &offset,
1096                                              &vlan_tag) != 0)) {
1097                         rte_pktmbuf_free(m);
1098                         return;
1099                 }
1100         }
1101
1102         RTE_LOG_DP(DEBUG, VHOST_DATA,
1103                 "(%d) TX: MAC address is external\n", vdev->vid);
1104
1105 queue2nic:
1106
1107         /*Add packet to the port tx queue*/
1108         tx_q = &lcore_tx_queue[lcore_id];
1109
1110         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1111         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1112                 /* Guest has inserted the vlan tag. */
1113                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1114                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1115                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1116                         (vh->vlan_tci != vlan_tag_be))
1117                         vh->vlan_tci = vlan_tag_be;
1118         } else {
1119                 m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1120
1121                 /*
1122                  * Find the right seg to adjust the data len when offset is
1123                  * bigger than tail room size.
1124                  */
1125                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1126                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1127                                 m->data_len += offset;
1128                         else {
1129                                 struct rte_mbuf *seg = m;
1130
1131                                 while ((seg->next != NULL) &&
1132                                         (offset > rte_pktmbuf_tailroom(seg)))
1133                                         seg = seg->next;
1134
1135                                 seg->data_len += offset;
1136                         }
1137                         m->pkt_len += offset;
1138                 }
1139
1140                 m->vlan_tci = vlan_tag;
1141         }
1142
1143         if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1144                 virtio_tx_offload(m);
1145
1146         tx_q->m_table[tx_q->len++] = m;
1147         if (enable_stats) {
1148                 vdev->stats.tx_total++;
1149                 vdev->stats.tx++;
1150         }
1151
1152         if (unlikely(tx_q->len == MAX_PKT_BURST))
1153                 do_drain_mbuf_table(tx_q);
1154 }
1155
1156
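/*
 * Flush the physical port TX queue of this core if it has not been
 * drained within the drain timeout.
 */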
1157 static __rte_always_inline void
1158 drain_mbuf_table(struct mbuf_table *tx_q)
1159 {
1160         static uint64_t prev_tsc;
1161         uint64_t cur_tsc;
1162
1163         if (tx_q->len == 0)
1164                 return;
1165
1166         cur_tsc = rte_rdtsc();
1167         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1168                 prev_tsc = cur_tsc;
1169
1170                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1171                         "TX queue drained after timeout with burst size %u\n",
1172                         tx_q->len);
1173                 do_drain_mbuf_table(tx_q);
1174         }
1175 }
1176
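/*
 * Receive a burst of packets from the VMDQ queue bound to this vhost device
 * and enqueue them to the device's virtio Rx ring, retrying if enabled and
 * the ring is short of free slots.
 */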
1177 static __rte_always_inline void
1178 drain_eth_rx(struct vhost_dev *vdev)
1179 {
1180         uint16_t rx_count, enqueue_count;
1181         struct rte_mbuf *pkts[MAX_PKT_BURST];
1182
1183         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1184                                     pkts, MAX_PKT_BURST);
1185
1186         if (!rx_count)
1187                 return;
1188
1189         /*
1190          * When "enable_retry" is set, here we wait and retry when there
1191          * are not enough free slots in the queue to hold @rx_count packets,
1192          * to diminish packet loss.
1193          */
1194         if (enable_retry &&
1195             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1196                         VIRTIO_RXQ))) {
1197                 uint32_t retry;
1198
1199                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1200                         rte_delay_us(burst_rx_delay_time);
1201                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1202                                         VIRTIO_RXQ))
1203                                 break;
1204                 }
1205         }
1206
1207         if (builtin_net_driver) {
1208                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1209                                                 pkts, rx_count);
1210         } else if (async_vhost_driver) {
1211                 uint16_t enqueue_fail = 0;
1212
1213                 complete_async_pkts(vdev);
1214                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1215                                         VIRTIO_RXQ, pkts, rx_count);
1216                 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1217
1218                 enqueue_fail = rx_count - enqueue_count;
1219                 if (enqueue_fail)
1220                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1221
1222         } else {
1223                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1224                                                 pkts, rx_count);
1225         }
1226
1227         if (enable_stats) {
1228                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1229                                 __ATOMIC_SEQ_CST);
1230                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1231                                 __ATOMIC_SEQ_CST);
1232         }
1233
1234         if (!async_vhost_driver)
1235                 free_pkts(pkts, rx_count);
1236 }
1237
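/*
 * Dequeue a burst of packets from the guest virtio Tx queue, set up VMDQ for
 * the first packet if needed, and route each packet to its destination.
 */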
1238 static __rte_always_inline void
1239 drain_virtio_tx(struct vhost_dev *vdev)
1240 {
1241         struct rte_mbuf *pkts[MAX_PKT_BURST];
1242         uint16_t count;
1243         uint16_t i;
1244
1245         if (builtin_net_driver) {
1246                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1247                                         pkts, MAX_PKT_BURST);
1248         } else {
1249                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1250                                         mbuf_pool, pkts, MAX_PKT_BURST);
1251         }
1252
1253         /* setup VMDq for the first packet */
1254         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1255                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1256                         free_pkts(pkts, count);
1257         }
1258
1259         for (i = 0; i < count; ++i)
1260                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1261 }
1262
1263 /*
1264  * Main function of vhost-switch. It basically does:
1265  *
1266  * for each vhost device {
1267  *    - drain_eth_rx()
1268  *
1269  *      Which drains the host eth Rx queue linked to the vhost device,
1270  *      and delivers all of the packets to the guest virtio Rx ring
1271  *      associated with this vhost device.
1272  *
1273  *    - drain_virtio_tx()
1274  *
1275  *      Which drains the guest virtio Tx queue and delivers all of the
1276  *      packets to their target, which could be another vhost device or the
1277  *      physical eth dev. The routing is done in function "virtio_tx_route".
1278  * }
1279  */
1280 static int
1281 switch_worker(void *arg __rte_unused)
1282 {
1283         unsigned i;
1284         unsigned lcore_id = rte_lcore_id();
1285         struct vhost_dev *vdev;
1286         struct mbuf_table *tx_q;
1287
1288         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1289
1290         tx_q = &lcore_tx_queue[lcore_id];
1291         for (i = 0; i < rte_lcore_count(); i++) {
1292                 if (lcore_ids[i] == lcore_id) {
1293                         tx_q->txq_id = i;
1294                         break;
1295                 }
1296         }
1297
1298         while(1) {
1299                 drain_mbuf_table(tx_q);
1300                 drain_vhost_table();
1301                 /*
1302                  * Inform the configuration core that we have exited the
1303                  * linked list and that no devices are in use if requested.
1304                  */
1305                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1306                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1307
1308                 /*
1309                  * Process vhost devices
1310                  */
1311                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1312                               lcore_vdev_entry) {
1313                         if (unlikely(vdev->remove)) {
1314                                 unlink_vmdq(vdev);
1315                                 vdev->ready = DEVICE_SAFE_REMOVE;
1316                                 continue;
1317                         }
1318
1319                         if (likely(vdev->ready == DEVICE_RX))
1320                                 drain_eth_rx(vdev);
1321
1322                         if (likely(!vdev->remove))
1323                                 drain_virtio_tx(vdev);
1324                 }
1325         }
1326
1327         return 0;
1328 }
1329
1330 /*
1331  * Remove a device from the specific data core linked list and from the
1332  * main linked list. Synchronization occurs through the use of the
1333  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1334  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1335  */
1336 static void
1337 destroy_device(int vid)
1338 {
1339         struct vhost_dev *vdev = NULL;
1340         int lcore;
1341         uint16_t i;
1342
1343         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1344                 if (vdev->vid == vid)
1345                         break;
1346         }
1347         if (!vdev)
1348                 return;
1349         /*set the remove flag. */
1350         vdev->remove = 1;
1351         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1352                 rte_pause();
1353         }
1354
1355         for (i = 0; i < RTE_MAX_LCORE; i++)
1356                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1357
1358         if (builtin_net_driver)
1359                 vs_vhost_net_remove(vdev);
1360
1361         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1362                      lcore_vdev_entry);
1363         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1364
1365
1366         /* Set the dev_removal_flag on each lcore. */
1367         RTE_LCORE_FOREACH_WORKER(lcore)
1368                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1369
1370         /*
1371          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1372          * we can be sure that they can no longer access the device removed
1373          * from the linked lists and that the devices are no longer in use.
1374          */
1375         RTE_LCORE_FOREACH_WORKER(lcore) {
1376                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1377                         rte_pause();
1378         }
1379
1380         lcore_info[vdev->coreid].device_num--;
1381
1382         RTE_LOG(INFO, VHOST_DATA,
1383                 "(%d) device has been removed from data core\n",
1384                 vdev->vid);
1385
1386         if (async_vhost_driver) {
1387                 uint16_t n_pkt = 0;
1388                 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1389
1390                 while (vdev->pkts_inflight) {
1391                         n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1392                                                 m_cpl, vdev->pkts_inflight);
1393                         free_pkts(m_cpl, n_pkt);
1394                         __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1395                 }
1396
1397                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1398         }
1399
1400         rte_free(vdev);
1401 }
1402
1403 /*
1404  * A new device is added to a data core. First the device is added to the main linked list
1405  * and then allocated to a specific data core.
1406  */
1407 static int
1408 new_device(int vid)
1409 {
1410         int lcore, core_add = 0;
1411         uint16_t i;
1412         uint32_t device_num_min = num_devices;
1413         struct vhost_dev *vdev;
1414         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1415         if (vdev == NULL) {
1416                 RTE_LOG(INFO, VHOST_DATA,
1417                         "(%d) couldn't allocate memory for vhost dev\n",
1418                         vid);
1419                 return -1;
1420         }
1421         vdev->vid = vid;
1422
1423         for (i = 0; i < RTE_MAX_LCORE; i++) {
1424                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1425                         = rte_zmalloc("vhost bufftable",
1426                                 sizeof(struct vhost_bufftable),
1427                                 RTE_CACHE_LINE_SIZE);
1428
1429                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1430                         RTE_LOG(INFO, VHOST_DATA,
1431                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1432                         return -1;
1433                 }
1434         }
1435
1436         if (builtin_net_driver)
1437                 vs_vhost_net_setup(vdev);
1438
1439         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1440         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1441
1442         /*reset ready flag*/
1443         vdev->ready = DEVICE_MAC_LEARNING;
1444         vdev->remove = 0;
1445
1446         /* Find a suitable lcore to add the device. */
1447         RTE_LCORE_FOREACH_WORKER(lcore) {
1448                 if (lcore_info[lcore].device_num < device_num_min) {
1449                         device_num_min = lcore_info[lcore].device_num;
1450                         core_add = lcore;
1451                 }
1452         }
1453         vdev->coreid = core_add;
1454
1455         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1456                           lcore_vdev_entry);
1457         lcore_info[vdev->coreid].device_num++;
1458
1459         /* Disable notifications. */
1460         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1461         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1462
1463         RTE_LOG(INFO, VHOST_DATA,
1464                 "(%d) device has been added to data core %d\n",
1465                 vid, vdev->coreid);
1466
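        /*
         * For the async data path, register an async channel on this device's
         * RX virtqueue; with the "ioat" dma_type the IOAT callbacks below
         * perform the actual copies and completion polling.
         */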
1467         if (async_vhost_driver) {
1468                 struct rte_vhost_async_config config = {0};
1469                 struct rte_vhost_async_channel_ops channel_ops;
1470
1471                 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1472                         channel_ops.transfer_data = ioat_transfer_data_cb;
1473                         channel_ops.check_completed_copies =
1474                                 ioat_check_completed_copies_cb;
1475
1476                         config.features = RTE_VHOST_ASYNC_INORDER;
1477
1478                         return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1479                                 config, &channel_ops);
1480                 }
1481         }
1482
1483         return 0;
1484 }
1485
1486 static int
1487 vring_state_changed(int vid, uint16_t queue_id, int enable)
1488 {
1489         struct vhost_dev *vdev = NULL;
1490
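        /* Look up the vhost device that owns this vring. */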
1491         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1492                 if (vdev->vid == vid)
1493                         break;
1494         }
1495         if (!vdev)
1496                 return -1;
1497
1498         if (queue_id != VIRTIO_RXQ)
1499                 return 0;
1500
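        /*
         * When the RX queue is disabled with async copies still in flight,
         * drain those packets and free their mbufs here.
         */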
1501         if (async_vhost_driver) {
1502                 if (!enable) {
1503                         uint16_t n_pkt = 0;
1504                         struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1505
1506                         while (vdev->pkts_inflight) {
1507                                 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1508                                                         m_cpl, vdev->pkts_inflight);
1509                                 free_pkts(m_cpl, n_pkt);
1510                                 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1511                         }
1512                 }
1513         }
1514
1515         return 0;
1516 }
1517
1518 /*
1519  * These callbacks allow devices to be added to the data core once their
1520  * configuration has fully completed.
1521  */
1522 static const struct vhost_device_ops virtio_net_device_ops =
1523 {
1524         .new_device =  new_device,
1525         .destroy_device = destroy_device,
1526         .vring_state_changed = vring_state_changed,
1527 };
1528
1529 /*
1530  * This thread wakes up periodically and prints statistics if the user has
1531  * enabled them.
1532  */
1533 static void *
1534 print_stats(__rte_unused void *arg)
1535 {
1536         struct vhost_dev *vdev;
1537         uint64_t tx_dropped, rx_dropped;
1538         uint64_t tx, tx_total, rx, rx_total;
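        /* ANSI escape sequences: clear the screen and home the cursor. */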
1539         const char clr[] = { 27, '[', '2', 'J', '\0' };
1540         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1541
1542         while (1) {
1543                 sleep(enable_stats);
1544
1545                 /* Clear screen and move to top left */
1546                 printf("%s%s\n", clr, top_left);
1547                 printf("Device statistics =================================\n");
1548
1549                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1550                         tx_total   = vdev->stats.tx_total;
1551                         tx         = vdev->stats.tx;
1552                         tx_dropped = tx_total - tx;
1553
1554                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1555                                 __ATOMIC_SEQ_CST);
1556                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1557                                 __ATOMIC_SEQ_CST);
1558                         rx_dropped = rx_total - rx;
1559
1560                         printf("Statistics for device %d\n"
1561                                 "-----------------------\n"
1562                                 "TX total:              %" PRIu64 "\n"
1563                                 "TX dropped:            %" PRIu64 "\n"
1564                                 "TX successful:         %" PRIu64 "\n"
1565                                 "RX total:              %" PRIu64 "\n"
1566                                 "RX dropped:            %" PRIu64 "\n"
1567                                 "RX successful:         %" PRIu64 "\n",
1568                                 vdev->vid,
1569                                 tx_total, tx_dropped, tx,
1570                                 rx_total, rx_dropped, rx);
1571                 }
1572
1573                 printf("===================================================\n");
1574
1575                 fflush(stdout);
1576         }
1577
1578         return NULL;
1579 }
1580
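/*
 * Socket paths are stored back to back in a single buffer, one PATH_MAX-sized
 * slot per socket, so the i-th path starts at socket_files + i * PATH_MAX.
 */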
1581 static void
1582 unregister_drivers(int socket_num)
1583 {
1584         int i, ret;
1585
1586         for (i = 0; i < socket_num; i++) {
1587                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1588                 if (ret != 0)
1589                         RTE_LOG(ERR, VHOST_CONFIG,
1590                                 "Fail to unregister vhost driver for %s.\n",
1591                                 socket_files + i * PATH_MAX);
1592         }
1593 }
1594
1595 /* When we receive an INT signal, unregister the vhost drivers. */
1596 static void
1597 sigint_handler(__rte_unused int signum)
1598 {
1599         /* Unregister vhost driver. */
1600         unregister_drivers(nb_sockets);
1601
1602         exit(0);
1603 }
1604
1605 /*
1606  * While creating an mbuf pool, one key thing is to figure out how
1607  * many mbuf entries are enough for our use. FYI, here are some
1608  * guidelines:
1609  *
1610  * - Each RX queue reserves @nr_rx_desc mbufs at queue setup stage.
1611  *
1612  * - For each switch core (a CPU core that does the packet switching),
1613  *   we also need to reserve some mbufs for receiving the packets from
1614  *   the virtio Tx queue. How many are enough depends on the usage; it's
1615  *   normally a simple calculation like the following:
1616  *
1617  *       MAX_PKT_BURST * max packet size / mbuf size
1618  *
1619  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1620  *
1621  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1622  *   mbufs for receiving the packets from the physical NIC device.
1623  *
1624  * - We also need to make sure that, for each switch core, we have
1625  *   allocated enough mbufs to fill up the mbuf cache.
1626  */
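/*
 * Illustrative sizing, assuming MAX_PKT_BURST is 32 and MBUF_DATA_SIZE is the
 * default 2176 bytes (2048 bytes of data room plus 128 bytes of headroom):
 * a mergeable-buffer setup with a 9000 byte MTU needs roughly
 *
 *     (9000 + 2176) * 32 / 2048 ~= 174
 *
 * extra mbufs per switch core, on top of the 1024 descriptors reserved per
 * RX queue and the 128-entry mbuf cache per core.
 */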
1627 static void
1628 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1629         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1630 {
1631         uint32_t nr_mbufs;
1632         uint32_t nr_mbufs_per_core;
1633         uint32_t mtu = 1500;
1634
1635         if (mergeable)
1636                 mtu = 9000;
1637         if (enable_tso)
1638                 mtu = 64 * 1024;
1639
1640         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1641                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1642         nr_mbufs_per_core += nr_rx_desc;
1643         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1644
1645         nr_mbufs  = nr_queues * nr_rx_desc;
1646         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1647         nr_mbufs *= nr_port;
1648
1649         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1650                                             nr_mbuf_cache, 0, mbuf_size,
1651                                             rte_socket_id());
1652         if (mbuf_pool == NULL)
1653                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1654 }
1655
1656 /*
1657  * Main function, does initialisation and calls the per-lcore functions.
1658  */
1659 int
1660 main(int argc, char *argv[])
1661 {
1662         unsigned lcore_id, core_id = 0;
1663         unsigned nb_ports, valid_num_ports;
1664         int ret, i;
1665         uint16_t portid;
1666         static pthread_t tid;
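        /*
         * Request ethdev-compliant mbuf offload flags from the vhost library
         * (RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS) so the forwarding path sees
         * the standard flag semantics.
         */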
1667         uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1668
1669         signal(SIGINT, sigint_handler);
1670
1671         /* init EAL */
1672         ret = rte_eal_init(argc, argv);
1673         if (ret < 0)
1674                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1675         argc -= ret;
1676         argv += ret;
1677
1678         /* parse app arguments */
1679         ret = us_vhost_parse_args(argc, argv);
1680         if (ret < 0)
1681                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1682
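        /* Init every lcore's device list and record the enabled lcore ids. */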
1683         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1684                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1685
1686                 if (rte_lcore_is_enabled(lcore_id))
1687                         lcore_ids[core_id++] = lcore_id;
1688         }
1689
1690         if (rte_lcore_count() > RTE_MAX_LCORE)
1691                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1692
1693         /* Get the number of physical ports. */
1694         nb_ports = rte_eth_dev_count_avail();
1695
1696         /*
1697          * Update the global variable num_ports and the global array ports[],
1698          * and derive the valid port count from the ports available in the system.
1699          */
1700         valid_num_ports = check_ports_num(nb_ports);
1701
1702         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1703                 RTE_LOG(INFO, VHOST_PORT, "Currently %u ports are enabled, "
1704                         "but only %u port can be supported\n", num_ports, MAX_SUP_PORTS);
1705                 return -1;
1706         }
1707
1708         /*
1709          * FIXME: here we are trying to allocate mbufs big enough for
1710          * @MAX_QUEUES, but the truth is we're never going to use that
1711          * many queues here. We probably should only do allocation for
1712          * those queues we are going to use.
1713          */
1714         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1715                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1716
1717         if (vm2vm_mode == VM2VM_HARDWARE) {
1718                 /* Enable VT loop back to let L2 switch to do it. */
1719                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1720                 RTE_LOG(DEBUG, VHOST_CONFIG,
1721                         "Enable loop back for L2 switch in vmdq.\n");
1722         }
1723
1724         /* initialize all ports */
1725         RTE_ETH_FOREACH_DEV(portid) {
1726                 /* skip ports that are not enabled */
1727                 if ((enabled_port_mask & (1 << portid)) == 0) {
1728                         RTE_LOG(INFO, VHOST_PORT,
1729                                 "Skipping disabled port %d\n", portid);
1730                         continue;
1731                 }
1732                 if (port_init(portid) != 0)
1733                         rte_exit(EXIT_FAILURE,
1734                                 "Cannot initialize network ports\n");
1735         }
1736
1737         /* Enable stats if the user option is set. */
1738         if (enable_stats) {
1739                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1740                                         print_stats, NULL);
1741                 if (ret < 0)
1742                         rte_exit(EXIT_FAILURE,
1743                                 "Cannot create print-stats thread\n");
1744         }
1745
1746         /* Launch all data cores. */
1747         RTE_LCORE_FOREACH_WORKER(lcore_id)
1748                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1749
1750         if (client_mode)
1751                 flags |= RTE_VHOST_USER_CLIENT;
1752
1753         /* Register vhost user driver to handle vhost messages. */
1754         for (i = 0; i < nb_sockets; i++) {
1755                 char *file = socket_files + i * PATH_MAX;
1756
1757                 if (async_vhost_driver)
1758                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1759
1760                 ret = rte_vhost_driver_register(file, flags);
1761                 if (ret != 0) {
1762                         unregister_drivers(i);
1763                         rte_exit(EXIT_FAILURE,
1764                                 "vhost driver register failure.\n");
1765                 }
1766
1767                 if (builtin_net_driver)
1768                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1769
1770                 if (mergeable == 0) {
1771                         rte_vhost_driver_disable_features(file,
1772                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1773                 }
1774
1775                 if (enable_tx_csum == 0) {
1776                         rte_vhost_driver_disable_features(file,
1777                                 1ULL << VIRTIO_NET_F_CSUM);
1778                 }
1779
1780                 if (enable_tso == 0) {
1781                         rte_vhost_driver_disable_features(file,
1782                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1783                         rte_vhost_driver_disable_features(file,
1784                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1785                         rte_vhost_driver_disable_features(file,
1786                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1787                         rte_vhost_driver_disable_features(file,
1788                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1789                 }
1790
1791                 if (promiscuous) {
1792                         rte_vhost_driver_enable_features(file,
1793                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1794                 }
1795
1796                 ret = rte_vhost_driver_callback_register(file,
1797                         &virtio_net_device_ops);
1798                 if (ret != 0) {
1799                         rte_exit(EXIT_FAILURE,
1800                                 "failed to register vhost driver callbacks.\n");
1801                 }
1802
1803                 if (rte_vhost_driver_start(file) < 0) {
1804                         rte_exit(EXIT_FAILURE,
1805                                 "failed to start vhost driver.\n");
1806                 }
1807         }
1808
1809         RTE_LCORE_FOREACH_WORKER(lcore_id)
1810                 rte_eal_wait_lcore(lcore_id);
1811
1812         /* clean up the EAL */
1813         rte_eal_cleanup();
1814
1815         return 0;
1816 }