ethdev: fix max Rx packet length
[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27
28 #include "ioat.h"
29 #include "main.h"
30
31 #ifndef MAX_QUEUES
32 #define MAX_QUEUES 128
33 #endif
34
35 /* the maximum number of external ports supported */
36 #define MAX_SUP_PORTS 1
37
38 #define MBUF_CACHE_SIZE 128
39 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
40
41 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
42
43 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
44 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
45
46 #define JUMBO_FRAME_MAX_SIZE    0x2600
47 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
48
49 /* State of virtio device. */
50 #define DEVICE_MAC_LEARNING 0
51 #define DEVICE_RX                       1
52 #define DEVICE_SAFE_REMOVE      2
53
54 /* Configurable number of RX/TX ring descriptors */
55 #define RTE_TEST_RX_DESC_DEFAULT 1024
56 #define RTE_TEST_TX_DESC_DEFAULT 512
57
58 #define INVALID_PORT_ID 0xFF
59
60 /* mask of enabled ports */
61 static uint32_t enabled_port_mask = 0;
62
63 /* Promiscuous mode */
64 static uint32_t promiscuous;
65
66 /* Number of devices/queues to support. */
67 static uint32_t num_queues = 0;
68 static uint32_t num_devices;
69
70 static struct rte_mempool *mbuf_pool;
71 static int mergeable;
72
73 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
74 typedef enum {
75         VM2VM_DISABLED = 0,
76         VM2VM_SOFTWARE = 1,
77         VM2VM_HARDWARE = 2,
78         VM2VM_LAST
79 } vm2vm_type;
80 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
81
82 /* Enable stats. */
83 static uint32_t enable_stats = 0;
84 /* Enable retries on RX. */
85 static uint32_t enable_retry = 1;
86
87 /* Disable TX checksum offload */
88 static uint32_t enable_tx_csum;
89
90 /* Disable TSO offload */
91 static uint32_t enable_tso;
92
93 static int client_mode;
94
95 static int builtin_net_driver;
96
97 static int async_vhost_driver;
98
99 static char *dma_type;
100
101 /* Specify the timeout (in microseconds) between retries on RX. */
102 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
103 /* Specify the number of retries on RX. */
104 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
105
106 /* Socket file paths. Can be set by user */
107 static char *socket_files;
108 static int nb_sockets;
109
110 /* Empty VMDQ configuration structure. Filled in programmatically. */
111 static struct rte_eth_conf vmdq_conf_default = {
112         .rxmode = {
113                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
114                 .split_hdr_size = 0,
115                 /*
116                  * VLAN strip is necessary for 1G NICs such as the I350;
117                  * it fixes a bug where IPv4 forwarding in the guest cannot
118                  * forward packets from one virtio dev to another virtio dev.
119                  */
120                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
121         },
122
123         .txmode = {
124                 .mq_mode = ETH_MQ_TX_NONE,
125                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
126                              DEV_TX_OFFLOAD_TCP_CKSUM |
127                              DEV_TX_OFFLOAD_VLAN_INSERT |
128                              DEV_TX_OFFLOAD_MULTI_SEGS |
129                              DEV_TX_OFFLOAD_TCP_TSO),
130         },
131         .rx_adv_conf = {
132                 /*
133                  * should be overridden separately in code with
134                  * appropriate values
135                  */
136                 .vmdq_rx_conf = {
137                         .nb_queue_pools = ETH_8_POOLS,
138                         .enable_default_pool = 0,
139                         .default_pool = 0,
140                         .nb_pool_maps = 0,
141                         .pool_map = {{0, 0},},
142                 },
143         },
144 };
145
146
147 static unsigned lcore_ids[RTE_MAX_LCORE];
148 static uint16_t ports[RTE_MAX_ETHPORTS];
149 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
150 static uint16_t num_pf_queues, num_vmdq_queues;
151 static uint16_t vmdq_pool_base, vmdq_queue_base;
152 static uint16_t queues_per_pool;
153
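/* VLAN tags assigned to the VMDQ pools, indexed by vhost device id. */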
154 const uint16_t vlan_tags[] = {
155         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
156         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
157         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
158         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
159         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
160         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
161         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
162         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
163 };
164
165 /* ethernet addresses of ports */
166 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
167
168 static struct vhost_dev_tailq_list vhost_dev_list =
169         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
170
171 static struct lcore_info lcore_info[RTE_MAX_LCORE];
172
173 /* Used for queueing bursts of TX packets. */
174 struct mbuf_table {
175         unsigned len;
176         unsigned txq_id;
177         struct rte_mbuf *m_table[MAX_PKT_BURST];
178 };
179
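/* Per-lcore buffer for batching packets destined to one vhost device. */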
180 struct vhost_bufftable {
181         uint32_t len;
182         uint64_t pre_tsc;
183         struct rte_mbuf *m_table[MAX_PKT_BURST];
184 };
185
186 /* TX queue for each data core. */
187 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
188
189 /*
190  * Vhost TX buffer for each data core.
191  * Every data core maintains a TX buffer for every vhost device,
192  * which is used for batch pkts enqueue for higher performance.
193  */
194 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
195
196 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
197                                  / US_PER_S * BURST_TX_DRAIN_US)
198 #define VLAN_HLEN       4
199
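/*
 * Open a DMA channel for the vhost async data path.
 * Only the "ioat" DMA type is currently supported.
 */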
200 static inline int
201 open_dma(const char *value)
202 {
203         if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
204                 return open_ioat(value);
205
206         return -1;
207 }
208
209 /*
210  * Builds up the correct configuration for VMDQ VLAN pool map
211  * according to the pool & queue limits.
212  */
213 static inline int
214 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
215 {
216         struct rte_eth_vmdq_rx_conf conf;
217         struct rte_eth_vmdq_rx_conf *def_conf =
218                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
219         unsigned i;
220
221         memset(&conf, 0, sizeof(conf));
222         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
223         conf.nb_pool_maps = num_devices;
224         conf.enable_loop_back = def_conf->enable_loop_back;
225         conf.rx_mode = def_conf->rx_mode;
226
227         for (i = 0; i < conf.nb_pool_maps; i++) {
228                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
229                 conf.pool_map[i].pools = (1UL << i);
230         }
231
232         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
233         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
234                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
235         return 0;
236 }
237
238 /*
239  * Initialises a given port using global settings, with the Rx buffers
240  * coming from the global mbuf_pool.
241  */
242 static inline int
243 port_init(uint16_t port)
244 {
245         struct rte_eth_dev_info dev_info;
246         struct rte_eth_conf port_conf;
247         struct rte_eth_rxconf *rxconf;
248         struct rte_eth_txconf *txconf;
249         int16_t rx_rings, tx_rings;
250         uint16_t rx_ring_size, tx_ring_size;
251         int retval;
252         uint16_t q;
253
254         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
255         retval = rte_eth_dev_info_get(port, &dev_info);
256         if (retval != 0) {
257                 RTE_LOG(ERR, VHOST_PORT,
258                         "Error during getting device (port %u) info: %s\n",
259                         port, strerror(-retval));
260
261                 return retval;
262         }
263
264         rxconf = &dev_info.default_rxconf;
265         txconf = &dev_info.default_txconf;
266         rxconf->rx_drop_en = 1;
267
268         /* Configure the number of supported virtio devices based on VMDQ limits. */
269         num_devices = dev_info.max_vmdq_pools;
270
271         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
272         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
273
274         tx_rings = (uint16_t)rte_lcore_count();
275
276         /* Get port configuration. */
277         retval = get_eth_conf(&port_conf, num_devices);
278         if (retval < 0)
279                 return retval;
280         /* NIC queues are divided into pf queues and vmdq queues.  */
281         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
282         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
283         num_vmdq_queues = num_devices * queues_per_pool;
284         num_queues = num_pf_queues + num_vmdq_queues;
285         vmdq_queue_base = dev_info.vmdq_queue_base;
286         vmdq_pool_base  = dev_info.vmdq_pool_base;
287         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
288                 num_pf_queues, num_devices, queues_per_pool);
289
290         if (!rte_eth_dev_is_valid_port(port))
291                 return -1;
292
293         rx_rings = (uint16_t)dev_info.max_rx_queues;
294         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
295                 port_conf.txmode.offloads |=
296                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
297         /* Configure ethernet device. */
298         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
299         if (retval != 0) {
300                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
301                         port, strerror(-retval));
302                 return retval;
303         }
304
305         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
306                 &tx_ring_size);
307         if (retval != 0) {
308                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
309                         "for port %u: %s.\n", port, strerror(-retval));
310                 return retval;
311         }
312         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
313                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
314                         "for Rx queues on port %u.\n", port);
315                 return -1;
316         }
317
318         /* Setup the queues. */
319         rxconf->offloads = port_conf.rxmode.offloads;
320         for (q = 0; q < rx_rings; q ++) {
321                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
322                                                 rte_eth_dev_socket_id(port),
323                                                 rxconf,
324                                                 mbuf_pool);
325                 if (retval < 0) {
326                         RTE_LOG(ERR, VHOST_PORT,
327                                 "Failed to setup rx queue %u of port %u: %s.\n",
328                                 q, port, strerror(-retval));
329                         return retval;
330                 }
331         }
332         txconf->offloads = port_conf.txmode.offloads;
333         for (q = 0; q < tx_rings; q ++) {
334                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
335                                                 rte_eth_dev_socket_id(port),
336                                                 txconf);
337                 if (retval < 0) {
338                         RTE_LOG(ERR, VHOST_PORT,
339                                 "Failed to setup tx queue %u of port %u: %s.\n",
340                                 q, port, strerror(-retval));
341                         return retval;
342                 }
343         }
344
345         /* Start the device. */
346         retval  = rte_eth_dev_start(port);
347         if (retval < 0) {
348                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
349                         port, strerror(-retval));
350                 return retval;
351         }
352
353         if (promiscuous) {
354                 retval = rte_eth_promiscuous_enable(port);
355                 if (retval != 0) {
356                         RTE_LOG(ERR, VHOST_PORT,
357                                 "Failed to enable promiscuous mode on port %u: %s\n",
358                                 port, rte_strerror(-retval));
359                         return retval;
360                 }
361         }
362
363         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
364         if (retval < 0) {
365                 RTE_LOG(ERR, VHOST_PORT,
366                         "Failed to get MAC address on port %u: %s\n",
367                         port, rte_strerror(-retval));
368                 return retval;
369         }
370
371         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
372         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
373                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
374                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
375
376         return 0;
377 }
378
379 /*
380  * Set socket file path.
381  */
382 static int
383 us_vhost_parse_socket_path(const char *q_arg)
384 {
385         char *old;
386
387         /* parse number string */
388         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
389                 return -1;
390
391         old = socket_files;
392         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
393         if (socket_files == NULL) {
394                 free(old);
395                 return -1;
396         }
397
398         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
399         nb_sockets++;
400
401         return 0;
402 }
403
404 /*
405  * Parse the portmask provided at run time.
406  */
407 static int
408 parse_portmask(const char *portmask)
409 {
410         char *end = NULL;
411         unsigned long pm;
412
413         errno = 0;
414
415         /* parse hexadecimal string */
416         pm = strtoul(portmask, &end, 16);
417         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
418                 return 0;
419
420         return pm;
421
422 }
423
424 /*
425  * Parse num options at run time.
426  */
427 static int
428 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
429 {
430         char *end = NULL;
431         unsigned long num;
432
433         errno = 0;
434
435         /* parse unsigned int string */
436         num = strtoul(q_arg, &end, 10);
437         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
438                 return -1;
439
440         if (num > max_valid_value)
441                 return -1;
442
443         return num;
444
445 }
446
447 /*
448  * Display usage
449  */
450 static void
451 us_vhost_usage(const char *prgname)
452 {
453         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
454         "               --vm2vm [0|1|2]\n"
455         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
456         "               --socket-file <path>\n"
457         "               --nb-devices ND\n"
458         "               -p PORTMASK: Set mask for ports to be used by application\n"
459         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
460         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
461         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
462         "               --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
463         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
464         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
465         "               --socket-file: The path of the socket file.\n"
466         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
467         "               --tso [0|1] disable/enable TCP segmentation offload.\n"
468         "               --client register a vhost-user socket in client mode.\n"
469         "               --dma-type register the DMA type for the vhost async driver. Only \"ioat\" is supported for now.\n"
470         "               --dmas register a DMA channel for a specific vhost device.\n",
471                prgname);
472 }
473
474 enum {
475 #define OPT_VM2VM               "vm2vm"
476         OPT_VM2VM_NUM = 256,
477 #define OPT_RX_RETRY            "rx-retry"
478         OPT_RX_RETRY_NUM,
479 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
480         OPT_RX_RETRY_DELAY_NUM,
481 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
482         OPT_RX_RETRY_NUMB_NUM,
483 #define OPT_MERGEABLE           "mergeable"
484         OPT_MERGEABLE_NUM,
485 #define OPT_STATS               "stats"
486         OPT_STATS_NUM,
487 #define OPT_SOCKET_FILE         "socket-file"
488         OPT_SOCKET_FILE_NUM,
489 #define OPT_TX_CSUM             "tx-csum"
490         OPT_TX_CSUM_NUM,
491 #define OPT_TSO                 "tso"
492         OPT_TSO_NUM,
493 #define OPT_CLIENT              "client"
494         OPT_CLIENT_NUM,
495 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
496         OPT_BUILTIN_NET_DRIVER_NUM,
497 #define OPT_DMA_TYPE            "dma-type"
498         OPT_DMA_TYPE_NUM,
499 #define OPT_DMAS                "dmas"
500         OPT_DMAS_NUM,
501 };
502
503 /*
504  * Parse the arguments given in the command line of the application.
505  */
506 static int
507 us_vhost_parse_args(int argc, char **argv)
508 {
509         int opt, ret;
510         int option_index;
511         unsigned i;
512         const char *prgname = argv[0];
513         static struct option long_option[] = {
514                 {OPT_VM2VM, required_argument,
515                                 NULL, OPT_VM2VM_NUM},
516                 {OPT_RX_RETRY, required_argument,
517                                 NULL, OPT_RX_RETRY_NUM},
518                 {OPT_RX_RETRY_DELAY, required_argument,
519                                 NULL, OPT_RX_RETRY_DELAY_NUM},
520                 {OPT_RX_RETRY_NUMB, required_argument,
521                                 NULL, OPT_RX_RETRY_NUMB_NUM},
522                 {OPT_MERGEABLE, required_argument,
523                                 NULL, OPT_MERGEABLE_NUM},
524                 {OPT_STATS, required_argument,
525                                 NULL, OPT_STATS_NUM},
526                 {OPT_SOCKET_FILE, required_argument,
527                                 NULL, OPT_SOCKET_FILE_NUM},
528                 {OPT_TX_CSUM, required_argument,
529                                 NULL, OPT_TX_CSUM_NUM},
530                 {OPT_TSO, required_argument,
531                                 NULL, OPT_TSO_NUM},
532                 {OPT_CLIENT, no_argument,
533                                 NULL, OPT_CLIENT_NUM},
534                 {OPT_BUILTIN_NET_DRIVER, no_argument,
535                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
536                 {OPT_DMA_TYPE, required_argument,
537                                 NULL, OPT_DMA_TYPE_NUM},
538                 {OPT_DMAS, required_argument,
539                                 NULL, OPT_DMAS_NUM},
540                 {NULL, 0, 0, 0},
541         };
542
543         /* Parse command line */
544         while ((opt = getopt_long(argc, argv, "p:P",
545                         long_option, &option_index)) != EOF) {
546                 switch (opt) {
547                 /* Portmask */
548                 case 'p':
549                         enabled_port_mask = parse_portmask(optarg);
550                         if (enabled_port_mask == 0) {
551                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
552                                 us_vhost_usage(prgname);
553                                 return -1;
554                         }
555                         break;
556
557                 case 'P':
558                         promiscuous = 1;
559                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
560                                 ETH_VMDQ_ACCEPT_BROADCAST |
561                                 ETH_VMDQ_ACCEPT_MULTICAST;
562                         break;
563
564                 case OPT_VM2VM_NUM:
565                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
566                         if (ret == -1) {
567                                 RTE_LOG(INFO, VHOST_CONFIG,
568                                         "Invalid argument for "
569                                         "vm2vm [0|1|2]\n");
570                                 us_vhost_usage(prgname);
571                                 return -1;
572                         }
573                         vm2vm_mode = (vm2vm_type)ret;
574                         break;
575
576                 case OPT_RX_RETRY_NUM:
577                         ret = parse_num_opt(optarg, 1);
578                         if (ret == -1) {
579                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
580                                 us_vhost_usage(prgname);
581                                 return -1;
582                         }
583                         enable_retry = ret;
584                         break;
585
586                 case OPT_TX_CSUM_NUM:
587                         ret = parse_num_opt(optarg, 1);
588                         if (ret == -1) {
589                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
590                                 us_vhost_usage(prgname);
591                                 return -1;
592                         }
593                         enable_tx_csum = ret;
594                         break;
595
596                 case OPT_TSO_NUM:
597                         ret = parse_num_opt(optarg, 1);
598                         if (ret == -1) {
599                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
600                                 us_vhost_usage(prgname);
601                                 return -1;
602                         }
603                         enable_tso = ret;
604                         break;
605
606                 case OPT_RX_RETRY_DELAY_NUM:
607                         ret = parse_num_opt(optarg, INT32_MAX);
608                         if (ret == -1) {
609                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
610                                 us_vhost_usage(prgname);
611                                 return -1;
612                         }
613                         burst_rx_delay_time = ret;
614                         break;
615
616                 case OPT_RX_RETRY_NUMB_NUM:
617                         ret = parse_num_opt(optarg, INT32_MAX);
618                         if (ret == -1) {
619                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
620                                 us_vhost_usage(prgname);
621                                 return -1;
622                         }
623                         burst_rx_retry_num = ret;
624                         break;
625
626                 case OPT_MERGEABLE_NUM:
627                         ret = parse_num_opt(optarg, 1);
628                         if (ret == -1) {
629                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
630                                 us_vhost_usage(prgname);
631                                 return -1;
632                         }
633                         mergeable = !!ret;
634                         if (ret) {
635                                 vmdq_conf_default.rxmode.offloads |=
636                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
637                                 vmdq_conf_default.rxmode.mtu = MAX_MTU;
638                         }
639                         break;
640
641                 case OPT_STATS_NUM:
642                         ret = parse_num_opt(optarg, INT32_MAX);
643                         if (ret == -1) {
644                                 RTE_LOG(INFO, VHOST_CONFIG,
645                                         "Invalid argument for stats [0..N]\n");
646                                 us_vhost_usage(prgname);
647                                 return -1;
648                         }
649                         enable_stats = ret;
650                         break;
651
652                 /* Set socket file path. */
653                 case OPT_SOCKET_FILE_NUM:
654                         if (us_vhost_parse_socket_path(optarg) == -1) {
655                                 RTE_LOG(INFO, VHOST_CONFIG,
656                                 "Invalid argument for socket name (Max %d characters)\n",
657                                 PATH_MAX);
658                                 us_vhost_usage(prgname);
659                                 return -1;
660                         }
661                         break;
662
663                 case OPT_DMA_TYPE_NUM:
664                         dma_type = optarg;
665                         break;
666
667                 case OPT_DMAS_NUM:
668                         if (open_dma(optarg) == -1) {
669                                 RTE_LOG(INFO, VHOST_CONFIG,
670                                         "Wrong DMA args\n");
671                                 us_vhost_usage(prgname);
672                                 return -1;
673                         }
674                         async_vhost_driver = 1;
675                         break;
676
677                 case OPT_CLIENT_NUM:
678                         client_mode = 1;
679                         break;
680
681                 case OPT_BUILTIN_NET_DRIVER_NUM:
682                         builtin_net_driver = 1;
683                         break;
684
685                 /* Invalid option - print options. */
686                 default:
687                         us_vhost_usage(prgname);
688                         return -1;
689                 }
690         }
691
692         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
693                 if (enabled_port_mask & (1 << i))
694                         ports[num_ports++] = i;
695         }
696
697         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
698                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
699                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
700                 return -1;
701         }
702
703         return 0;
704 }
705
706 /*
707  * Update the global variable num_ports and the ports array according to the
708  * number of ports in the system, and return the number of valid ports.
709  */
710 static unsigned check_ports_num(unsigned nb_ports)
711 {
712         unsigned valid_num_ports = num_ports;
713         unsigned portid;
714
715         if (num_ports > nb_ports) {
716                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
717                         num_ports, nb_ports);
718                 num_ports = nb_ports;
719         }
720
721         for (portid = 0; portid < num_ports; portid ++) {
722                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
723                         RTE_LOG(INFO, VHOST_PORT,
724                                 "\nSpecified port ID(%u) is not valid\n",
725                                 ports[portid]);
726                         ports[portid] = INVALID_PORT_ID;
727                         valid_num_ports--;
728                 }
729         }
730         return valid_num_ports;
731 }
732
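/*
 * Find the vhost device that has learned the given MAC address
 * and is ready for RX; return NULL if no such device exists.
 */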
733 static __rte_always_inline struct vhost_dev *
734 find_vhost_dev(struct rte_ether_addr *mac)
735 {
736         struct vhost_dev *vdev;
737
738         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
739                 if (vdev->ready == DEVICE_RX &&
740                     rte_is_same_ether_addr(mac, &vdev->mac_address))
741                         return vdev;
742         }
743
744         return NULL;
745 }
746
747 /*
748  * This function learns the MAC address of the device and registers it, along
749  * with a VLAN tag, with a VMDQ pool.
750  */
751 static int
752 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
753 {
754         struct rte_ether_hdr *pkt_hdr;
755         int i, ret;
756
757         /* Learn MAC address of guest device from packet */
758         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
759
760         if (find_vhost_dev(&pkt_hdr->src_addr)) {
761                 RTE_LOG(ERR, VHOST_DATA,
762                         "(%d) device is using a registered MAC!\n",
763                         vdev->vid);
764                 return -1;
765         }
766
767         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
768                 vdev->mac_address.addr_bytes[i] =
769                         pkt_hdr->src_addr.addr_bytes[i];
770
771         /* vlan_tag currently uses the device_id. */
772         vdev->vlan_tag = vlan_tags[vdev->vid];
773
774         /* Print out VMDQ registration info. */
775         RTE_LOG(INFO, VHOST_DATA,
776                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
777                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
778                 vdev->vlan_tag);
779
780         /* Register the MAC address. */
781         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
782                                 (uint32_t)vdev->vid + vmdq_pool_base);
783         if (ret)
784                 RTE_LOG(ERR, VHOST_DATA,
785                         "(%d) failed to add device MAC address to VMDQ\n",
786                         vdev->vid);
787
788         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
789
790         /* Set device as ready for RX. */
791         vdev->ready = DEVICE_RX;
792
793         return 0;
794 }
795
796 /*
797  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
798  * queue before disabling RX on the device.
799  */
800 static inline void
801 unlink_vmdq(struct vhost_dev *vdev)
802 {
803         unsigned i = 0;
804         unsigned rx_count;
805         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
806
807         if (vdev->ready == DEVICE_RX) {
808                 /* Clear MAC and VLAN settings. */
809                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
810                 for (i = 0; i < 6; i++)
811                         vdev->mac_address.addr_bytes[i] = 0;
812
813                 vdev->vlan_tag = 0;
814
815                 /* Clear out the receive buffers. */
816                 rx_count = rte_eth_rx_burst(ports[0],
817                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
818
819                 while (rx_count) {
820                         for (i = 0; i < rx_count; i++)
821                                 rte_pktmbuf_free(pkts_burst[i]);
822
823                         rx_count = rte_eth_rx_burst(ports[0],
824                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
825                 }
826
827                 vdev->ready = DEVICE_MAC_LEARNING;
828         }
829 }
830
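/* Free a burst of packet mbufs. */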
831 static inline void
832 free_pkts(struct rte_mbuf **pkts, uint16_t n)
833 {
834         while (n--)
835                 rte_pktmbuf_free(pkts[n]);
836 }
837
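/*
 * Poll for async enqueue operations that have completed on the RX queue
 * of this vhost device, free the completed mbufs and decrease the
 * in-flight packet counter accordingly.
 */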
838 static __rte_always_inline void
839 complete_async_pkts(struct vhost_dev *vdev)
840 {
841         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
842         uint16_t complete_count;
843
844         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
845                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
846         if (complete_count) {
847                 free_pkts(p_cpl, complete_count);
848                 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
849         }
850
851 }
852
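/*
 * Enqueue a single packet to the RX virtqueue of the destination vhost
 * device using the synchronous path, and update the statistics of both
 * the source and the destination device when stats are enabled.
 */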
853 static __rte_always_inline void
854 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
855             struct rte_mbuf *m)
856 {
857         uint16_t ret;
858
859         if (builtin_net_driver) {
860                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
861         } else {
862                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
863         }
864
865         if (enable_stats) {
866                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
867                                 __ATOMIC_SEQ_CST);
868                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
869                                 __ATOMIC_SEQ_CST);
870                 src_vdev->stats.tx_total++;
871                 src_vdev->stats.tx += ret;
872         }
873 }
874
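/*
 * Flush the per-lcore TX buffer of a vhost device to its RX virtqueue,
 * using the builtin net driver, the async data path or the synchronous
 * enqueue API depending on the configuration.
 */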
875 static __rte_always_inline void
876 drain_vhost(struct vhost_dev *vdev)
877 {
878         uint16_t ret;
879         uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
880         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
881         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
882
883         if (builtin_net_driver) {
884                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
885         } else if (async_vhost_driver) {
886                 uint16_t enqueue_fail = 0;
887
888                 complete_async_pkts(vdev);
889                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit);
890                 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
891
892                 enqueue_fail = nr_xmit - ret;
893                 if (enqueue_fail)
894                         free_pkts(&m[ret], nr_xmit - ret);
895         } else {
896                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
897                                                 m, nr_xmit);
898         }
899
900         if (enable_stats) {
901                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
902                                 __ATOMIC_SEQ_CST);
903                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
904                                 __ATOMIC_SEQ_CST);
905         }
906
907         if (!async_vhost_driver)
908                 free_pkts(m, nr_xmit);
909 }
910
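/*
 * Walk all vhost devices and drain the per-lcore TX buffers that have
 * not been flushed within the drain timeout.
 */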
911 static __rte_always_inline void
912 drain_vhost_table(void)
913 {
914         uint16_t lcore_id = rte_lcore_id();
915         struct vhost_bufftable *vhost_txq;
916         struct vhost_dev *vdev;
917         uint64_t cur_tsc;
918
919         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
920                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
921                                                 + vdev->vid];
922
923                 cur_tsc = rte_rdtsc();
924                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
925                                 > MBUF_TABLE_DRAIN_TSC)) {
926                         RTE_LOG_DP(DEBUG, VHOST_DATA,
927                                 "Vhost TX queue drained after timeout with burst size %u\n",
928                                 vhost_txq->len);
929                         drain_vhost(vdev);
930                         vhost_txq->len = 0;
931                         vhost_txq->pre_tsc = cur_tsc;
932                 }
933         }
934 }
935
936 /*
937  * Check if the packet destination MAC address is for a local device. If so, put
938  * the packet on that device's RX queue. If not, return.
939  */
940 static __rte_always_inline int
941 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
942 {
943         struct rte_ether_hdr *pkt_hdr;
944         struct vhost_dev *dst_vdev;
945         struct vhost_bufftable *vhost_txq;
946         uint16_t lcore_id = rte_lcore_id();
947         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
948
949         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
950         if (!dst_vdev)
951                 return -1;
952
953         if (vdev->vid == dst_vdev->vid) {
954                 RTE_LOG_DP(DEBUG, VHOST_DATA,
955                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
956                         vdev->vid);
957                 return 0;
958         }
959
960         RTE_LOG_DP(DEBUG, VHOST_DATA,
961                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
962
963         if (unlikely(dst_vdev->remove)) {
964                 RTE_LOG_DP(DEBUG, VHOST_DATA,
965                         "(%d) device is marked for removal\n", dst_vdev->vid);
966                 return 0;
967         }
968
969         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
970         vhost_txq->m_table[vhost_txq->len++] = m;
971
972         if (enable_stats) {
973                 vdev->stats.tx_total++;
974                 vdev->stats.tx++;
975         }
976
977         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
978                 drain_vhost(dst_vdev);
979                 vhost_txq->len = 0;
980                 vhost_txq->pre_tsc = rte_rdtsc();
981         }
982         return 0;
983 }
984
985 /*
986  * Check if the destination MAC of a packet belongs to a local VM;
987  * if so, get its VLAN tag and offset.
988  */
989 static __rte_always_inline int
990 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
991         uint32_t *offset, uint16_t *vlan_tag)
992 {
993         struct vhost_dev *dst_vdev;
994         struct rte_ether_hdr *pkt_hdr =
995                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
996
997         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
998         if (!dst_vdev)
999                 return 0;
1000
1001         if (vdev->vid == dst_vdev->vid) {
1002                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1003                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1004                         vdev->vid);
1005                 return -1;
1006         }
1007
1008         /*
1009          * HW VLAN strip reduces the packet length by the
1010          * length of the VLAN tag, so the packet length needs
1011          * to be restored by adding it back.
1012          */
1013         *offset  = VLAN_HLEN;
1014         *vlan_tag = vlan_tags[vdev->vid];
1015
1016         RTE_LOG_DP(DEBUG, VHOST_DATA,
1017                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1018                 vdev->vid, dst_vdev->vid, *vlan_tag);
1019
1020         return 0;
1021 }
1022
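/*
 * Parse the packet headers and set up the mbuf offload flags and
 * pseudo-header checksum needed for TSO before the packet is
 * transmitted on the physical port.
 */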
1023 static void virtio_tx_offload(struct rte_mbuf *m)
1024 {
1025         struct rte_net_hdr_lens hdr_lens;
1026         struct rte_ipv4_hdr *ipv4_hdr;
1027         struct rte_tcp_hdr *tcp_hdr;
1028         uint32_t ptype;
1029         void *l3_hdr;
1030
1031         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1032         m->l2_len = hdr_lens.l2_len;
1033         m->l3_len = hdr_lens.l3_len;
1034         m->l4_len = hdr_lens.l4_len;
1035
1036         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1037         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1038                 m->l2_len + m->l3_len);
1039
1040         m->ol_flags |= PKT_TX_TCP_SEG;
1041         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1042                 m->ol_flags |= PKT_TX_IPV4;
1043                 m->ol_flags |= PKT_TX_IP_CKSUM;
1044                 ipv4_hdr = l3_hdr;
1045                 ipv4_hdr->hdr_checksum = 0;
1046                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1047         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1048                 m->ol_flags |= PKT_TX_IPV6;
1049                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1050         }
1051 }
1052
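/*
 * Transmit the buffered packets of a TX queue on the physical port and
 * free any packets that could not be sent.
 */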
1053 static __rte_always_inline void
1054 do_drain_mbuf_table(struct mbuf_table *tx_q)
1055 {
1056         uint16_t count;
1057
1058         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1059                                  tx_q->m_table, tx_q->len);
1060         if (unlikely(count < tx_q->len))
1061                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1062
1063         tx_q->len = 0;
1064 }
1065
1066 /*
1067  * This function routes the TX packet to the correct interface. This
1068  * may be a local device or the physical port.
1069  */
1070 static __rte_always_inline void
1071 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1072 {
1073         struct mbuf_table *tx_q;
1074         unsigned offset = 0;
1075         const uint16_t lcore_id = rte_lcore_id();
1076         struct rte_ether_hdr *nh;
1077
1078
1079         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1080         if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1081                 struct vhost_dev *vdev2;
1082
1083                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1084                         if (vdev2 != vdev)
1085                                 sync_virtio_xmit(vdev2, vdev, m);
1086                 }
1087                 goto queue2nic;
1088         }
1089
1090         /* Check if the destination is a local VM. */
1091         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1092                 return;
1093
1094         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1095                 if (unlikely(find_local_dest(vdev, m, &offset,
1096                                              &vlan_tag) != 0)) {
1097                         rte_pktmbuf_free(m);
1098                         return;
1099                 }
1100         }
1101
1102         RTE_LOG_DP(DEBUG, VHOST_DATA,
1103                 "(%d) TX: MAC address is external\n", vdev->vid);
1104
1105 queue2nic:
1106
1107         /* Add the packet to the port TX queue. */
1108         tx_q = &lcore_tx_queue[lcore_id];
1109
1110         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1111         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1112                 /* Guest has inserted the vlan tag. */
1113                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1114                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1115                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1116                         (vh->vlan_tci != vlan_tag_be))
1117                         vh->vlan_tci = vlan_tag_be;
1118         } else {
1119                 m->ol_flags |= PKT_TX_VLAN_PKT;
1120
1121                 /*
1122                  * Find the right segment to adjust the data length when the
1123                  * offset is bigger than the tailroom size.
1124                  */
1125                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1126                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1127                                 m->data_len += offset;
1128                         else {
1129                                 struct rte_mbuf *seg = m;
1130
1131                                 while ((seg->next != NULL) &&
1132                                         (offset > rte_pktmbuf_tailroom(seg)))
1133                                         seg = seg->next;
1134
1135                                 seg->data_len += offset;
1136                         }
1137                         m->pkt_len += offset;
1138                 }
1139
1140                 m->vlan_tci = vlan_tag;
1141         }
1142
1143         if (m->ol_flags & PKT_RX_LRO)
1144                 virtio_tx_offload(m);
1145
1146         tx_q->m_table[tx_q->len++] = m;
1147         if (enable_stats) {
1148                 vdev->stats.tx_total++;
1149                 vdev->stats.tx++;
1150         }
1151
1152         if (unlikely(tx_q->len == MAX_PKT_BURST))
1153                 do_drain_mbuf_table(tx_q);
1154 }
1155
1156
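/*
 * Drain the physical port TX queue if it has not been flushed within
 * the drain timeout.
 */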
1157 static __rte_always_inline void
1158 drain_mbuf_table(struct mbuf_table *tx_q)
1159 {
1160         static uint64_t prev_tsc;
1161         uint64_t cur_tsc;
1162
1163         if (tx_q->len == 0)
1164                 return;
1165
1166         cur_tsc = rte_rdtsc();
1167         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1168                 prev_tsc = cur_tsc;
1169
1170                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1171                         "TX queue drained after timeout with burst size %u\n",
1172                         tx_q->len);
1173                 do_drain_mbuf_table(tx_q);
1174         }
1175 }
1176
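/*
 * Receive a burst of packets from the VMDQ RX queue bound to this vhost
 * device and enqueue them to the guest RX virtqueue, optionally retrying
 * when the virtqueue does not have enough free slots.
 */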
1177 static __rte_always_inline void
1178 drain_eth_rx(struct vhost_dev *vdev)
1179 {
1180         uint16_t rx_count, enqueue_count;
1181         struct rte_mbuf *pkts[MAX_PKT_BURST];
1182
1183         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1184                                     pkts, MAX_PKT_BURST);
1185
1186         if (!rx_count)
1187                 return;
1188
1189         /*
1190          * When "enable_retry" is set, wait and retry when there are
1191          * not enough free slots in the queue to hold @rx_count packets,
1192          * to reduce packet loss.
1193          */
1194         if (enable_retry &&
1195             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1196                         VIRTIO_RXQ))) {
1197                 uint32_t retry;
1198
1199                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1200                         rte_delay_us(burst_rx_delay_time);
1201                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1202                                         VIRTIO_RXQ))
1203                                 break;
1204                 }
1205         }
1206
1207         if (builtin_net_driver) {
1208                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1209                                                 pkts, rx_count);
1210         } else if (async_vhost_driver) {
1211                 uint16_t enqueue_fail = 0;
1212
1213                 complete_async_pkts(vdev);
1214                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1215                                         VIRTIO_RXQ, pkts, rx_count);
1216                 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1217
1218                 enqueue_fail = rx_count - enqueue_count;
1219                 if (enqueue_fail)
1220                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1221
1222         } else {
1223                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1224                                                 pkts, rx_count);
1225         }
1226
1227         if (enable_stats) {
1228                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1229                                 __ATOMIC_SEQ_CST);
1230                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1231                                 __ATOMIC_SEQ_CST);
1232         }
1233
1234         if (!async_vhost_driver)
1235                 free_pkts(pkts, rx_count);
1236 }
1237
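/*
 * Dequeue a burst of packets from the guest TX virtqueue and route each
 * of them to its destination (another vhost device or the physical port).
 */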
1238 static __rte_always_inline void
1239 drain_virtio_tx(struct vhost_dev *vdev)
1240 {
1241         struct rte_mbuf *pkts[MAX_PKT_BURST];
1242         uint16_t count;
1243         uint16_t i;
1244
1245         if (builtin_net_driver) {
1246                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1247                                         pkts, MAX_PKT_BURST);
1248         } else {
1249                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1250                                         mbuf_pool, pkts, MAX_PKT_BURST);
1251         }
1252
1253         /* setup VMDq for the first packet */
1254         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1255                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1256                         free_pkts(pkts, count);
1257         }
1258
1259         for (i = 0; i < count; ++i)
1260                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1261 }
1262
1263 /*
1264  * Main function of vhost-switch. It basically does:
1265  *
1266  * for each vhost device {
1267  *    - drain_eth_rx()
1268  *
1269  *      Which drains the host eth Rx queue linked to the vhost device,
1270  *      and delivers all of the packets to the guest virtio Rx ring
1271  *      associated with this vhost device.
1272  *
1273  *    - drain_virtio_tx()
1274  *
1275  *      Which drains the guest virtio Tx queue and delivers all of the
1276  *      packets to the target, which could be another vhost device or the
1277  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1278  * }
1279  */
1280 static int
1281 switch_worker(void *arg __rte_unused)
1282 {
1283         unsigned i;
1284         unsigned lcore_id = rte_lcore_id();
1285         struct vhost_dev *vdev;
1286         struct mbuf_table *tx_q;
1287
1288         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1289
1290         tx_q = &lcore_tx_queue[lcore_id];
1291         for (i = 0; i < rte_lcore_count(); i++) {
1292                 if (lcore_ids[i] == lcore_id) {
1293                         tx_q->txq_id = i;
1294                         break;
1295                 }
1296         }
1297
1298         while(1) {
1299                 drain_mbuf_table(tx_q);
1300                 drain_vhost_table();
1301                 /*
1302                  * Inform the configuration core that we have exited the
1303                  * linked list and that no devices are in use if requested.
1304                  */
1305                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1306                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1307
1308                 /*
1309                  * Process vhost devices
1310                  */
1311                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1312                               lcore_vdev_entry) {
1313                         if (unlikely(vdev->remove)) {
1314                                 unlink_vmdq(vdev);
1315                                 vdev->ready = DEVICE_SAFE_REMOVE;
1316                                 continue;
1317                         }
1318
1319                         if (likely(vdev->ready == DEVICE_RX))
1320                                 drain_eth_rx(vdev);
1321
1322                         if (likely(!vdev->remove))
1323                                 drain_virtio_tx(vdev);
1324                 }
1325         }
1326
1327         return 0;
1328 }
1329
1330 /*
1331  * Remove a device from the specific data core linked list and from the
1332  * main linked list. Synchronization occurs through the use of the
1333  * lcore dev_removal_flag. The device is made volatile here to avoid reordering
1334  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
1335  */
1336 static void
1337 destroy_device(int vid)
1338 {
1339         struct vhost_dev *vdev = NULL;
1340         int lcore;
1341         uint16_t i;
1342
1343         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1344                 if (vdev->vid == vid)
1345                         break;
1346         }
1347         if (!vdev)
1348                 return;
1349         /* Set the remove flag. */
1350         vdev->remove = 1;
1351         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1352                 rte_pause();
1353         }
1354
1355         for (i = 0; i < RTE_MAX_LCORE; i++)
1356                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1357
1358         if (builtin_net_driver)
1359                 vs_vhost_net_remove(vdev);
1360
1361         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1362                      lcore_vdev_entry);
1363         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1364
1365
1366         /* Set the dev_removal_flag on each lcore. */
1367         RTE_LCORE_FOREACH_WORKER(lcore)
1368                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1369
1370         /*
1371          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1372          * we can be sure that they can no longer access the device removed
1373          * from the linked lists and that the devices are no longer in use.
1374          */
1375         RTE_LCORE_FOREACH_WORKER(lcore) {
1376                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1377                         rte_pause();
1378         }
1379
1380         lcore_info[vdev->coreid].device_num--;
1381
1382         RTE_LOG(INFO, VHOST_DATA,
1383                 "(%d) device has been removed from data core\n",
1384                 vdev->vid);
1385
1386         if (async_vhost_driver) {
1387                 uint16_t n_pkt = 0;
1388                 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1389
1390                 while (vdev->pkts_inflight) {
1391                         n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1392                                                 m_cpl, vdev->pkts_inflight);
1393                         free_pkts(m_cpl, n_pkt);
1394                         __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1395                 }
1396
1397                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1398         }
1399
1400         rte_free(vdev);
1401 }
1402
1403 /*
1404  * A new device is added to a data core. First the device is added to the main linked list
1405  * and then allocated to a specific data core.
1406  */
1407 static int
1408 new_device(int vid)
1409 {
1410         int lcore, core_add = 0;
1411         uint16_t i;
1412         uint32_t device_num_min = num_devices;
1413         struct vhost_dev *vdev;
1414         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1415         if (vdev == NULL) {
1416                 RTE_LOG(INFO, VHOST_DATA,
1417                         "(%d) couldn't allocate memory for vhost dev\n",
1418                         vid);
1419                 return -1;
1420         }
1421         vdev->vid = vid;
1422
1423         for (i = 0; i < RTE_MAX_LCORE; i++) {
1424                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1425                         = rte_zmalloc("vhost bufftable",
1426                                 sizeof(struct vhost_bufftable),
1427                                 RTE_CACHE_LINE_SIZE);
1428
1429                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1430                         RTE_LOG(INFO, VHOST_DATA,
1431                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1432                         return -1;
1433                 }
1434         }
1435
1436         if (builtin_net_driver)
1437                 vs_vhost_net_setup(vdev);
1438
1439         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1440         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1441
1442         /* Reset the ready flag. */
1443         vdev->ready = DEVICE_MAC_LEARNING;
1444         vdev->remove = 0;
1445
1446         /* Find a suitable lcore to add the device. */
1447         RTE_LCORE_FOREACH_WORKER(lcore) {
1448                 if (lcore_info[lcore].device_num < device_num_min) {
1449                         device_num_min = lcore_info[lcore].device_num;
1450                         core_add = lcore;
1451                 }
1452         }
1453         vdev->coreid = core_add;
1454
1455         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1456                           lcore_vdev_entry);
1457         lcore_info[vdev->coreid].device_num++;
1458
1459         /* Disable notifications. */
1460         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1461         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1462
1463         RTE_LOG(INFO, VHOST_DATA,
1464                 "(%d) device has been added to data core %d\n",
1465                 vid, vdev->coreid);
1466
1467         if (async_vhost_driver) {
1468                 struct rte_vhost_async_config config = {0};
1469                 struct rte_vhost_async_channel_ops channel_ops;
1470
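                /*
                 * Only the ioat DMA backend is wired up in this example; its
                 * callbacks implement the data path of the async channel
                 * registered below.
                 */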
1471                 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1472                         channel_ops.transfer_data = ioat_transfer_data_cb;
1473                         channel_ops.check_completed_copies =
1474                                 ioat_check_completed_copies_cb;
1475
1476                         config.features = RTE_VHOST_ASYNC_INORDER;
1477
1478                         return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1479                                 config, &channel_ops);
1480                 }
1481         }
1482
1483         return 0;
1484 }
1485
1486 static int
1487 vring_state_changed(int vid, uint16_t queue_id, int enable)
1488 {
1489         struct vhost_dev *vdev = NULL;
1490
1491         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1492                 if (vdev->vid == vid)
1493                         break;
1494         }
1495         if (!vdev)
1496                 return -1;
1497
1498         if (queue_id != VIRTIO_RXQ)
1499                 return 0;
1500
1501         if (async_vhost_driver) {
1502                 if (!enable) {
1503                         uint16_t n_pkt = 0;
1504                         struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1505
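                        /*
                         * The queue is being disabled: drain all in-flight
                         * async copies first so none complete after the
                         * vring state change.
                         */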
1506                         while (vdev->pkts_inflight) {
1507                                 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1508                                                         m_cpl, vdev->pkts_inflight);
1509                                 free_pkts(m_cpl, n_pkt);
1510                                 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1511                         }
1512                 }
1513         }
1514
1515         return 0;
1516 }
1517
1518 /*
1519  * These callbacks allow devices to be added to a data core once
1520  * configuration has fully completed.
1521  */
1522 static const struct vhost_device_ops virtio_net_device_ops =
1523 {
1524         .new_device =  new_device,
1525         .destroy_device = destroy_device,
1526         .vring_state_changed = vring_state_changed,
1527 };
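/*
 * These ops are registered for each vhost socket via
 * rte_vhost_driver_callback_register() in main() below.
 */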
1528
1529 /*
1530  * This thread wakes up periodically and prints statistics if the user
1531  * has enabled them.
1532  */
1533 static void *
1534 print_stats(__rte_unused void *arg)
1535 {
1536         struct vhost_dev *vdev;
1537         uint64_t tx_dropped, rx_dropped;
1538         uint64_t tx, tx_total, rx, rx_total;
1539         const char clr[] = { 27, '[', '2', 'J', '\0' };
1540         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1541
1542         while (1) {
1543                 sleep(enable_stats);
1544
1545                 /* Clear screen and move to top left */
1546                 printf("%s%s\n", clr, top_left);
1547                 printf("Device statistics =================================\n");
1548
1549                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1550                         tx_total   = vdev->stats.tx_total;
1551                         tx         = vdev->stats.tx;
1552                         tx_dropped = tx_total - tx;
1553
1554                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1555                                 __ATOMIC_SEQ_CST);
1556                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1557                                 __ATOMIC_SEQ_CST);
1558                         rx_dropped = rx_total - rx;
1559
1560                         printf("Statistics for device %d\n"
1561                                 "-----------------------\n"
1562                                 "TX total:              %" PRIu64 "\n"
1563                                 "TX dropped:            %" PRIu64 "\n"
1564                                 "TX successful:         %" PRIu64 "\n"
1565                                 "RX total:              %" PRIu64 "\n"
1566                                 "RX dropped:            %" PRIu64 "\n"
1567                                 "RX successful:         %" PRIu64 "\n",
1568                                 vdev->vid,
1569                                 tx_total, tx_dropped, tx,
1570                                 rx_total, rx_dropped, rx);
1571                 }
1572
1573                 printf("===================================================\n");
1574
1575                 fflush(stdout);
1576         }
1577
1578         return NULL;
1579 }
1580
1581 static void
1582 unregister_drivers(int socket_num)
1583 {
1584         int i, ret;
1585
1586         for (i = 0; i < socket_num; i++) {
1587                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1588                 if (ret != 0)
1589                         RTE_LOG(ERR, VHOST_CONFIG,
1590                                 "Failed to unregister vhost driver for %s.\n",
1591                                 socket_files + i * PATH_MAX);
1592         }
1593 }
1594
1595 /* When we receive an INT signal, unregister the vhost driver. */
1596 static void
1597 sigint_handler(__rte_unused int signum)
1598 {
1599         /* Unregister vhost driver. */
1600         unregister_drivers(nb_sockets);
1601
1602         exit(0);
1603 }
1604
1605 /*
1606  * While creating an mbuf pool, one key thing is to figure out how
1607  * many mbuf entries are enough for our use. Here are some
1608  * guidelines:
1609  *
1610  * - Each Rx queue reserves @nr_rx_desc mbufs at the queue setup stage.
1611  *
1612  * - For each switch core (a CPU core that does the packet switching),
1613  *   we also need to reserve some mbufs for receiving the packets from
1614  *   the virtio Tx queue. How many are enough depends on the usage;
1615  *   it is normally a simple calculation like the following:
1616  *
1617  *       MAX_PKT_BURST * max packet size / mbuf size
1618  *
1619  *   So we definitely need to allocate more mbufs when TSO is enabled.
1620  *
1621  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1622  *   mbufs for receiving the packets from the physical NIC.
1623  *
1624  * - We also need to make sure that, for each switch core, enough
1625  *   mbufs are allocated to fill up the mbuf cache.
1626  */
1627 static void
1628 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1629         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1630 {
1631         uint32_t nr_mbufs;
1632         uint32_t nr_mbufs_per_core;
1633         uint32_t mtu = 1500;
1634
1635         if (mergeable)
1636                 mtu = 9000;
1637         if (enable_tso)
1638                 mtu = 64 * 1024;
1639
1640         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1641                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1642         nr_mbufs_per_core += nr_rx_desc;
1643         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1644
1645         nr_mbufs  = nr_queues * nr_rx_desc;
1646         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1647         nr_mbufs *= nr_port;
1648
1649         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1650                                             nr_mbuf_cache, 0, mbuf_size,
1651                                             rte_socket_id());
1652         if (mbuf_pool == NULL)
1653                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1654 }
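/*
 * A rough worked example of the sizing above, assuming the typical build
 * defaults (MBUF_DATA_SIZE of 2176 bytes, RTE_PKTMBUF_HEADROOM of 128) and
 * MAX_PKT_BURST of 32, with mergeable buffers enabled (mtu = 9000):
 *
 *   per-core mbufs ~= (9000 + 2176) * 32 / (2176 - 128) ~= 174
 *   per-core total ~= 174 + 1024 (default Rx descriptors)  = 1198
 *   pool size       = (nr_queues * 1024 + 1198 * nr_switch_core) * nr_port
 *
 * The exact numbers depend on the build-time defaults, so treat this only
 * as an illustration of how the pool size scales.
 */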
1655
1656 /*
1657  * Main function, does initialisation and calls the per-lcore functions.
1658  */
1659 int
1660 main(int argc, char *argv[])
1661 {
1662         unsigned lcore_id, core_id = 0;
1663         unsigned nb_ports, valid_num_ports;
1664         int ret, i;
1665         uint16_t portid;
1666         static pthread_t tid;
1667         uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1668
1669         signal(SIGINT, sigint_handler);
1670
1671         /* init EAL */
1672         ret = rte_eal_init(argc, argv);
1673         if (ret < 0)
1674                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1675         argc -= ret;
1676         argv += ret;
1677
1678         /* parse app arguments */
1679         ret = us_vhost_parse_args(argc, argv);
1680         if (ret < 0)
1681                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1682
1683         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1684                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1685
1686                 if (rte_lcore_is_enabled(lcore_id))
1687                         lcore_ids[core_id++] = lcore_id;
1688         }
1689
1690         if (rte_lcore_count() < 2)
1691                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1692
1693         /* Get the number of physical ports. */
1694         nb_ports = rte_eth_dev_count_avail();
1695
1696         /*
1697          * Update the global variable num_ports and the global ports array,
1698          * and get the number of valid ports according to the system port count.
1699          */
1700         valid_num_ports = check_ports_num(nb_ports);
1701
1702         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1703                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1704                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1705                 return -1;
1706         }
1707
1708         /*
1709          * FIXME: here we allocate enough mbufs for @MAX_QUEUES, but in
1710          * practice we are never going to use that many queues. We should
1711          * probably only allocate for the queues we are actually going
1712          * to use.
1713          */
1714         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1715                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1716
1717         if (vm2vm_mode == VM2VM_HARDWARE) {
1718                 /* Enable VMDq loopback so the NIC's L2 switch forwards VM-to-VM traffic. */
1719                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1720                 RTE_LOG(DEBUG, VHOST_CONFIG,
1721                         "Enable loop back for L2 switch in vmdq.\n");
1722         }
1723
1724         /* initialize all ports */
1725         RTE_ETH_FOREACH_DEV(portid) {
1726                 /* skip ports that are not enabled */
1727                 if ((enabled_port_mask & (1 << portid)) == 0) {
1728                         RTE_LOG(INFO, VHOST_PORT,
1729                                 "Skipping disabled port %d\n", portid);
1730                         continue;
1731                 }
1732                 if (port_init(portid) != 0)
1733                         rte_exit(EXIT_FAILURE,
1734                                 "Cannot initialize network ports\n");
1735         }
1736
1737         /* Enable stats if the user option is set. */
1738         if (enable_stats) {
1739                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1740                                         print_stats, NULL);
1741                 if (ret < 0)
1742                         rte_exit(EXIT_FAILURE,
1743                                 "Cannot create print-stats thread\n");
1744         }
1745
1746         /* Launch all data cores. */
1747         RTE_LCORE_FOREACH_WORKER(lcore_id)
1748                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1749
1750         if (client_mode)
1751                 flags |= RTE_VHOST_USER_CLIENT;
1752
1753         /* Register vhost user driver to handle vhost messages. */
1754         for (i = 0; i < nb_sockets; i++) {
1755                 char *file = socket_files + i * PATH_MAX;
1756
1757                 if (async_vhost_driver)
1758                         flags |= RTE_VHOST_USER_ASYNC_COPY;
1759
1760                 ret = rte_vhost_driver_register(file, flags);
1761                 if (ret != 0) {
1762                         unregister_drivers(i);
1763                         rte_exit(EXIT_FAILURE,
1764                                 "vhost driver register failure.\n");
1765                 }
1766
1767                 if (builtin_net_driver)
1768                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1769
1770                 if (mergeable == 0) {
1771                         rte_vhost_driver_disable_features(file,
1772                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1773                 }
1774
1775                 if (enable_tx_csum == 0) {
1776                         rte_vhost_driver_disable_features(file,
1777                                 1ULL << VIRTIO_NET_F_CSUM);
1778                 }
1779
1780                 if (enable_tso == 0) {
1781                         rte_vhost_driver_disable_features(file,
1782                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1783                         rte_vhost_driver_disable_features(file,
1784                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1785                         rte_vhost_driver_disable_features(file,
1786                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1787                         rte_vhost_driver_disable_features(file,
1788                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1789                 }
1790
1791                 if (promiscuous) {
1792                         rte_vhost_driver_enable_features(file,
1793                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1794                 }
1795
1796                 ret = rte_vhost_driver_callback_register(file,
1797                         &virtio_net_device_ops);
1798                 if (ret != 0) {
1799                         rte_exit(EXIT_FAILURE,
1800                                 "failed to register vhost driver callbacks.\n");
1801                 }
1802
1803                 if (rte_vhost_driver_start(file) < 0) {
1804                         rte_exit(EXIT_FAILURE,
1805                                 "failed to start vhost driver.\n");
1806                 }
1807         }
1808
1809         RTE_LCORE_FOREACH_WORKER(lcore_id)
1810                 rte_eal_wait_lcore(lcore_id);
1811
1812         /* clean up the EAL */
1813         rte_eal_cleanup();
1814
1815         return 0;
1816 }
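/*
 * Illustrative invocation only (binary name, core list and port mask depend
 * on the local build and setup; option names follow this example's
 * us_vhost_parse_args()):
 *
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --stats 1 --mergeable 1 --client
 *
 * The main lcore handles registration and stats printing; the remaining
 * lcores run switch_worker() as data cores.
 */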