ca73e708600da2c7d07c5b8c936e94cd6da99030
[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_vhost.h>
23 #include <rte_ip.h>
24 #include <rte_tcp.h>
25 #include <rte_pause.h>
26
27 #include "ioat.h"
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66
67 /* number of devices/queues to support*/
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76         VM2VM_DISABLED = 0,
77         VM2VM_SOFTWARE = 1,
78         VM2VM_HARDWARE = 2,
79         VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87
89 /* Enable TX checksum offload (disabled by default) */
90 static uint32_t enable_tx_csum;
91
92 /* Enable TSO offload (disabled by default) */
93 static uint32_t enable_tso;
93
94 static int client_mode;
95
96 static int builtin_net_driver;
97
98 static int async_vhost_driver;
99
100 static char dma_type[MAX_LONG_OPT_SZ];
101
102 /* Specify the timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110
111 /* empty vmdq configuration structure. Filled in programmatically */
112 static struct rte_eth_conf vmdq_conf_default = {
113         .rxmode = {
114                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115                 .split_hdr_size = 0,
116                 /*
117                  * VLAN strip is necessary for 1G NICs such as the I350;
118                  * it fixes a bug where IPv4 forwarding in the guest cannot
119                  * forward packets from one virtio dev to another virtio dev.
120                  */
121                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122         },
123
124         .txmode = {
125                 .mq_mode = ETH_MQ_TX_NONE,
126                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
127                              DEV_TX_OFFLOAD_TCP_CKSUM |
128                              DEV_TX_OFFLOAD_VLAN_INSERT |
129                              DEV_TX_OFFLOAD_MULTI_SEGS |
130                              DEV_TX_OFFLOAD_TCP_TSO),
131         },
132         .rx_adv_conf = {
133                 /*
134                  * should be overridden separately in code with
135                  * appropriate values
136                  */
137                 .vmdq_rx_conf = {
138                         .nb_queue_pools = ETH_8_POOLS,
139                         .enable_default_pool = 0,
140                         .default_pool = 0,
141                         .nb_pool_maps = 0,
142                         .pool_map = {{0, 0},},
143                 },
144         },
145 };
146
147
148 static unsigned lcore_ids[RTE_MAX_LCORE];
149 static uint16_t ports[RTE_MAX_ETHPORTS];
150 static unsigned num_ports = 0; /**< The number of ports specified in command line */
151 static uint16_t num_pf_queues, num_vmdq_queues;
152 static uint16_t vmdq_pool_base, vmdq_queue_base;
153 static uint16_t queues_per_pool;
154
155 const uint16_t vlan_tags[] = {
156         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
157         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
158         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
159         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
160         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
161         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
162         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
163         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
164 };
165
166 /* ethernet addresses of ports */
167 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
168
169 static struct vhost_dev_tailq_list vhost_dev_list =
170         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
171
172 static struct lcore_info lcore_info[RTE_MAX_LCORE];
173
174 /* Used for queueing bursts of TX packets. */
175 struct mbuf_table {
176         unsigned len;
177         unsigned txq_id;
178         struct rte_mbuf *m_table[MAX_PKT_BURST];
179 };
180
181 struct vhost_bufftable {
182         uint32_t len;
183         uint64_t pre_tsc;
184         struct rte_mbuf *m_table[MAX_PKT_BURST];
185 };
186
187 /* TX queue for each data core. */
188 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
189
190 /*
191  * Vhost TX buffer for each data core.
192  * Every data core maintains a TX buffer for every vhost device,
193  * which is used to batch packet enqueues for higher performance.
194  */
195 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
196
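/*
 * Drain period for the per-lcore TX buffers, in TSC cycles: TSC cycles
 * per microsecond (rounded up) multiplied by BURST_TX_DRAIN_US.
 */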
197 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
198                                  / US_PER_S * BURST_TX_DRAIN_US)
199 #define VLAN_HLEN       4
200
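/*
 * Open the DMA channels described by "value" for the configured dma_type.
 * Only the "ioat" backend is handled here; any other type returns -1.
 */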
201 static inline int
202 open_dma(const char *value)
203 {
204         if (strncmp(dma_type, "ioat", 4) == 0)
205                 return open_ioat(value);
206
207         return -1;
208 }
209
210 /*
211  * Builds up the correct configuration for VMDQ VLAN pool map
212  * according to the pool & queue limits.
213  */
214 static inline int
215 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
216 {
217         struct rte_eth_vmdq_rx_conf conf;
218         struct rte_eth_vmdq_rx_conf *def_conf =
219                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
220         unsigned i;
221
222         memset(&conf, 0, sizeof(conf));
223         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
224         conf.nb_pool_maps = num_devices;
225         conf.enable_loop_back = def_conf->enable_loop_back;
226         conf.rx_mode = def_conf->rx_mode;
227
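        /* Map one VLAN tag per pool: packets tagged vlan_tags[i] are steered to pool i. */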
228         for (i = 0; i < conf.nb_pool_maps; i++) {
229                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
230                 conf.pool_map[i].pools = (1UL << i);
231         }
232
233         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
234         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
235                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
236         return 0;
237 }
238
239 /*
240  * Initialises a given port using global settings, with the rx buffers
241  * coming from the global mbuf_pool.
242  */
243 static inline int
244 port_init(uint16_t port)
245 {
246         struct rte_eth_dev_info dev_info;
247         struct rte_eth_conf port_conf;
248         struct rte_eth_rxconf *rxconf;
249         struct rte_eth_txconf *txconf;
250         int16_t rx_rings, tx_rings;
251         uint16_t rx_ring_size, tx_ring_size;
252         int retval;
253         uint16_t q;
254
255         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
256         retval = rte_eth_dev_info_get(port, &dev_info);
257         if (retval != 0) {
258                 RTE_LOG(ERR, VHOST_PORT,
259                         "Error during getting device (port %u) info: %s\n",
260                         port, strerror(-retval));
261
262                 return retval;
263         }
264
265         rxconf = &dev_info.default_rxconf;
266         txconf = &dev_info.default_txconf;
267         rxconf->rx_drop_en = 1;
268
269         /* Configure the number of supported virtio devices based on VMDQ limits. */
270         num_devices = dev_info.max_vmdq_pools;
271
272         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
273         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
274
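        /* Use one TX queue per lcore so that every data core owns a dedicated NIC TX queue. */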
275         tx_rings = (uint16_t)rte_lcore_count();
276
277         /* Get port configuration. */
278         retval = get_eth_conf(&port_conf, num_devices);
279         if (retval < 0)
280                 return retval;
281         /* NIC queues are divided into pf queues and vmdq queues.  */
282         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
283         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
284         num_vmdq_queues = num_devices * queues_per_pool;
285         num_queues = num_pf_queues + num_vmdq_queues;
286         vmdq_queue_base = dev_info.vmdq_queue_base;
287         vmdq_pool_base  = dev_info.vmdq_pool_base;
288         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
289                 num_pf_queues, num_devices, queues_per_pool);
290
291         if (!rte_eth_dev_is_valid_port(port))
292                 return -1;
293
294         rx_rings = (uint16_t)dev_info.max_rx_queues;
295         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
296                 port_conf.txmode.offloads |=
297                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
298         /* Configure ethernet device. */
299         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
300         if (retval != 0) {
301                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
302                         port, strerror(-retval));
303                 return retval;
304         }
305
306         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
307                 &tx_ring_size);
308         if (retval != 0) {
309                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
310                         "for port %u: %s.\n", port, strerror(-retval));
311                 return retval;
312         }
313         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
314                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
315                         "for Rx queues on port %u.\n", port);
316                 return -1;
317         }
318
319         /* Setup the queues. */
320         rxconf->offloads = port_conf.rxmode.offloads;
321         for (q = 0; q < rx_rings; q ++) {
322                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323                                                 rte_eth_dev_socket_id(port),
324                                                 rxconf,
325                                                 mbuf_pool);
326                 if (retval < 0) {
327                         RTE_LOG(ERR, VHOST_PORT,
328                                 "Failed to setup rx queue %u of port %u: %s.\n",
329                                 q, port, strerror(-retval));
330                         return retval;
331                 }
332         }
333         txconf->offloads = port_conf.txmode.offloads;
334         for (q = 0; q < tx_rings; q ++) {
335                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
336                                                 rte_eth_dev_socket_id(port),
337                                                 txconf);
338                 if (retval < 0) {
339                         RTE_LOG(ERR, VHOST_PORT,
340                                 "Failed to setup tx queue %u of port %u: %s.\n",
341                                 q, port, strerror(-retval));
342                         return retval;
343                 }
344         }
345
346         /* Start the device. */
347         retval  = rte_eth_dev_start(port);
348         if (retval < 0) {
349                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
350                         port, strerror(-retval));
351                 return retval;
352         }
353
354         if (promiscuous) {
355                 retval = rte_eth_promiscuous_enable(port);
356                 if (retval != 0) {
357                         RTE_LOG(ERR, VHOST_PORT,
358                                 "Failed to enable promiscuous mode on port %u: %s\n",
359                                 port, rte_strerror(-retval));
360                         return retval;
361                 }
362         }
363
364         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
365         if (retval < 0) {
366                 RTE_LOG(ERR, VHOST_PORT,
367                         "Failed to get MAC address on port %u: %s\n",
368                         port, rte_strerror(-retval));
369                 return retval;
370         }
371
372         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
373         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
374                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
375                         port,
376                         vmdq_ports_eth_addr[port].addr_bytes[0],
377                         vmdq_ports_eth_addr[port].addr_bytes[1],
378                         vmdq_ports_eth_addr[port].addr_bytes[2],
379                         vmdq_ports_eth_addr[port].addr_bytes[3],
380                         vmdq_ports_eth_addr[port].addr_bytes[4],
381                         vmdq_ports_eth_addr[port].addr_bytes[5]);
382
383         return 0;
384 }
385
386 /*
387  * Set socket file path.
388  */
389 static int
390 us_vhost_parse_socket_path(const char *q_arg)
391 {
392         char *old;
393
394         /* reject paths that do not fit within PATH_MAX */
395         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
396                 return -1;
397
398         old = socket_files;
399         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
400         if (socket_files == NULL) {
401                 free(old);
402                 return -1;
403         }
404
405         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
406         nb_sockets++;
407
408         return 0;
409 }
410
411 /*
412  * Parse the portmask provided at run time.
413  */
414 static int
415 parse_portmask(const char *portmask)
416 {
417         char *end = NULL;
418         unsigned long pm;
419
420         errno = 0;
421
422         /* parse hexadecimal string */
423         pm = strtoul(portmask, &end, 16);
424         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
425                 return 0;
426
427         return pm;
428
429 }
430
431 /*
432  * Parse num options at run time.
433  */
434 static int
435 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
436 {
437         char *end = NULL;
438         unsigned long num;
439
440         errno = 0;
441
442         /* parse unsigned int string */
443         num = strtoul(q_arg, &end, 10);
444         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
445                 return -1;
446
447         if (num > max_valid_value)
448                 return -1;
449
450         return num;
451
452 }
453
454 /*
455  * Display usage
456  */
457 static void
458 us_vhost_usage(const char *prgname)
459 {
460         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
461         "               --vm2vm [0|1|2]\n"
462         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
463         "               --socket-file <path>\n"
464         "               --nb-devices ND\n"
465         "               -p PORTMASK: Set mask for ports to be used by application\n"
466         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
467         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
468         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective if rx retries are enabled\n"
469         "               --rx-retry-num [0-N]: the number of retries on rx. Only effective if rx retries are enabled\n"
470         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
471         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
472         "               --socket-file: The path of the socket file.\n"
473         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
474         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
475         "               --client register a vhost-user socket as client mode.\n"
476         "               --dma-type register the dma type for the vhost async driver; only \"ioat\" is supported for now.\n"
477         "               --dmas register a dma channel for a specific vhost device.\n",
478                prgname);
479 }
480
481 /*
482  * Parse the arguments given in the command line of the application.
483  */
484 static int
485 us_vhost_parse_args(int argc, char **argv)
486 {
487         int opt, ret;
488         int option_index;
489         unsigned i;
490         const char *prgname = argv[0];
491         static struct option long_option[] = {
492                 {"vm2vm", required_argument, NULL, 0},
493                 {"rx-retry", required_argument, NULL, 0},
494                 {"rx-retry-delay", required_argument, NULL, 0},
495                 {"rx-retry-num", required_argument, NULL, 0},
496                 {"mergeable", required_argument, NULL, 0},
497                 {"stats", required_argument, NULL, 0},
498                 {"socket-file", required_argument, NULL, 0},
499                 {"tx-csum", required_argument, NULL, 0},
500                 {"tso", required_argument, NULL, 0},
501                 {"client", no_argument, &client_mode, 1},
502                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
503                 {"dma-type", required_argument, NULL, 0},
504                 {"dmas", required_argument, NULL, 0},
505                 {NULL, 0, 0, 0},
506         };
507
508         /* Parse command line */
509         while ((opt = getopt_long(argc, argv, "p:P",
510                         long_option, &option_index)) != EOF) {
511                 switch (opt) {
512                 /* Portmask */
513                 case 'p':
514                         enabled_port_mask = parse_portmask(optarg);
515                         if (enabled_port_mask == 0) {
516                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
517                                 us_vhost_usage(prgname);
518                                 return -1;
519                         }
520                         break;
521
522                 case 'P':
523                         promiscuous = 1;
524                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
525                                 ETH_VMDQ_ACCEPT_BROADCAST |
526                                 ETH_VMDQ_ACCEPT_MULTICAST;
527
528                         break;
529
530                 case 0:
531                         /* Enable/disable vm2vm comms. */
532                         if (!strncmp(long_option[option_index].name, "vm2vm",
533                                 MAX_LONG_OPT_SZ)) {
534                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
535                                 if (ret == -1) {
536                                         RTE_LOG(INFO, VHOST_CONFIG,
537                                                 "Invalid argument for "
538                                                 "vm2vm [0|1|2]\n");
539                                         us_vhost_usage(prgname);
540                                         return -1;
541                                 } else {
542                                         vm2vm_mode = (vm2vm_type)ret;
543                                 }
544                         }
545
546                         /* Enable/disable retries on RX. */
547                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
548                                 ret = parse_num_opt(optarg, 1);
549                                 if (ret == -1) {
550                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
551                                         us_vhost_usage(prgname);
552                                         return -1;
553                                 } else {
554                                         enable_retry = ret;
555                                 }
556                         }
557
558                         /* Enable/disable TX checksum offload. */
559                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
560                                 ret = parse_num_opt(optarg, 1);
561                                 if (ret == -1) {
562                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
563                                         us_vhost_usage(prgname);
564                                         return -1;
565                                 } else
566                                         enable_tx_csum = ret;
567                         }
568
569                         /* Enable/disable TSO offload. */
570                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
571                                 ret = parse_num_opt(optarg, 1);
572                                 if (ret == -1) {
573                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
574                                         us_vhost_usage(prgname);
575                                         return -1;
576                                 } else
577                                         enable_tso = ret;
578                         }
579
580                         /* Specify the retry delay time (in microseconds) on RX. */
581                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
582                                 ret = parse_num_opt(optarg, INT32_MAX);
583                                 if (ret == -1) {
584                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
585                                         us_vhost_usage(prgname);
586                                         return -1;
587                                 } else {
588                                         burst_rx_delay_time = ret;
589                                 }
590                         }
591
592                         /* Specify the retries number on RX. */
593                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
594                                 ret = parse_num_opt(optarg, INT32_MAX);
595                                 if (ret == -1) {
596                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
597                                         us_vhost_usage(prgname);
598                                         return -1;
599                                 } else {
600                                         burst_rx_retry_num = ret;
601                                 }
602                         }
603
604                         /* Enable/disable RX mergeable buffers. */
605                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
606                                 ret = parse_num_opt(optarg, 1);
607                                 if (ret == -1) {
608                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
609                                         us_vhost_usage(prgname);
610                                         return -1;
611                                 } else {
612                                         mergeable = !!ret;
613                                         if (ret) {
614                                                 vmdq_conf_default.rxmode.offloads |=
615                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
616                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
617                                                         = JUMBO_FRAME_MAX_SIZE;
618                                         }
619                                 }
620                         }
621
622                         /* Enable/disable stats. */
623                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
624                                 ret = parse_num_opt(optarg, INT32_MAX);
625                                 if (ret == -1) {
626                                         RTE_LOG(INFO, VHOST_CONFIG,
627                                                 "Invalid argument for stats [0..N]\n");
628                                         us_vhost_usage(prgname);
629                                         return -1;
630                                 } else {
631                                         enable_stats = ret;
632                                 }
633                         }
634
635                         /* Set socket file path. */
636                         if (!strncmp(long_option[option_index].name,
637                                                 "socket-file", MAX_LONG_OPT_SZ)) {
638                                 if (us_vhost_parse_socket_path(optarg) == -1) {
639                                         RTE_LOG(INFO, VHOST_CONFIG,
640                                         "Invalid argument for socket name (Max %d characters)\n",
641                                         PATH_MAX);
642                                         us_vhost_usage(prgname);
643                                         return -1;
644                                 }
645                         }
646
647                         if (!strncmp(long_option[option_index].name,
648                                                 "dma-type", MAX_LONG_OPT_SZ)) {
649                                 if (strlen(optarg) >= MAX_LONG_OPT_SZ) {
650                                         RTE_LOG(INFO, VHOST_CONFIG,
651                                                 "Wrong DMA type\n");
652                                         us_vhost_usage(prgname);
653                                         return -1;
654                                 }
655                                 strcpy(dma_type, optarg);
656                         }
657
658                         if (!strncmp(long_option[option_index].name,
659                                                 "dmas", MAX_LONG_OPT_SZ)) {
660                                 if (open_dma(optarg) == -1) {
661                                         RTE_LOG(INFO, VHOST_CONFIG,
662                                                 "Wrong DMA args\n");
663                                         us_vhost_usage(prgname);
664                                         return -1;
665                                 }
666                                 async_vhost_driver = 1;
667                         }
668
669                         break;
670
671                         /* Invalid option - print options. */
672                 default:
673                         us_vhost_usage(prgname);
674                         return -1;
675                 }
676         }
677
678         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
679                 if (enabled_port_mask & (1 << i))
680                         ports[num_ports++] = i;
681         }
682
683         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
684                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
685                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
686                 return -1;
687         }
688
689         return 0;
690 }
691
692 /*
693  * Update the global variable num_ports and the array ports according to the
694  * number of system ports, and return the number of valid ports.
695  */
696 static unsigned check_ports_num(unsigned nb_ports)
697 {
698         unsigned valid_num_ports = num_ports;
699         unsigned portid;
700
701         if (num_ports > nb_ports) {
702                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
703                         num_ports, nb_ports);
704                 num_ports = nb_ports;
705         }
706
707         for (portid = 0; portid < num_ports; portid ++) {
708                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
709                         RTE_LOG(INFO, VHOST_PORT,
710                                 "\nSpecified port ID(%u) is not valid\n",
711                                 ports[portid]);
712                         ports[portid] = INVALID_PORT_ID;
713                         valid_num_ports--;
714                 }
715         }
716         return valid_num_ports;
717 }
718
719 static __rte_always_inline struct vhost_dev *
720 find_vhost_dev(struct rte_ether_addr *mac)
721 {
722         struct vhost_dev *vdev;
723
724         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
725                 if (vdev->ready == DEVICE_RX &&
726                     rte_is_same_ether_addr(mac, &vdev->mac_address))
727                         return vdev;
728         }
729
730         return NULL;
731 }
732
733 /*
734  * This function learns the MAC address of the device and registers it,
735  * along with a vlan tag, in the VMDQ.
736  */
737 static int
738 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
739 {
740         struct rte_ether_hdr *pkt_hdr;
741         int i, ret;
742
743         /* Learn MAC address of guest device from packet */
744         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
745
746         if (find_vhost_dev(&pkt_hdr->s_addr)) {
747                 RTE_LOG(ERR, VHOST_DATA,
748                         "(%d) device is using a registered MAC!\n",
749                         vdev->vid);
750                 return -1;
751         }
752
753         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
754                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
755
756         /* vlan_tag currently uses the device_id. */
757         vdev->vlan_tag = vlan_tags[vdev->vid];
758
759         /* Print out VMDQ registration info. */
760         RTE_LOG(INFO, VHOST_DATA,
761                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
762                 vdev->vid,
763                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
764                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
765                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
766                 vdev->vlan_tag);
767
768         /* Register the MAC address. */
769         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
770                                 (uint32_t)vdev->vid + vmdq_pool_base);
771         if (ret)
772                 RTE_LOG(ERR, VHOST_DATA,
773                         "(%d) failed to add device MAC address to VMDQ\n",
774                         vdev->vid);
775
776         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
777
778         /* Set device as ready for RX. */
779         vdev->ready = DEVICE_RX;
780
781         return 0;
782 }
783
784 /*
785  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
786  * queue before disabling RX on the device.
787  */
788 static inline void
789 unlink_vmdq(struct vhost_dev *vdev)
790 {
791         unsigned i = 0;
792         unsigned rx_count;
793         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
794
795         if (vdev->ready == DEVICE_RX) {
796                 /*clear MAC and VLAN settings*/
797                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
798                 for (i = 0; i < 6; i++)
799                         vdev->mac_address.addr_bytes[i] = 0;
800
801                 vdev->vlan_tag = 0;
802
803                 /*Clear out the receive buffers*/
804                 rx_count = rte_eth_rx_burst(ports[0],
805                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
806
807                 while (rx_count) {
808                         for (i = 0; i < rx_count; i++)
809                                 rte_pktmbuf_free(pkts_burst[i]);
810
811                         rx_count = rte_eth_rx_burst(ports[0],
812                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
813                 }
814
815                 vdev->ready = DEVICE_MAC_LEARNING;
816         }
817 }
818
819 static inline void
820 free_pkts(struct rte_mbuf **pkts, uint16_t n)
821 {
822         while (n--)
823                 rte_pktmbuf_free(pkts[n]);
824 }
825
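/*
 * Poll the async channel for enqueue operations that have completed
 * (i.e. finished DMA copies) and free the corresponding mbufs.
 */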
826 static __rte_always_inline void
827 complete_async_pkts(struct vhost_dev *vdev)
828 {
829         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
830         uint16_t complete_count;
831
832         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
833                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
834         if (complete_count)
835                 free_pkts(p_cpl, complete_count);
836 }
837
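/*
 * Enqueue a single packet from src_vdev directly into dst_vdev's RX
 * virtqueue; used for VM2VM traffic such as broadcasts. Updates the
 * per-device stats when stats are enabled.
 */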
838 static __rte_always_inline void
839 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
840             struct rte_mbuf *m)
841 {
842         uint16_t ret;
843
844         if (builtin_net_driver) {
845                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
846         } else {
847                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
848         }
849
850         if (enable_stats) {
851                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
852                                 __ATOMIC_SEQ_CST);
853                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
854                                 __ATOMIC_SEQ_CST);
855                 src_vdev->stats.tx_total++;
856                 src_vdev->stats.tx += ret;
857         }
858 }
859
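/*
 * Flush this lcore's TX buffer for one vhost device into its RX virtqueue,
 * using the builtin net driver, the async (DMA) path or the standard
 * enqueue path depending on the configuration.
 */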
860 static __rte_always_inline void
861 drain_vhost(struct vhost_dev *vdev)
862 {
863         uint16_t ret;
864         uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
865         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
866         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
867
868         if (builtin_net_driver) {
869                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
870         } else if (async_vhost_driver) {
871                 uint32_t cpu_cpl_nr = 0;
872                 uint16_t enqueue_fail = 0;
873                 struct rte_mbuf *m_cpu_cpl[nr_xmit];
874
875                 complete_async_pkts(vdev);
876                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
877                                         m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
878
879                 if (cpu_cpl_nr)
880                         free_pkts(m_cpu_cpl, cpu_cpl_nr);
881
882                 enqueue_fail = nr_xmit - ret;
883                 if (enqueue_fail)
884                         free_pkts(&m[ret], nr_xmit - ret);
885         } else {
886                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
887                                                 m, nr_xmit);
888         }
889
890         if (enable_stats) {
891                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
892                                 __ATOMIC_SEQ_CST);
893                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
894                                 __ATOMIC_SEQ_CST);
895         }
896
897         if (!async_vhost_driver)
898                 free_pkts(m, nr_xmit);
899 }
900
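/*
 * Flush every vhost TX buffer owned by this lcore that has been idle for
 * longer than MBUF_TABLE_DRAIN_TSC, so that buffered packets are not held
 * back indefinitely under low traffic.
 */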
901 static __rte_always_inline void
902 drain_vhost_table(void)
903 {
904         uint16_t lcore_id = rte_lcore_id();
905         struct vhost_bufftable *vhost_txq;
906         struct vhost_dev *vdev;
907         uint64_t cur_tsc;
908
909         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
910                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
911                                                 + vdev->vid];
912
913                 cur_tsc = rte_rdtsc();
914                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
915                                 > MBUF_TABLE_DRAIN_TSC)) {
916                         RTE_LOG_DP(DEBUG, VHOST_DATA,
917                                 "Vhost TX queue drained after timeout with burst size %u\n",
918                                 vhost_txq->len);
919                         drain_vhost(vdev);
920                         vhost_txq->len = 0;
921                         vhost_txq->pre_tsc = cur_tsc;
922                 }
923         }
924 }
925
926 /*
927  * Check if the packet destination MAC address is for a local device. If so, put
928  * the packet on that device's RX queue. If not, return.
929  */
930 static __rte_always_inline int
931 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
932 {
933         struct rte_ether_hdr *pkt_hdr;
934         struct vhost_dev *dst_vdev;
935         struct vhost_bufftable *vhost_txq;
936         uint16_t lcore_id = rte_lcore_id();
937         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
938
939         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
940         if (!dst_vdev)
941                 return -1;
942
943         if (vdev->vid == dst_vdev->vid) {
944                 RTE_LOG_DP(DEBUG, VHOST_DATA,
945                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
946                         vdev->vid);
947                 return 0;
948         }
949
950         RTE_LOG_DP(DEBUG, VHOST_DATA,
951                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
952
953         if (unlikely(dst_vdev->remove)) {
954                 RTE_LOG_DP(DEBUG, VHOST_DATA,
955                         "(%d) device is marked for removal\n", dst_vdev->vid);
956                 return 0;
957         }
958
959         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
960         vhost_txq->m_table[vhost_txq->len++] = m;
961
962         if (enable_stats) {
963                 vdev->stats.tx_total++;
964                 vdev->stats.tx++;
965         }
966
967         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
968                 drain_vhost(dst_vdev);
969                 vhost_txq->len = 0;
970                 vhost_txq->pre_tsc = rte_rdtsc();
971         }
972         return 0;
973 }
974
975 /*
976  * Check if the destination MAC of a packet belongs to a local VM and,
977  * if so, get its vlan tag and the length offset.
978  */
979 static __rte_always_inline int
980 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
981         uint32_t *offset, uint16_t *vlan_tag)
982 {
983         struct vhost_dev *dst_vdev;
984         struct rte_ether_hdr *pkt_hdr =
985                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
986
987         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
988         if (!dst_vdev)
989                 return 0;
990
991         if (vdev->vid == dst_vdev->vid) {
992                 RTE_LOG_DP(DEBUG, VHOST_DATA,
993                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
994                         vdev->vid);
995                 return -1;
996         }
997
998         /*
999          * HW vlan strip will reduce the packet length by the
1000          * length of the vlan tag, so we need to restore the
1001          * packet length by adding it back.
1002          */
1003         *offset  = VLAN_HLEN;
1004         *vlan_tag = vlan_tags[vdev->vid];
1005
1006         RTE_LOG_DP(DEBUG, VHOST_DATA,
1007                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1008                 vdev->vid, dst_vdev->vid, *vlan_tag);
1009
1010         return 0;
1011 }
1012
1013 static uint16_t
1014 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1015 {
1016         if (ol_flags & PKT_TX_IPV4)
1017                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1018         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1019                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1020 }
1021
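/*
 * Prepare a TSO packet for transmission on the NIC: request IP checksum
 * offload for IPv4 and seed the TCP checksum with the pseudo-header sum.
 */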
1022 static void virtio_tx_offload(struct rte_mbuf *m)
1023 {
1024         void *l3_hdr;
1025         struct rte_ipv4_hdr *ipv4_hdr = NULL;
1026         struct rte_tcp_hdr *tcp_hdr = NULL;
1027         struct rte_ether_hdr *eth_hdr =
1028                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1029
1030         l3_hdr = (char *)eth_hdr + m->l2_len;
1031
1032         if (m->ol_flags & PKT_TX_IPV4) {
1033                 ipv4_hdr = l3_hdr;
1034                 ipv4_hdr->hdr_checksum = 0;
1035                 m->ol_flags |= PKT_TX_IP_CKSUM;
1036         }
1037
1038         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
1039         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1040 }
1041
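/* Transmit the buffered packets on the physical port and free any that the NIC did not accept. */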
1042 static __rte_always_inline void
1043 do_drain_mbuf_table(struct mbuf_table *tx_q)
1044 {
1045         uint16_t count;
1046
1047         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1048                                  tx_q->m_table, tx_q->len);
1049         if (unlikely(count < tx_q->len))
1050                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1051
1052         tx_q->len = 0;
1053 }
1054
1055 /*
1056  * This function routes the TX packet to the correct interface. This
1057  * may be a local device or the physical port.
1058  */
1059 static __rte_always_inline void
1060 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1061 {
1062         struct mbuf_table *tx_q;
1063         unsigned offset = 0;
1064         const uint16_t lcore_id = rte_lcore_id();
1065         struct rte_ether_hdr *nh;
1066
1067
1068         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1069         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1070                 struct vhost_dev *vdev2;
1071
1072                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1073                         if (vdev2 != vdev)
1074                                 sync_virtio_xmit(vdev2, vdev, m);
1075                 }
1076                 goto queue2nic;
1077         }
1078
1079         /*check if destination is local VM*/
1080         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1081                 return;
1082
1083         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1084                 if (unlikely(find_local_dest(vdev, m, &offset,
1085                                              &vlan_tag) != 0)) {
1086                         rte_pktmbuf_free(m);
1087                         return;
1088                 }
1089         }
1090
1091         RTE_LOG_DP(DEBUG, VHOST_DATA,
1092                 "(%d) TX: MAC address is external\n", vdev->vid);
1093
1094 queue2nic:
1095
1096         /*Add packet to the port tx queue*/
1097         tx_q = &lcore_tx_queue[lcore_id];
1098
1099         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1100         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1101                 /* Guest has inserted the vlan tag. */
1102                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1103                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1104                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1105                         (vh->vlan_tci != vlan_tag_be))
1106                         vh->vlan_tci = vlan_tag_be;
1107         } else {
1108                 m->ol_flags |= PKT_TX_VLAN_PKT;
1109
1110                 /*
1111                  * Find the right seg to adjust the data len when offset is
1112                  * bigger than tail room size.
1113                  */
1114                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1115                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1116                                 m->data_len += offset;
1117                         else {
1118                                 struct rte_mbuf *seg = m;
1119
1120                                 while ((seg->next != NULL) &&
1121                                         (offset > rte_pktmbuf_tailroom(seg)))
1122                                         seg = seg->next;
1123
1124                                 seg->data_len += offset;
1125                         }
1126                         m->pkt_len += offset;
1127                 }
1128
1129                 m->vlan_tci = vlan_tag;
1130         }
1131
1132         if (m->ol_flags & PKT_TX_TCP_SEG)
1133                 virtio_tx_offload(m);
1134
1135         tx_q->m_table[tx_q->len++] = m;
1136         if (enable_stats) {
1137                 vdev->stats.tx_total++;
1138                 vdev->stats.tx++;
1139         }
1140
1141         if (unlikely(tx_q->len == MAX_PKT_BURST))
1142                 do_drain_mbuf_table(tx_q);
1143 }
1144
1145
1146 static __rte_always_inline void
1147 drain_mbuf_table(struct mbuf_table *tx_q)
1148 {
1149         static uint64_t prev_tsc;
1150         uint64_t cur_tsc;
1151
1152         if (tx_q->len == 0)
1153                 return;
1154
1155         cur_tsc = rte_rdtsc();
1156         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1157                 prev_tsc = cur_tsc;
1158
1159                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1160                         "TX queue drained after timeout with burst size %u\n",
1161                         tx_q->len);
1162                 do_drain_mbuf_table(tx_q);
1163         }
1164 }
1165
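/*
 * Receive a burst from the VMDQ RX queue bound to this vhost device and
 * enqueue the packets into the guest's RX virtqueue, optionally retrying
 * when the virtqueue is short on free slots.
 */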
1166 static __rte_always_inline void
1167 drain_eth_rx(struct vhost_dev *vdev)
1168 {
1169         uint16_t rx_count, enqueue_count;
1170         struct rte_mbuf *pkts[MAX_PKT_BURST];
1171
1172         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1173                                     pkts, MAX_PKT_BURST);
1174
1175         if (!rx_count)
1176                 return;
1177
1178         /*
1179          * When "enable_retry" is set, wait and retry when there are not
1180          * enough free slots in the queue to hold @rx_count packets,
1181          * to reduce packet loss.
1182          */
1183         if (enable_retry &&
1184             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1185                         VIRTIO_RXQ))) {
1186                 uint32_t retry;
1187
1188                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1189                         rte_delay_us(burst_rx_delay_time);
1190                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1191                                         VIRTIO_RXQ))
1192                                 break;
1193                 }
1194         }
1195
1196         if (builtin_net_driver) {
1197                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1198                                                 pkts, rx_count);
1199         } else if (async_vhost_driver) {
1200                 uint32_t cpu_cpl_nr = 0;
1201                 uint16_t enqueue_fail = 0;
1202                 struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1203
1204                 complete_async_pkts(vdev);
1205                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1206                                         VIRTIO_RXQ, pkts, rx_count,
1207                                         m_cpu_cpl, &cpu_cpl_nr);
1208                 if (cpu_cpl_nr)
1209                         free_pkts(m_cpu_cpl, cpu_cpl_nr);
1210
1211                 enqueue_fail = rx_count - enqueue_count;
1212                 if (enqueue_fail)
1213                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1214
1215         } else {
1216                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1217                                                 pkts, rx_count);
1218         }
1219
1220         if (enable_stats) {
1221                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1222                                 __ATOMIC_SEQ_CST);
1223                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1224                                 __ATOMIC_SEQ_CST);
1225         }
1226
1227         if (!async_vhost_driver)
1228                 free_pkts(pkts, rx_count);
1229 }
1230
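/*
 * Dequeue packets from the guest's TX virtqueue and route each one either
 * to another local vhost device or out of the physical port.
 */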
1231 static __rte_always_inline void
1232 drain_virtio_tx(struct vhost_dev *vdev)
1233 {
1234         struct rte_mbuf *pkts[MAX_PKT_BURST];
1235         uint16_t count;
1236         uint16_t i;
1237
1238         if (builtin_net_driver) {
1239                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1240                                         pkts, MAX_PKT_BURST);
1241         } else {
1242                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1243                                         mbuf_pool, pkts, MAX_PKT_BURST);
1244         }
1245
1246         /* setup VMDq for the first packet */
1247         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1248                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
1249                         free_pkts(pkts, count);
                        /* the mbufs were just freed; do not route them below */
                        return;
                }
1250         }
1251
1252         for (i = 0; i < count; ++i)
1253                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1254 }
1255
1256 /*
1257  * Main function of vhost-switch. It basically does:
1258  *
1259  * for each vhost device {
1260  *    - drain_eth_rx()
1261  *
1262  *      Which drains the host eth Rx queue linked to the vhost device,
1263  *      and delivers all packets to the guest virtio Rx ring associated
1264  *      with this vhost device.
1265  *
1266  *    - drain_virtio_tx()
1267  *
1268  *      Which drains the guest virtio Tx queue and delivers all packets
1269  *      to the target, which could be another vhost device or the
1270  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1271  * }
1272  */
1273 static int
1274 switch_worker(void *arg __rte_unused)
1275 {
1276         unsigned i;
1277         unsigned lcore_id = rte_lcore_id();
1278         struct vhost_dev *vdev;
1279         struct mbuf_table *tx_q;
1280
1281         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1282
1283         tx_q = &lcore_tx_queue[lcore_id];
1284         for (i = 0; i < rte_lcore_count(); i++) {
1285                 if (lcore_ids[i] == lcore_id) {
1286                         tx_q->txq_id = i;
1287                         break;
1288                 }
1289         }
1290
1291         while(1) {
1292                 drain_mbuf_table(tx_q);
1293                 drain_vhost_table();
1294                 /*
1295                  * If requested, inform the configuration core that we have
1296                  * exited the linked list and that no devices are in use.
1297                  */
1298                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1299                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1300
1301                 /*
1302                  * Process vhost devices
1303                  */
1304                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1305                               lcore_vdev_entry) {
1306                         if (unlikely(vdev->remove)) {
1307                                 unlink_vmdq(vdev);
1308                                 vdev->ready = DEVICE_SAFE_REMOVE;
1309                                 continue;
1310                         }
1311
1312                         if (likely(vdev->ready == DEVICE_RX))
1313                                 drain_eth_rx(vdev);
1314
1315                         if (likely(!vdev->remove))
1316                                 drain_virtio_tx(vdev);
1317                 }
1318         }
1319
1320         return 0;
1321 }
1322
1323 /*
1324  * Remove a device from the specific data core linked list and from the
1325  * main linked list. Synchronization occurs through the use of the
1326  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1327  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1328  */
1329 static void
1330 destroy_device(int vid)
1331 {
1332         struct vhost_dev *vdev = NULL;
1333         int lcore;
1334         uint16_t i;
1335
1336         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1337                 if (vdev->vid == vid)
1338                         break;
1339         }
1340         if (!vdev)
1341                 return;
1342         /*set the remove flag. */
1343         vdev->remove = 1;
1344         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1345                 rte_pause();
1346         }
1347
1348         for (i = 0; i < RTE_MAX_LCORE; i++)
1349                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1350
1351         if (builtin_net_driver)
1352                 vs_vhost_net_remove(vdev);
1353
1354         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1355                      lcore_vdev_entry);
1356         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1357
1358
1359         /* Set the dev_removal_flag on each lcore. */
1360         RTE_LCORE_FOREACH_WORKER(lcore)
1361                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1362
1363         /*
1364          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1365          * we can be sure that they can no longer access the device removed
1366          * from the linked lists and that the devices are no longer in use.
1367          */
1368         RTE_LCORE_FOREACH_WORKER(lcore) {
1369                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1370                         rte_pause();
1371         }
1372
1373         lcore_info[vdev->coreid].device_num--;
1374
1375         RTE_LOG(INFO, VHOST_DATA,
1376                 "(%d) device has been removed from data core\n",
1377                 vdev->vid);
1378
1379         if (async_vhost_driver)
1380                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1381
1382         rte_free(vdev);
1383 }
1384
1385 /*
1386  * A new device is added to a data core. First the device is added to the main linked list
1387  * and then allocated to a specific data core.
1388  */
1389 static int
1390 new_device(int vid)
1391 {
1392         int lcore, core_add = 0;
1393         uint16_t i;
1394         uint32_t device_num_min = num_devices;
1395         struct vhost_dev *vdev;
1396         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1397         if (vdev == NULL) {
1398                 RTE_LOG(INFO, VHOST_DATA,
1399                         "(%d) couldn't allocate memory for vhost dev\n",
1400                         vid);
1401                 return -1;
1402         }
1403         vdev->vid = vid;
1404
1405         for (i = 0; i < RTE_MAX_LCORE; i++) {
1406                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1407                         = rte_zmalloc("vhost bufftable",
1408                                 sizeof(struct vhost_bufftable),
1409                                 RTE_CACHE_LINE_SIZE);
1410
1411                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1412                         RTE_LOG(INFO, VHOST_DATA,
1413                           "(%d) couldn't allocate memory for vhost TX\n", vid);
                             /* Free the buffers allocated so far and the device. */
                             while (i-- > 0)
                                     rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
                             rte_free(vdev);
1414                         return -1;
1415                 }
1416         }
1417
1418         if (builtin_net_driver)
1419                 vs_vhost_net_setup(vdev);
1420
1421         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1422         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1423
1424         /* Reset the ready flag. */
1425         vdev->ready = DEVICE_MAC_LEARNING;
1426         vdev->remove = 0;
1427
1428         /* Find a suitable lcore to add the device. */
1429         RTE_LCORE_FOREACH_WORKER(lcore) {
1430                 if (lcore_info[lcore].device_num < device_num_min) {
1431                         device_num_min = lcore_info[lcore].device_num;
1432                         core_add = lcore;
1433                 }
1434         }
1435         vdev->coreid = core_add;
1436
1437         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1438                           lcore_vdev_entry);
1439         lcore_info[vdev->coreid].device_num++;
1440
1441         /* Disable notifications. */
1442         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1443         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1444
1445         RTE_LOG(INFO, VHOST_DATA,
1446                 "(%d) device has been added to data core %d\n",
1447                 vid, vdev->coreid);
1448
1449         if (async_vhost_driver) {
1450                 struct rte_vhost_async_features f;
1451                 struct rte_vhost_async_channel_ops channel_ops;
1452
1453                 if (strncmp(dma_type, "ioat", 4) == 0) {
1454                         channel_ops.transfer_data = ioat_transfer_data_cb;
1455                         channel_ops.check_completed_copies =
1456                                 ioat_check_completed_copies_cb;
1457
1458                         f.async_inorder = 1;
1459                         f.async_threshold = 256;
1460
1461                         return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1462                                 f.intval, &channel_ops);
1463                 }
1464         }
1465
1466         return 0;
1467 }
1468
1469 /*
1470  * These callbacks allow devices to be added to the data core when their
1471  * configuration has fully completed.
1472  */
1473 static const struct vhost_device_ops virtio_net_device_ops =
1474 {
1475         .new_device =  new_device,
1476         .destroy_device = destroy_device,
1477 };
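
/*
 * For reference: these ops are handed to the vhost library in main() below,
 * e.g.
 *
 *     rte_vhost_driver_callback_register(file, &virtio_net_device_ops);
 *
 * after which the library invokes new_device() once a guest device becomes
 * ready and destroy_device() when it is stopped or disconnects.
 */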
1478
1479 /*
1480  * This thread wakes up periodically to print statistics if the user has
1481  * enabled them.
1482  */
1483 static void *
1484 print_stats(__rte_unused void *arg)
1485 {
1486         struct vhost_dev *vdev;
1487         uint64_t tx_dropped, rx_dropped;
1488         uint64_t tx, tx_total, rx, rx_total;
1489         const char clr[] = { 27, '[', '2', 'J', '\0' };
1490         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1491
1492         while (1) {
1493                 sleep(enable_stats);
1494
1495                 /* Clear screen and move to top left */
1496                 printf("%s%s\n", clr, top_left);
1497                 printf("Device statistics =================================\n");
1498
1499                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1500                         tx_total   = vdev->stats.tx_total;
1501                         tx         = vdev->stats.tx;
1502                         tx_dropped = tx_total - tx;
1503
1504                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1505                                 __ATOMIC_SEQ_CST);
1506                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1507                                 __ATOMIC_SEQ_CST);
1508                         rx_dropped = rx_total - rx;
1509
1510                         printf("Statistics for device %d\n"
1511                                 "-----------------------\n"
1512                                 "TX total:              %" PRIu64 "\n"
1513                                 "TX dropped:            %" PRIu64 "\n"
1514                                 "TX successful:         %" PRIu64 "\n"
1515                                 "RX total:              %" PRIu64 "\n"
1516                                 "RX dropped:            %" PRIu64 "\n"
1517                                 "RX successful:         %" PRIu64 "\n",
1518                                 vdev->vid,
1519                                 tx_total, tx_dropped, tx,
1520                                 rx_total, rx_dropped, rx);
1521                 }
1522
1523                 printf("===================================================\n");
1524
1525                 fflush(stdout);
1526         }
1527
1528         return NULL;
1529 }
1530
1531 static void
1532 unregister_drivers(int socket_num)
1533 {
1534         int i, ret;
1535
1536         for (i = 0; i < socket_num; i++) {
1537                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1538                 if (ret != 0)
1539                         RTE_LOG(ERR, VHOST_CONFIG,
1540                                 "Failed to unregister vhost driver for %s.\n",
1541                                 socket_files + i * PATH_MAX);
1542         }
1543 }
1544
1545 /* When we receive an INT signal, unregister the vhost driver. */
1546 static void
1547 sigint_handler(__rte_unused int signum)
1548 {
1549         /* Unregister vhost driver. */
1550         unregister_drivers(nb_sockets);
1551
1552         exit(0);
1553 }
1554
1555 /*
1556  * While creating an mbuf pool, one key thing is to figure out how
1557  * many mbuf entries are enough for our use. FYI, here are some
1558  * guidelines:
1559  *
1560  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1561  *
1562  * - For each switch core (a CPU core that does the packet switching), we
1563  *   also need to reserve mbufs for the packets received from the virtio
1564  *   Tx queue. How many is enough depends on the usage; normally it is a
1565  *   simple calculation like the following (worked example below):
1566  *
1567  *       MAX_PKT_BURST * max packet size / mbuf size
1568  *
1569  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1570  *
1571  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1572  *   mbufs for receiving the packets from the physical NIC device.
1573  *
1574  * - We also need to make sure, for each switch core, that we have
1575  *   allocated enough mbufs to fill up the mbuf cache.
1576  */
1577 static void
1578 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1579         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1580 {
1581         uint32_t nr_mbufs;
1582         uint32_t nr_mbufs_per_core;
1583         uint32_t mtu = 1500;
1584
1585         if (mergeable)
1586                 mtu = 9000;
1587         if (enable_tso)
1588                 mtu = 64 * 1024;
1589
1590         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1591                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1592         nr_mbufs_per_core += nr_rx_desc;
1593         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1594
1595         nr_mbufs  = nr_queues * nr_rx_desc;
1596         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1597         nr_mbufs *= nr_port;
1598
1599         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1600                                             nr_mbuf_cache, 0, mbuf_size,
1601                                             rte_socket_id());
1602         if (mbuf_pool == NULL)
1603                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1604 }
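
/*
 * Worked example for the sizing guidelines above (illustrative only; it
 * assumes MAX_PKT_BURST from main.h is 32, the default 128-byte
 * RTE_PKTMBUF_HEADROOM, and MBUF_DATA_SIZE of 2176 bytes). With mergeable
 * buffers enabled (mtu = 9000), nr_rx_desc = 1024 and nr_mbuf_cache = 128:
 *
 *     nr_mbufs_per_core  = (9000 + 2176) * 32 / (2176 - 128)  =  174
 *     nr_mbufs_per_core += 1024 (nr_rx_desc)                  = 1198
 *     nr_mbufs_per_core  = RTE_MAX(1198, 128)                 = 1198
 *
 * The pool then holds (nr_queues * nr_rx_desc + 1198 * nr_switch_core)
 * mbufs for each port.
 */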
1605
1606 /*
1607  * Main function, does initialisation and calls the per-lcore functions.
1608  */
1609 int
1610 main(int argc, char *argv[])
1611 {
1612         unsigned lcore_id, core_id = 0;
1613         unsigned nb_ports, valid_num_ports;
1614         int ret, i;
1615         uint16_t portid;
1616         static pthread_t tid;
1617         uint64_t flags = 0;
1618
1619         signal(SIGINT, sigint_handler);
1620
1621         /* init EAL */
1622         ret = rte_eal_init(argc, argv);
1623         if (ret < 0)
1624                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1625         argc -= ret;
1626         argv += ret;
1627
1628         /* parse app arguments */
1629         ret = us_vhost_parse_args(argc, argv);
1630         if (ret < 0)
1631                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1632
1633         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1634                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1635
1636                 if (rte_lcore_is_enabled(lcore_id))
1637                         lcore_ids[core_id++] = lcore_id;
1638         }
1639
1640         if (rte_lcore_count() > RTE_MAX_LCORE)
1641                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1642
1643         /* Get the number of physical ports. */
1644         nb_ports = rte_eth_dev_count_avail();
1645
1646         /*
1647          * Update the global var NUM_PORTS and the global array PORTS, and
1648          * get the value of VALID_NUM_PORTS from the number of system ports.
1649          */
1650         valid_num_ports = check_ports_num(nb_ports);
1651
1652         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1653                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1654                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1655                 return -1;
1656         }
1657
1658         /*
1659          * FIXME: here we are trying to allocate mbufs big enough for
1660          * @MAX_QUEUES, but the truth is we're never going to use that
1661          * many queues here. We probably should only do allocation for
1662          * those queues we are going to use.
1663          */
1664         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1665                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1666
1667         if (vm2vm_mode == VM2VM_HARDWARE) {
1668                 /* Enable VT loop back to let L2 switch to do it. */
1669                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1670                 RTE_LOG(DEBUG, VHOST_CONFIG,
1671                         "Enable loop back for L2 switch in vmdq.\n");
1672         }
1673
1674         /* initialize all ports */
1675         RTE_ETH_FOREACH_DEV(portid) {
1676                 /* skip ports that are not enabled */
1677                 if ((enabled_port_mask & (1 << portid)) == 0) {
1678                         RTE_LOG(INFO, VHOST_PORT,
1679                                 "Skipping disabled port %d\n", portid);
1680                         continue;
1681                 }
1682                 if (port_init(portid) != 0)
1683                         rte_exit(EXIT_FAILURE,
1684                                 "Cannot initialize network ports\n");
1685         }
1686
1687         /* Enable stats if the user option is set. */
1688         if (enable_stats) {
1689                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1690                                         print_stats, NULL);
1691                 if (ret < 0)
1692                         rte_exit(EXIT_FAILURE,
1693                                 "Cannot create print-stats thread\n");
1694         }
1695
1696         /* Launch all data cores. */
1697         RTE_LCORE_FOREACH_WORKER(lcore_id)
1698                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1699
1700         if (client_mode)
1701                 flags |= RTE_VHOST_USER_CLIENT;
1702
1703         /* Register vhost user driver to handle vhost messages. */
1704         for (i = 0; i < nb_sockets; i++) {
1705                 char *file = socket_files + i * PATH_MAX;
1706
1707                 if (async_vhost_driver)
1708                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1709
1710                 ret = rte_vhost_driver_register(file, flags);
1711                 if (ret != 0) {
1712                         unregister_drivers(i);
1713                         rte_exit(EXIT_FAILURE,
1714                                 "vhost driver register failure.\n");
1715                 }
1716
1717                 if (builtin_net_driver)
1718                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1719
1720                 if (mergeable == 0) {
1721                         rte_vhost_driver_disable_features(file,
1722                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1723                 }
1724
1725                 if (enable_tx_csum == 0) {
1726                         rte_vhost_driver_disable_features(file,
1727                                 1ULL << VIRTIO_NET_F_CSUM);
1728                 }
1729
1730                 if (enable_tso == 0) {
1731                         rte_vhost_driver_disable_features(file,
1732                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1733                         rte_vhost_driver_disable_features(file,
1734                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1735                         rte_vhost_driver_disable_features(file,
1736                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1737                         rte_vhost_driver_disable_features(file,
1738                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1739                 }
1740
1741                 if (promiscuous) {
1742                         rte_vhost_driver_enable_features(file,
1743                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1744                 }
1745
1746                 ret = rte_vhost_driver_callback_register(file,
1747                         &virtio_net_device_ops);
1748                 if (ret != 0) {
1749                         rte_exit(EXIT_FAILURE,
1750                                 "failed to register vhost driver callbacks.\n");
1751                 }
1752
1753                 if (rte_vhost_driver_start(file) < 0) {
1754                         rte_exit(EXIT_FAILURE,
1755                                 "failed to start vhost driver.\n");
1756                 }
1757         }
1758
1759         RTE_LCORE_FOREACH_WORKER(lcore_id)
1760                 rte_eal_wait_lcore(lcore_id);
1761
1762         return 0;
1764 }