examples/vhost: refactor vhost data path
[dpdk.git] examples/vhost/main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_vhost.h>
23 #include <rte_ip.h>
24 #include <rte_tcp.h>
25 #include <rte_pause.h>
26
27 #include "ioat.h"
28 #include "main.h"
29
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
39
40 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
41
42 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
44
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX                       1
50 #define DEVICE_SAFE_REMOVE      2
51
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55
56 #define INVALID_PORT_ID 0xFF
57
58 /* Maximum long option length for option parsing. */
59 #define MAX_LONG_OPT_SZ 64
60
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
63
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
66
67 /* number of devices/queues to support*/
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
70
71 static struct rte_mempool *mbuf_pool;
72 static int mergeable;
73
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
75 typedef enum {
76         VM2VM_DISABLED = 0,
77         VM2VM_SOFTWARE = 1,
78         VM2VM_HARDWARE = 2,
79         VM2VM_LAST
80 } vm2vm_type;
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
82
83 /* Enable stats. */
84 static uint32_t enable_stats = 0;
85 /* Enable retries on RX. */
86 static uint32_t enable_retry = 1;
87
88 /* Disable TX checksum offload */
89 static uint32_t enable_tx_csum;
90
91 /* Disable TSO offload */
92 static uint32_t enable_tso;
93
94 static int client_mode;
95
96 static int builtin_net_driver;
97
98 static int async_vhost_driver;
99
100 static char dma_type[MAX_LONG_OPT_SZ];
101
102 /* Specify timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110
111 /* Empty VMDQ configuration structure. Filled in programmatically. */
112 static struct rte_eth_conf vmdq_conf_default = {
113         .rxmode = {
114                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115                 .split_hdr_size = 0,
116                 /*
117                  * VLAN strip is necessary for 1G NICs such as the I350;
118                  * it fixes a bug where IPv4 forwarding in the guest cannot
119                  * forward packets from one virtio device to another.
120                  */
121                 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122         },
123
124         .txmode = {
125                 .mq_mode = ETH_MQ_TX_NONE,
126                 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
127                              DEV_TX_OFFLOAD_TCP_CKSUM |
128                              DEV_TX_OFFLOAD_VLAN_INSERT |
129                              DEV_TX_OFFLOAD_MULTI_SEGS |
130                              DEV_TX_OFFLOAD_TCP_TSO),
131         },
132         .rx_adv_conf = {
133                 /*
134                  * should be overridden separately in code with
135                  * appropriate values
136                  */
137                 .vmdq_rx_conf = {
138                         .nb_queue_pools = ETH_8_POOLS,
139                         .enable_default_pool = 0,
140                         .default_pool = 0,
141                         .nb_pool_maps = 0,
142                         .pool_map = {{0, 0},},
143                 },
144         },
145 };
146
147
148 static unsigned lcore_ids[RTE_MAX_LCORE];
149 static uint16_t ports[RTE_MAX_ETHPORTS];
150 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
151 static uint16_t num_pf_queues, num_vmdq_queues;
152 static uint16_t vmdq_pool_base, vmdq_queue_base;
153 static uint16_t queues_per_pool;
154
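/*
 * VLAN tags used for the VMDQ pools. Each vhost device is assigned
 * vlan_tags[vid] as its VMDQ VLAN tag when its MAC address is
 * registered in link_vmdq().
 */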
155 const uint16_t vlan_tags[] = {
156         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
157         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
158         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
159         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
160         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
161         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
162         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
163         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
164 };
165
166 /* ethernet addresses of ports */
167 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
168
169 static struct vhost_dev_tailq_list vhost_dev_list =
170         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
171
172 static struct lcore_info lcore_info[RTE_MAX_LCORE];
173
174 /* Used for queueing bursts of TX packets. */
175 struct mbuf_table {
176         unsigned len;
177         unsigned txq_id;
178         struct rte_mbuf *m_table[MAX_PKT_BURST];
179 };
180
181 struct vhost_bufftable {
182         uint32_t len;
183         uint64_t pre_tsc;
184         struct rte_mbuf *m_table[MAX_PKT_BURST];
185 };
186
187 /* TX queue for each data core. */
188 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
189
190 /*
191  * Vhost TX buffer for each data core.
192  * Every data core maintains a TX buffer for every vhost device,
193  * which is used to batch packet enqueues for higher performance.
194  */
195 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
196
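/*
 * Convert the ~100us Tx drain interval into TSC cycles: round the
 * cycles-per-microsecond rate up, then scale by BURST_TX_DRAIN_US.
 */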
197 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
198                                  / US_PER_S * BURST_TX_DRAIN_US)
199 #define VLAN_HLEN       4
200
201 static inline int
202 open_dma(const char *value)
203 {
204         if (strncmp(dma_type, "ioat", 4) == 0)
205                 return open_ioat(value);
206
207         return -1;
208 }
209
210 /*
211  * Builds up the correct configuration for VMDQ VLAN pool map
212  * according to the pool & queue limits.
213  */
214 static inline int
215 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
216 {
217         struct rte_eth_vmdq_rx_conf conf;
218         struct rte_eth_vmdq_rx_conf *def_conf =
219                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
220         unsigned i;
221
222         memset(&conf, 0, sizeof(conf));
223         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
224         conf.nb_pool_maps = num_devices;
225         conf.enable_loop_back = def_conf->enable_loop_back;
226         conf.rx_mode = def_conf->rx_mode;
227
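        /*
         * Map each VMDQ pool 1:1 to a VLAN tag: packets tagged with
         * vlan_tags[i] are steered to pool i (selected by bit i).
         */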
228         for (i = 0; i < conf.nb_pool_maps; i++) {
229                 conf.pool_map[i].vlan_id = vlan_tags[i];
230                 conf.pool_map[i].pools = (1UL << i);
231         }
232
233         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
234         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
235                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
236         return 0;
237 }
238
239 /*
240  * Initialises a given port using global settings, with the Rx buffers
241  * coming from the global mbuf_pool.
242  */
243 static inline int
244 port_init(uint16_t port)
245 {
246         struct rte_eth_dev_info dev_info;
247         struct rte_eth_conf port_conf;
248         struct rte_eth_rxconf *rxconf;
249         struct rte_eth_txconf *txconf;
250         int16_t rx_rings, tx_rings;
251         uint16_t rx_ring_size, tx_ring_size;
252         int retval;
253         uint16_t q;
254
255         /* The max pool number from dev_info will be used to validate the pool number specified on the command line */
256         retval = rte_eth_dev_info_get(port, &dev_info);
257         if (retval != 0) {
258                 RTE_LOG(ERR, VHOST_PORT,
259                         "Error during getting device (port %u) info: %s\n",
260                         port, strerror(-retval));
261
262                 return retval;
263         }
264
265         rxconf = &dev_info.default_rxconf;
266         txconf = &dev_info.default_txconf;
267         rxconf->rx_drop_en = 1;
268
269         /*configure the number of supported virtio devices based on VMDQ limits */
270         num_devices = dev_info.max_vmdq_pools;
271
272         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
273         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
274
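        /*
         * One physical Tx queue per lcore, so each worker core can drain
         * its own lcore_tx_queue without locking.
         */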
275         tx_rings = (uint16_t)rte_lcore_count();
276
277         /* Get port configuration. */
278         retval = get_eth_conf(&port_conf, num_devices);
279         if (retval < 0)
280                 return retval;
281         /* NIC queues are divided into pf queues and vmdq queues.  */
282         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
283         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
284         num_vmdq_queues = num_devices * queues_per_pool;
285         num_queues = num_pf_queues + num_vmdq_queues;
286         vmdq_queue_base = dev_info.vmdq_queue_base;
287         vmdq_pool_base  = dev_info.vmdq_pool_base;
288         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
289                 num_pf_queues, num_devices, queues_per_pool);
290
291         if (!rte_eth_dev_is_valid_port(port))
292                 return -1;
293
294         rx_rings = (uint16_t)dev_info.max_rx_queues;
295         if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
296                 port_conf.txmode.offloads |=
297                         DEV_TX_OFFLOAD_MBUF_FAST_FREE;
298         /* Configure ethernet device. */
299         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
300         if (retval != 0) {
301                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
302                         port, strerror(-retval));
303                 return retval;
304         }
305
306         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
307                 &tx_ring_size);
308         if (retval != 0) {
309                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
310                         "for port %u: %s.\n", port, strerror(-retval));
311                 return retval;
312         }
313         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
314                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
315                         "for Rx queues on port %u.\n", port);
316                 return -1;
317         }
318
319         /* Setup the queues. */
320         rxconf->offloads = port_conf.rxmode.offloads;
321         for (q = 0; q < rx_rings; q ++) {
322                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323                                                 rte_eth_dev_socket_id(port),
324                                                 rxconf,
325                                                 mbuf_pool);
326                 if (retval < 0) {
327                         RTE_LOG(ERR, VHOST_PORT,
328                                 "Failed to setup rx queue %u of port %u: %s.\n",
329                                 q, port, strerror(-retval));
330                         return retval;
331                 }
332         }
333         txconf->offloads = port_conf.txmode.offloads;
334         for (q = 0; q < tx_rings; q ++) {
335                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
336                                                 rte_eth_dev_socket_id(port),
337                                                 txconf);
338                 if (retval < 0) {
339                         RTE_LOG(ERR, VHOST_PORT,
340                                 "Failed to setup tx queue %u of port %u: %s.\n",
341                                 q, port, strerror(-retval));
342                         return retval;
343                 }
344         }
345
346         /* Start the device. */
347         retval  = rte_eth_dev_start(port);
348         if (retval < 0) {
349                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
350                         port, strerror(-retval));
351                 return retval;
352         }
353
354         if (promiscuous) {
355                 retval = rte_eth_promiscuous_enable(port);
356                 if (retval != 0) {
357                         RTE_LOG(ERR, VHOST_PORT,
358                                 "Failed to enable promiscuous mode on port %u: %s\n",
359                                 port, rte_strerror(-retval));
360                         return retval;
361                 }
362         }
363
364         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
365         if (retval < 0) {
366                 RTE_LOG(ERR, VHOST_PORT,
367                         "Failed to get MAC address on port %u: %s\n",
368                         port, rte_strerror(-retval));
369                 return retval;
370         }
371
372         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
373         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
374                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
375                         port,
376                         vmdq_ports_eth_addr[port].addr_bytes[0],
377                         vmdq_ports_eth_addr[port].addr_bytes[1],
378                         vmdq_ports_eth_addr[port].addr_bytes[2],
379                         vmdq_ports_eth_addr[port].addr_bytes[3],
380                         vmdq_ports_eth_addr[port].addr_bytes[4],
381                         vmdq_ports_eth_addr[port].addr_bytes[5]);
382
383         return 0;
384 }
385
386 /*
387  * Set socket file path.
388  */
389 static int
390 us_vhost_parse_socket_path(const char *q_arg)
391 {
392         char *old;
393
394         /* Reject paths that do not fit within PATH_MAX (no terminating NUL found) */
395         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
396                 return -1;
397
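        /*
         * socket_files is a flat array of PATH_MAX-sized slots, one per
         * vhost-user socket; grow it by one slot for the new path.
         */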
398         old = socket_files;
399         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
400         if (socket_files == NULL) {
401                 free(old);
402                 return -1;
403         }
404
405         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
406         nb_sockets++;
407
408         return 0;
409 }
410
411 /*
412  * Parse the portmask provided at run time.
413  */
414 static int
415 parse_portmask(const char *portmask)
416 {
417         char *end = NULL;
418         unsigned long pm;
419
420         errno = 0;
421
422         /* parse hexadecimal string */
423         pm = strtoul(portmask, &end, 16);
424         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
425                 return 0;
426
427         return pm;
428
429 }
430
431 /*
432  * Parse num options at run time.
433  */
434 static int
435 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
436 {
437         char *end = NULL;
438         unsigned long num;
439
440         errno = 0;
441
442         /* parse unsigned int string */
443         num = strtoul(q_arg, &end, 10);
444         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
445                 return -1;
446
447         if (num > max_valid_value)
448                 return -1;
449
450         return num;
451
452 }
453
454 /*
455  * Display usage
456  */
457 static void
458 us_vhost_usage(const char *prgname)
459 {
460         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
461         "               --vm2vm [0|1|2]\n"
462         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
463         "               --socket-file <path>\n"
464         "               --nb-devices ND\n"
465         "               -p PORTMASK: Set mask for ports to be used by application\n"
466         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
467         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
468         "               --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
469         "               --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
470         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
471         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
472         "               --socket-file: The path of the socket file.\n"
473         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
474         "               --tso [0|1] disable/enable TCP segment offload.\n"
475         "               --client register a vhost-user socket as client mode.\n"
476         "               --dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
477         "               --dmas register dma channel for specific vhost device.\n",
478                prgname);
479 }
480
481 /*
482  * Parse the arguments given in the command line of the application.
483  */
484 static int
485 us_vhost_parse_args(int argc, char **argv)
486 {
487         int opt, ret;
488         int option_index;
489         unsigned i;
490         const char *prgname = argv[0];
491         static struct option long_option[] = {
492                 {"vm2vm", required_argument, NULL, 0},
493                 {"rx-retry", required_argument, NULL, 0},
494                 {"rx-retry-delay", required_argument, NULL, 0},
495                 {"rx-retry-num", required_argument, NULL, 0},
496                 {"mergeable", required_argument, NULL, 0},
497                 {"stats", required_argument, NULL, 0},
498                 {"socket-file", required_argument, NULL, 0},
499                 {"tx-csum", required_argument, NULL, 0},
500                 {"tso", required_argument, NULL, 0},
501                 {"client", no_argument, &client_mode, 1},
502                 {"builtin-net-driver", no_argument, &builtin_net_driver, 1},
503                 {"dma-type", required_argument, NULL, 0},
504                 {"dmas", required_argument, NULL, 0},
505                 {NULL, 0, 0, 0},
506         };
507
508         /* Parse command line */
509         while ((opt = getopt_long(argc, argv, "p:P",
510                         long_option, &option_index)) != EOF) {
511                 switch (opt) {
512                 /* Portmask */
513                 case 'p':
514                         enabled_port_mask = parse_portmask(optarg);
515                         if (enabled_port_mask == 0) {
516                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
517                                 us_vhost_usage(prgname);
518                                 return -1;
519                         }
520                         break;
521
522                 case 'P':
523                         promiscuous = 1;
524                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
525                                 ETH_VMDQ_ACCEPT_BROADCAST |
526                                 ETH_VMDQ_ACCEPT_MULTICAST;
527
528                         break;
529
530                 case 0:
531                         /* Enable/disable vm2vm comms. */
532                         if (!strncmp(long_option[option_index].name, "vm2vm",
533                                 MAX_LONG_OPT_SZ)) {
534                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
535                                 if (ret == -1) {
536                                         RTE_LOG(INFO, VHOST_CONFIG,
537                                                 "Invalid argument for "
538                                                 "vm2vm [0|1|2]\n");
539                                         us_vhost_usage(prgname);
540                                         return -1;
541                                 } else {
542                                         vm2vm_mode = (vm2vm_type)ret;
543                                 }
544                         }
545
546                         /* Enable/disable retries on RX. */
547                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
548                                 ret = parse_num_opt(optarg, 1);
549                                 if (ret == -1) {
550                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
551                                         us_vhost_usage(prgname);
552                                         return -1;
553                                 } else {
554                                         enable_retry = ret;
555                                 }
556                         }
557
558                         /* Enable/disable TX checksum offload. */
559                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
560                                 ret = parse_num_opt(optarg, 1);
561                                 if (ret == -1) {
562                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
563                                         us_vhost_usage(prgname);
564                                         return -1;
565                                 } else
566                                         enable_tx_csum = ret;
567                         }
568
569                         /* Enable/disable TSO offload. */
570                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
571                                 ret = parse_num_opt(optarg, 1);
572                                 if (ret == -1) {
573                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
574                                         us_vhost_usage(prgname);
575                                         return -1;
576                                 } else
577                                         enable_tso = ret;
578                         }
579
580                         /* Specify the retry delay time (in microseconds) on RX. */
581                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
582                                 ret = parse_num_opt(optarg, INT32_MAX);
583                                 if (ret == -1) {
584                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
585                                         us_vhost_usage(prgname);
586                                         return -1;
587                                 } else {
588                                         burst_rx_delay_time = ret;
589                                 }
590                         }
591
592                         /* Specify the retries number on RX. */
593                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
594                                 ret = parse_num_opt(optarg, INT32_MAX);
595                                 if (ret == -1) {
596                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
597                                         us_vhost_usage(prgname);
598                                         return -1;
599                                 } else {
600                                         burst_rx_retry_num = ret;
601                                 }
602                         }
603
604                         /* Enable/disable RX mergeable buffers. */
605                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
606                                 ret = parse_num_opt(optarg, 1);
607                                 if (ret == -1) {
608                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
609                                         us_vhost_usage(prgname);
610                                         return -1;
611                                 } else {
612                                         mergeable = !!ret;
613                                         if (ret) {
614                                                 vmdq_conf_default.rxmode.offloads |=
615                                                         DEV_RX_OFFLOAD_JUMBO_FRAME;
616                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
617                                                         = JUMBO_FRAME_MAX_SIZE;
618                                         }
619                                 }
620                         }
621
622                         /* Enable/disable stats. */
623                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
624                                 ret = parse_num_opt(optarg, INT32_MAX);
625                                 if (ret == -1) {
626                                         RTE_LOG(INFO, VHOST_CONFIG,
627                                                 "Invalid argument for stats [0..N]\n");
628                                         us_vhost_usage(prgname);
629                                         return -1;
630                                 } else {
631                                         enable_stats = ret;
632                                 }
633                         }
634
635                         /* Set socket file path. */
636                         if (!strncmp(long_option[option_index].name,
637                                                 "socket-file", MAX_LONG_OPT_SZ)) {
638                                 if (us_vhost_parse_socket_path(optarg) == -1) {
639                                         RTE_LOG(INFO, VHOST_CONFIG,
640                                         "Invalid argument for socket name (Max %d characters)\n",
641                                         PATH_MAX);
642                                         us_vhost_usage(prgname);
643                                         return -1;
644                                 }
645                         }
646
647                         if (!strncmp(long_option[option_index].name,
648                                                 "dma-type", MAX_LONG_OPT_SZ)) {
649                                 if (strlen(optarg) >= MAX_LONG_OPT_SZ) {
650                                         RTE_LOG(INFO, VHOST_CONFIG,
651                                                 "Wrong DMA type\n");
652                                         us_vhost_usage(prgname);
653                                         return -1;
654                                 }
655                                 strcpy(dma_type, optarg);
656                         }
657
658                         if (!strncmp(long_option[option_index].name,
659                                                 "dmas", MAX_LONG_OPT_SZ)) {
660                                 if (open_dma(optarg) == -1) {
661                                         RTE_LOG(INFO, VHOST_CONFIG,
662                                                 "Wrong DMA args\n");
663                                         us_vhost_usage(prgname);
664                                         return -1;
665                                 }
666                                 async_vhost_driver = 1;
667                         }
668
669                         break;
670
671                         /* Invalid option - print options. */
672                 default:
673                         us_vhost_usage(prgname);
674                         return -1;
675                 }
676         }
677
678         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
679                 if (enabled_port_mask & (1 << i))
680                         ports[num_ports++] = i;
681         }
682
683         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
684                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
685                         "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
686                 return -1;
687         }
688
689         return 0;
690 }
691
692 /*
693  * Update the global variable num_ports and the ports array according to the
694  * number of system ports, and return the number of valid ports.
695  */
696 static unsigned check_ports_num(unsigned nb_ports)
697 {
698         unsigned valid_num_ports = num_ports;
699         unsigned portid;
700
701         if (num_ports > nb_ports) {
702                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
703                         num_ports, nb_ports);
704                 num_ports = nb_ports;
705         }
706
707         for (portid = 0; portid < num_ports; portid ++) {
708                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
709                         RTE_LOG(INFO, VHOST_PORT,
710                                 "\nSpecified port ID(%u) is not valid\n",
711                                 ports[portid]);
712                         ports[portid] = INVALID_PORT_ID;
713                         valid_num_ports--;
714                 }
715         }
716         return valid_num_ports;
717 }
718
719 static __rte_always_inline struct vhost_dev *
720 find_vhost_dev(struct rte_ether_addr *mac)
721 {
722         struct vhost_dev *vdev;
723
724         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
725                 if (vdev->ready == DEVICE_RX &&
726                     rte_is_same_ether_addr(mac, &vdev->mac_address))
727                         return vdev;
728         }
729
730         return NULL;
731 }
732
733 /*
734  * This function learns the MAC address of the device and registers it, along
735  * with a VLAN tag, in the VMDQ.
736  */
737 static int
738 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
739 {
740         struct rte_ether_hdr *pkt_hdr;
741         int i, ret;
742
743         /* Learn MAC address of guest device from packet */
744         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
745
746         if (find_vhost_dev(&pkt_hdr->s_addr)) {
747                 RTE_LOG(ERR, VHOST_DATA,
748                         "(%d) device is using a registered MAC!\n",
749                         vdev->vid);
750                 return -1;
751         }
752
753         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
754                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
755
756         /* vlan_tag currently uses the device_id. */
757         vdev->vlan_tag = vlan_tags[vdev->vid];
758
759         /* Print out VMDQ registration info. */
760         RTE_LOG(INFO, VHOST_DATA,
761                 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
762                 vdev->vid,
763                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
764                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
765                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
766                 vdev->vlan_tag);
767
768         /* Register the MAC address. */
769         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
770                                 (uint32_t)vdev->vid + vmdq_pool_base);
771         if (ret)
772                 RTE_LOG(ERR, VHOST_DATA,
773                         "(%d) failed to add device MAC address to VMDQ\n",
774                         vdev->vid);
775
776         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
777
778         /* Set device as ready for RX. */
779         vdev->ready = DEVICE_RX;
780
781         return 0;
782 }
783
784 /*
785  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
786  * queue before disabling RX on the device.
787  */
788 static inline void
789 unlink_vmdq(struct vhost_dev *vdev)
790 {
791         unsigned i = 0;
792         unsigned rx_count;
793         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
794
795         if (vdev->ready == DEVICE_RX) {
796                 /*clear MAC and VLAN settings*/
797                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
798                 for (i = 0; i < 6; i++)
799                         vdev->mac_address.addr_bytes[i] = 0;
800
801                 vdev->vlan_tag = 0;
802
803                 /*Clear out the receive buffers*/
804                 rx_count = rte_eth_rx_burst(ports[0],
805                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
806
807                 while (rx_count) {
808                         for (i = 0; i < rx_count; i++)
809                                 rte_pktmbuf_free(pkts_burst[i]);
810
811                         rx_count = rte_eth_rx_burst(ports[0],
812                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
813                 }
814
815                 vdev->ready = DEVICE_MAC_LEARNING;
816         }
817 }
818
819 static inline void
820 free_pkts(struct rte_mbuf **pkts, uint16_t n)
821 {
822         while (n--)
823                 rte_pktmbuf_free(pkts[n]);
824 }
825
826 static __rte_always_inline void
827 complete_async_pkts(struct vhost_dev *vdev)
828 {
829         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
830         uint16_t complete_count;
831
832         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
833                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
834         if (complete_count) {
835                 __atomic_sub_fetch(&vdev->nr_async_pkts, complete_count,
836                         __ATOMIC_SEQ_CST);
837                 free_pkts(p_cpl, complete_count);
838         }
839 }
840
841 static __rte_always_inline void
842 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
843             struct rte_mbuf *m)
844 {
845         uint16_t ret;
846
847         if (builtin_net_driver) {
848                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
849         } else {
850                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
851         }
852
853         if (enable_stats) {
854                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
855                                 __ATOMIC_SEQ_CST);
856                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
857                                 __ATOMIC_SEQ_CST);
858                 src_vdev->stats.tx_total++;
859                 src_vdev->stats.tx += ret;
860         }
861 }
862
863 static __rte_always_inline void
864 drain_vhost(struct vhost_dev *vdev)
865 {
866         uint16_t ret;
867         uint64_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
868         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
869         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
870
871         if (builtin_net_driver) {
872                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
873         } else if (async_vhost_driver) {
874                 uint32_t cpu_cpl_nr = 0;
875                 uint16_t enqueue_fail = 0;
876                 struct rte_mbuf *m_cpu_cpl[nr_xmit];
877
878                 complete_async_pkts(vdev);
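                /*
                 * Submit the burst to the async channel. Packets returned in
                 * m_cpu_cpl were completed within this call (CPU copy) and can
                 * be freed immediately; the rest remain in flight on the DMA
                 * channel and are reclaimed later by complete_async_pkts().
                 */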
879                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
880                                         m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
881                 __atomic_add_fetch(&vdev->nr_async_pkts, ret - cpu_cpl_nr,
882                                 __ATOMIC_SEQ_CST);
883
884                 if (cpu_cpl_nr)
885                         free_pkts(m_cpu_cpl, cpu_cpl_nr);
886
887                 enqueue_fail = nr_xmit - ret;
888                 if (enqueue_fail)
889                         free_pkts(&m[ret], nr_xmit - ret);
890         } else {
891                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
892                                                 m, nr_xmit);
893         }
894
895         if (enable_stats) {
896                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
897                                 __ATOMIC_SEQ_CST);
898                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
899                                 __ATOMIC_SEQ_CST);
900         }
901
902         if (!async_vhost_driver)
903                 free_pkts(m, nr_xmit);
904 }
905
906 static __rte_always_inline void
907 drain_vhost_table(void)
908 {
909         uint16_t lcore_id = rte_lcore_id();
910         struct vhost_bufftable *vhost_txq;
911         struct vhost_dev *vdev;
912         uint64_t cur_tsc;
913
914         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
915                 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
916                                                 + vdev->vid];
917
918                 cur_tsc = rte_rdtsc();
919                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
920                                 > MBUF_TABLE_DRAIN_TSC)) {
921                         RTE_LOG_DP(DEBUG, VHOST_DATA,
922                                 "Vhost TX queue drained after timeout with burst size %u\n",
923                                 vhost_txq->len);
924                         drain_vhost(vdev);
925                         vhost_txq->len = 0;
926                         vhost_txq->pre_tsc = cur_tsc;
927                 }
928         }
929 }
930
931 /*
932  * Check if the packet destination MAC address is for a local device. If so then put
933  * the packet on that device's RX queue. If not then return.
934  */
935 static __rte_always_inline int
936 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
937 {
938         struct rte_ether_hdr *pkt_hdr;
939         struct vhost_dev *dst_vdev;
940         struct vhost_bufftable *vhost_txq;
941         uint16_t lcore_id = rte_lcore_id();
942         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
943
944         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
945         if (!dst_vdev)
946                 return -1;
947
948         if (vdev->vid == dst_vdev->vid) {
949                 RTE_LOG_DP(DEBUG, VHOST_DATA,
950                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
951                         vdev->vid);
952                 return 0;
953         }
954
955         RTE_LOG_DP(DEBUG, VHOST_DATA,
956                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
957
958         if (unlikely(dst_vdev->remove)) {
959                 RTE_LOG_DP(DEBUG, VHOST_DATA,
960                         "(%d) device is marked for removal\n", dst_vdev->vid);
961                 return 0;
962         }
963
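        /*
         * Buffer the packet in the per-(lcore, destination device) table;
         * it is flushed either when the burst fills up below or by the
         * periodic timeout in drain_vhost_table().
         */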
964         vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
965         vhost_txq->m_table[vhost_txq->len++] = m;
966
967         if (enable_stats) {
968                 vdev->stats.tx_total++;
969                 vdev->stats.tx++;
970         }
971
972         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
973                 drain_vhost(dst_vdev);
974                 vhost_txq->len = 0;
975                 vhost_txq->pre_tsc = rte_rdtsc();
976         }
977         return 0;
978 }
979
980 /*
981  * Check if the destination MAC of a packet belongs to a local VM,
982  * and if so get its VLAN tag and the length offset.
983  */
984 static __rte_always_inline int
985 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
986         uint32_t *offset, uint16_t *vlan_tag)
987 {
988         struct vhost_dev *dst_vdev;
989         struct rte_ether_hdr *pkt_hdr =
990                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
991
992         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
993         if (!dst_vdev)
994                 return 0;
995
996         if (vdev->vid == dst_vdev->vid) {
997                 RTE_LOG_DP(DEBUG, VHOST_DATA,
998                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
999                         vdev->vid);
1000                 return -1;
1001         }
1002
1003         /*
1004          * HW VLAN strip reduces the packet length by the
1005          * length of the VLAN tag, so restore the packet
1006          * length by adding it back.
1007          */
1008         *offset  = VLAN_HLEN;
1009         *vlan_tag = vlan_tags[vdev->vid];
1010
1011         RTE_LOG_DP(DEBUG, VHOST_DATA,
1012                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1013                 vdev->vid, dst_vdev->vid, *vlan_tag);
1014
1015         return 0;
1016 }
1017
1018 static uint16_t
1019 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1020 {
1021         if (ol_flags & PKT_TX_IPV4)
1022                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1023         else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1024                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1025 }
1026
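/*
 * Prepare a TSO packet for the NIC: zero the IPv4 header checksum and
 * request IP checksum offload (for IPv4 packets), then seed the TCP
 * checksum with the pseudo-header sum so the hardware can complete it.
 */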
1027 static void virtio_tx_offload(struct rte_mbuf *m)
1028 {
1029         void *l3_hdr;
1030         struct rte_ipv4_hdr *ipv4_hdr = NULL;
1031         struct rte_tcp_hdr *tcp_hdr = NULL;
1032         struct rte_ether_hdr *eth_hdr =
1033                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1034
1035         l3_hdr = (char *)eth_hdr + m->l2_len;
1036
1037         if (m->ol_flags & PKT_TX_IPV4) {
1038                 ipv4_hdr = l3_hdr;
1039                 ipv4_hdr->hdr_checksum = 0;
1040                 m->ol_flags |= PKT_TX_IP_CKSUM;
1041         }
1042
1043         tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
1044         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1045 }
1046
1047 static __rte_always_inline void
1048 do_drain_mbuf_table(struct mbuf_table *tx_q)
1049 {
1050         uint16_t count;
1051
1052         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1053                                  tx_q->m_table, tx_q->len);
1054         if (unlikely(count < tx_q->len))
1055                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1056
1057         tx_q->len = 0;
1058 }
1059
1060 /*
1061  * This function routes the TX packet to the correct interface. This
1062  * may be a local device or the physical port.
1063  */
1064 static __rte_always_inline void
1065 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1066 {
1067         struct mbuf_table *tx_q;
1068         unsigned offset = 0;
1069         const uint16_t lcore_id = rte_lcore_id();
1070         struct rte_ether_hdr *nh;
1071
1072
1073         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1074         if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1075                 struct vhost_dev *vdev2;
1076
1077                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1078                         if (vdev2 != vdev)
1079                                 sync_virtio_xmit(vdev2, vdev, m);
1080                 }
1081                 goto queue2nic;
1082         }
1083
1084         /*check if destination is local VM*/
1085         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1086                 return;
1087
1088         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1089                 if (unlikely(find_local_dest(vdev, m, &offset,
1090                                              &vlan_tag) != 0)) {
1091                         rte_pktmbuf_free(m);
1092                         return;
1093                 }
1094         }
1095
1096         RTE_LOG_DP(DEBUG, VHOST_DATA,
1097                 "(%d) TX: MAC address is external\n", vdev->vid);
1098
1099 queue2nic:
1100
1101         /*Add packet to the port tx queue*/
1102         tx_q = &lcore_tx_queue[lcore_id];
1103
1104         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1105         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1106                 /* Guest has inserted the vlan tag. */
1107                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1108                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1109                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1110                         (vh->vlan_tci != vlan_tag_be))
1111                         vh->vlan_tci = vlan_tag_be;
1112         } else {
1113                 m->ol_flags |= PKT_TX_VLAN_PKT;
1114
1115                 /*
1116                  * Find the right seg to adjust the data len when the offset is
1117                  * larger than the tailroom size.
1118                  */
1119                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1120                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1121                                 m->data_len += offset;
1122                         else {
1123                                 struct rte_mbuf *seg = m;
1124
1125                                 while ((seg->next != NULL) &&
1126                                         (offset > rte_pktmbuf_tailroom(seg)))
1127                                         seg = seg->next;
1128
1129                                 seg->data_len += offset;
1130                         }
1131                         m->pkt_len += offset;
1132                 }
1133
1134                 m->vlan_tci = vlan_tag;
1135         }
1136
1137         if (m->ol_flags & PKT_TX_TCP_SEG)
1138                 virtio_tx_offload(m);
1139
1140         tx_q->m_table[tx_q->len++] = m;
1141         if (enable_stats) {
1142                 vdev->stats.tx_total++;
1143                 vdev->stats.tx++;
1144         }
1145
1146         if (unlikely(tx_q->len == MAX_PKT_BURST))
1147                 do_drain_mbuf_table(tx_q);
1148 }
1149
1150
1151 static __rte_always_inline void
1152 drain_mbuf_table(struct mbuf_table *tx_q)
1153 {
1154         static uint64_t prev_tsc;
1155         uint64_t cur_tsc;
1156
1157         if (tx_q->len == 0)
1158                 return;
1159
1160         cur_tsc = rte_rdtsc();
1161         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1162                 prev_tsc = cur_tsc;
1163
1164                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1165                         "TX queue drained after timeout with burst size %u\n",
1166                         tx_q->len);
1167                 do_drain_mbuf_table(tx_q);
1168         }
1169 }
1170
1171 static __rte_always_inline void
1172 drain_eth_rx(struct vhost_dev *vdev)
1173 {
1174         uint16_t rx_count, enqueue_count;
1175         struct rte_mbuf *pkts[MAX_PKT_BURST];
1176
1177         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1178                                     pkts, MAX_PKT_BURST);
1179
1180         if (!rx_count)
1181                 return;
1182
1183         /*
1184          * When "enable_retry" is set, here we wait and retry when there
1185          * is no enough free slots in the queue to hold @rx_count packets,
1186          * to diminish packet loss.
1187          */
1188         if (enable_retry &&
1189             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1190                         VIRTIO_RXQ))) {
1191                 uint32_t retry;
1192
1193                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1194                         rte_delay_us(burst_rx_delay_time);
1195                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1196                                         VIRTIO_RXQ))
1197                                 break;
1198                 }
1199         }
1200
1201         if (builtin_net_driver) {
1202                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1203                                                 pkts, rx_count);
1204         } else if (async_vhost_driver) {
1205                 uint32_t cpu_cpl_nr = 0;
1206                 uint16_t enqueue_fail = 0;
1207                 struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1208
1209                 complete_async_pkts(vdev);
1210                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1211                                         VIRTIO_RXQ, pkts, rx_count,
1212                                         m_cpu_cpl, &cpu_cpl_nr);
1213                 __atomic_add_fetch(&vdev->nr_async_pkts,
1214                                         enqueue_count - cpu_cpl_nr,
1215                                         __ATOMIC_SEQ_CST);
1216                 if (cpu_cpl_nr)
1217                         free_pkts(m_cpu_cpl, cpu_cpl_nr);
1218
1219                 enqueue_fail = rx_count - enqueue_count;
1220                 if (enqueue_fail)
1221                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1222
1223         } else {
1224                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1225                                                 pkts, rx_count);
1226         }
1227
1228         if (enable_stats) {
1229                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1230                                 __ATOMIC_SEQ_CST);
1231                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1232                                 __ATOMIC_SEQ_CST);
1233         }
1234
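        /*
         * In the sync paths the enqueue copies packet data into the guest
         * ring, so the host mbufs can be freed here; in the async path the
         * mbufs still in flight are freed later by complete_async_pkts().
         */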
1235         if (!async_vhost_driver)
1236                 free_pkts(pkts, rx_count);
1237 }
1238
1239 static __rte_always_inline void
1240 drain_virtio_tx(struct vhost_dev *vdev)
1241 {
1242         struct rte_mbuf *pkts[MAX_PKT_BURST];
1243         uint16_t count;
1244         uint16_t i;
1245
1246         if (builtin_net_driver) {
1247                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1248                                         pkts, MAX_PKT_BURST);
1249         } else {
1250                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1251                                         mbuf_pool, pkts, MAX_PKT_BURST);
1252         }
1253
1254         /* setup VMDq for the first packet */
1255         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1256                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1257                         free_pkts(pkts, count);
1258         }
1259
1260         for (i = 0; i < count; ++i)
1261                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1262 }
1263
1264 /*
1265  * Main function of vhost-switch. It basically does:
1266  *
1267  * for each vhost device {
1268  *    - drain_eth_rx()
1269  *
1270  *      Which drains the host eth Rx queue linked to the vhost device,
1271  *      and delivers all of them to the guest virtio Rx ring associated with
1272  *      this vhost device.
1273  *
1274  *    - drain_virtio_tx()
1275  *
1276  *      Which drains the guest virtio Tx queue and delivers all of them
1277  *      to the target, which could be another vhost device, or the
1278  *      physical eth dev. The route is done in function "virtio_tx_route".
1279  * }
1280  */
1281 static int
1282 switch_worker(void *arg __rte_unused)
1283 {
1284         unsigned i;
1285         unsigned lcore_id = rte_lcore_id();
1286         struct vhost_dev *vdev;
1287         struct mbuf_table *tx_q;
1288
1289         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1290
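        /*
         * Use the Tx queue matching this worker's index in lcore_ids;
         * port_init() configured one physical Tx queue per lcore.
         */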
1291         tx_q = &lcore_tx_queue[lcore_id];
1292         for (i = 0; i < rte_lcore_count(); i++) {
1293                 if (lcore_ids[i] == lcore_id) {
1294                         tx_q->txq_id = i;
1295                         break;
1296                 }
1297         }
1298
1299         while(1) {
1300                 drain_mbuf_table(tx_q);
1301                 drain_vhost_table();
1302                 /*
1303                  * Inform the configuration core that we have exited the
1304                  * linked list and that no devices are in use if requested.
1305                  */
1306                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1307                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1308
1309                 /*
1310                  * Process vhost devices
1311                  */
1312                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1313                               lcore_vdev_entry) {
1314                         if (unlikely(vdev->remove)) {
1315                                 unlink_vmdq(vdev);
1316                                 vdev->ready = DEVICE_SAFE_REMOVE;
1317                                 continue;
1318                         }
1319
1320                         if (likely(vdev->ready == DEVICE_RX))
1321                                 drain_eth_rx(vdev);
1322
1323                         if (likely(!vdev->remove))
1324                                 drain_virtio_tx(vdev);
1325                 }
1326         }
1327
1328         return 0;
1329 }
1330
1331 /*
1332  * Remove a device from the specific data core linked list and from the
1333  * main linked list. Synchronization occurs through the use of the
1334  * lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
1335  * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
1336  */
1337 static void
1338 destroy_device(int vid)
1339 {
1340         struct vhost_dev *vdev = NULL;
1341         int lcore;
1342         uint16_t i;
1343
1344         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1345                 if (vdev->vid == vid)
1346                         break;
1347         }
1348         if (!vdev)
1349                 return;
1350         /*set the remove flag. */
1351         vdev->remove = 1;
1352         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1353                 rte_pause();
1354         }
1355
1356         for (i = 0; i < RTE_MAX_LCORE; i++)
1357                 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1358
1359         if (builtin_net_driver)
1360                 vs_vhost_net_remove(vdev);
1361
1362         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1363                      lcore_vdev_entry);
1364         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1365
1366
1367         /* Set the dev_removal_flag on each lcore. */
1368         RTE_LCORE_FOREACH_WORKER(lcore)
1369                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1370
1371         /*
1372          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1373          * we can be sure that they can no longer access the device removed
1374          * from the linked lists and that the devices are no longer in use.
1375          */
1376         RTE_LCORE_FOREACH_WORKER(lcore) {
1377                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1378                         rte_pause();
1379         }
1380
1381         lcore_info[vdev->coreid].device_num--;
1382
1383         RTE_LOG(INFO, VHOST_DATA,
1384                 "(%d) device has been removed from data core\n",
1385                 vdev->vid);
1386
1387         if (async_vhost_driver)
1388                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1389
1390         rte_free(vdev);
1391 }
1392
1393 /*
1394  * A new device is added to a data core. First the device is added to the main linked list
1395  * and then allocated to a specific data core.
1396  */
1397 static int
1398 new_device(int vid)
1399 {
1400         int lcore, core_add = 0;
1401         uint16_t i;
1402         uint32_t device_num_min = num_devices;
1403         struct vhost_dev *vdev;
1404         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1405         if (vdev == NULL) {
1406                 RTE_LOG(INFO, VHOST_DATA,
1407                         "(%d) couldn't allocate memory for vhost dev\n",
1408                         vid);
1409                 return -1;
1410         }
1411         vdev->vid = vid;
1412
1413         for (i = 0; i < RTE_MAX_LCORE; i++) {
1414                 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1415                         = rte_zmalloc("vhost bufftable",
1416                                 sizeof(struct vhost_bufftable),
1417                                 RTE_CACHE_LINE_SIZE);
1418
1419                 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1420                         RTE_LOG(INFO, VHOST_DATA,
1421                                 "(%d) couldn't allocate memory for vhost TX\n", vid);
                                /* Free what has been allocated so far to avoid leaking. */
                                while (i-- > 0)
                                        rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
                                rte_free(vdev);
1422                         return -1;
1423                 }
1424         }
1425
1426         if (builtin_net_driver)
1427                 vs_vhost_net_setup(vdev);
1428
1429         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
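        /*
         * Each vhost device is backed by its own VMDq pool on the physical
         * port; the RX queue assigned here is that pool's first queue
         * (vmdq_queue_base is the index of the first VMDq queue).
         */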
1430         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1431
1432         /* Reset ready flag. */
1433         vdev->ready = DEVICE_MAC_LEARNING;
1434         vdev->remove = 0;
1435
1436         /* Find a suitable lcore to add the device. */
1437         RTE_LCORE_FOREACH_WORKER(lcore) {
1438                 if (lcore_info[lcore].device_num < device_num_min) {
1439                         device_num_min = lcore_info[lcore].device_num;
1440                         core_add = lcore;
1441                 }
1442         }
1443         vdev->coreid = core_add;
1444
1445         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1446                           lcore_vdev_entry);
1447         lcore_info[vdev->coreid].device_num++;
1448
1449         /* Disable guest notifications (kicks); the data cores poll the rings. */
1450         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1451         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1452
1453         RTE_LOG(INFO, VHOST_DATA,
1454                 "(%d) device has been added to data core %d\n",
1455                 vid, vdev->coreid);
1456
1457         if (async_vhost_driver) {
1458                 struct rte_vhost_async_features f;
1459                 struct rte_vhost_async_channel_ops channel_ops;
1460
1461                 if (strncmp(dma_type, "ioat", 4) == 0) {
1462                         channel_ops.transfer_data = ioat_transfer_data_cb;
1463                         channel_ops.check_completed_copies =
1464                                 ioat_check_completed_copies_cb;
1465
1466                         f.async_inorder = 1;
1467                         f.async_threshold = 256;
1468
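                        /*
                         * rte_vhost_async_features is a union: the
                         * async_inorder/async_threshold bit fields set above
                         * alias intval, so f.intval passes the packed 32-bit
                         * feature word to the registration call below.
                         */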
1469                         return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1470                                 f.intval, &channel_ops);
1471                 }
1472         }
1473
1474         return 0;
1475 }
1476
1477 /*
1478  * These callbacks allow devices to be added to or removed from a data core
1479  * once their configuration has fully completed.
1480  */
1481 static const struct vhost_device_ops virtio_net_device_ops =
1482 {
1483         .new_device =  new_device,
1484         .destroy_device = destroy_device,
1485 };
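/*
 * The ops above are registered per vhost socket via
 * rte_vhost_driver_callback_register() in main() below.
 */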
1486
1487 /*
1488  * This thread wakes up periodically and prints the statistics if the user
1489  * has enabled them.
1490  */
1491 static void *
1492 print_stats(__rte_unused void *arg)
1493 {
1494         struct vhost_dev *vdev;
1495         uint64_t tx_dropped, rx_dropped;
1496         uint64_t tx, tx_total, rx, rx_total;
1497         const char clr[] = { 27, '[', '2', 'J', '\0' };
1498         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1499
1500         while (1) {
1501                 sleep(enable_stats);
1502
1503                 /* Clear screen and move to top left */
1504                 printf("%s%s\n", clr, top_left);
1505                 printf("Device statistics =================================\n");
1506
1507                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1508                         tx_total   = vdev->stats.tx_total;
1509                         tx         = vdev->stats.tx;
1510                         tx_dropped = tx_total - tx;
1511
1512                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1513                                 __ATOMIC_SEQ_CST);
1514                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1515                                 __ATOMIC_SEQ_CST);
1516                         rx_dropped = rx_total - rx;
1517
1518                         printf("Statistics for device %d\n"
1519                                 "-----------------------\n"
1520                                 "TX total:              %" PRIu64 "\n"
1521                                 "TX dropped:            %" PRIu64 "\n"
1522                                 "TX successful:         %" PRIu64 "\n"
1523                                 "RX total:              %" PRIu64 "\n"
1524                                 "RX dropped:            %" PRIu64 "\n"
1525                                 "RX successful:         %" PRIu64 "\n",
1526                                 vdev->vid,
1527                                 tx_total, tx_dropped, tx,
1528                                 rx_total, rx_dropped, rx);
1529                 }
1530
1531                 printf("===================================================\n");
1532
1533                 fflush(stdout);
1534         }
1535
1536         return NULL;
1537 }
1538
1539 static void
1540 unregister_drivers(int socket_num)
1541 {
1542         int i, ret;
1543
1544         for (i = 0; i < socket_num; i++) {
1545                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1546                 if (ret != 0)
1547                         RTE_LOG(ERR, VHOST_CONFIG,
1548                                 "Failed to unregister vhost driver for %s.\n",
1549                                 socket_files + i * PATH_MAX);
1550         }
1551 }
1552
1553 /* When we receive a SIGINT, unregister the vhost driver. */
1554 static void
1555 sigint_handler(__rte_unused int signum)
1556 {
1557         /* Unregister vhost driver. */
1558         unregister_drivers(nb_sockets);
1559
1560         exit(0);
1561 }
1562
1563 /*
1564  * While creating an mbuf pool, one key thing is to figure out how
1565  * many mbuf entries are enough for our use. FYI, here are some
1566  * guidelines:
1567  *
1568  * - Each rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1569  *
1570  * - For each switch core (a CPU core that does the packet switching), we
1571  *   also need to reserve some mbufs for receiving the packets from the
1572  *   virtio Tx queue. How many are enough depends on the usage. It's
1573  *   normally a simple calculation like the following:
1574  *
1575  *       MAX_PKT_BURST * max packet size / mbuf size
1576  *
1577  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1578  *
1579  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1580  *   mbufs for receiving the packets from the physical NIC device.
1581  *
1582  * - We also need to make sure that, for each switch core, we have
1583  *   allocated enough mbufs to fill up the mbuf cache.
1584  */
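/*
 * Worked example (editorial sketch; assumes the defaults used in this file:
 * mbuf_size = MBUF_DATA_SIZE = 2048 + 128 bytes of headroom = 2176,
 * MAX_PKT_BURST = 32, nr_rx_desc = 1024, nr_mbuf_cache = 128, and
 * mergeable/TSO off, so mtu = 1500):
 *
 *     per-core burst reservation = (1500 + 2176) * 32 / (2176 - 128)
 *                                = 117632 / 2048 ~= 57 mbufs
 *     per-core total             = 57 + 1024 = 1081 mbufs
 *                                  (already larger than the 128-entry cache)
 *
 * With one port, MAX_QUEUES = 128 queues and two switch cores, the pool
 * then holds 128 * 1024 + 2 * 1081 = 133234 mbufs.
 */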
1585 static void
1586 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1587         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1588 {
1589         uint32_t nr_mbufs;
1590         uint32_t nr_mbufs_per_core;
1591         uint32_t mtu = 1500;
1592
1593         if (mergeable)
1594                 mtu = 9000;
1595         if (enable_tso)
1596                 mtu = 64 * 1024;
1597
1598         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1599                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1600         nr_mbufs_per_core += nr_rx_desc;
1601         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1602
1603         nr_mbufs  = nr_queues * nr_rx_desc;
1604         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1605         nr_mbufs *= nr_port;
1606
1607         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1608                                             nr_mbuf_cache, 0, mbuf_size,
1609                                             rte_socket_id());
1610         if (mbuf_pool == NULL)
1611                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1612 }
1613
1614 /*
1615  * Main function, does initialisation and calls the per-lcore functions.
1616  */
1617 int
1618 main(int argc, char *argv[])
1619 {
1620         unsigned lcore_id, core_id = 0;
1621         unsigned nb_ports, valid_num_ports;
1622         int ret, i;
1623         uint16_t portid;
1624         static pthread_t tid;
1625         uint64_t flags = 0;
1626
1627         signal(SIGINT, sigint_handler);
1628
1629         /* init EAL */
1630         ret = rte_eal_init(argc, argv);
1631         if (ret < 0)
1632                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1633         argc -= ret;
1634         argv += ret;
1635
1636         /* parse app arguments */
1637         ret = us_vhost_parse_args(argc, argv);
1638         if (ret < 0)
1639                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1640
1641         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1642                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1643
1644                 if (rte_lcore_is_enabled(lcore_id))
1645                         lcore_ids[core_id++] = lcore_id;
1646         }
1647
1648         if (rte_lcore_count() > RTE_MAX_LCORE)
1649                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1650
1651         /* Get the number of physical ports. */
1652         nb_ports = rte_eth_dev_count_avail();
1653
1654         /*
1655          * Update the global variable num_ports and the global array ports,
1656          * and derive valid_num_ports from the number of ports in the system.
1657          */
1658         valid_num_ports = check_ports_num(nb_ports);
1659
1660         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1661                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1662                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1663                 return -1;
1664         }
1665
1666         /*
1667          * FIXME: here we are trying to allocate enough mbufs for @MAX_QUEUES
1668          * queues, but in truth we are never going to use that many queues.
1669          * We probably should only do the allocation for those queues we are
1670          * actually going to use.
1671          */
1672         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1673                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1674
1675         if (vm2vm_mode == VM2VM_HARDWARE) {
1676                 /* Enable VT loop back to let L2 switch to do it. */
1677                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1678                 RTE_LOG(DEBUG, VHOST_CONFIG,
1679                         "Enable loop back for L2 switch in vmdq.\n");
1680         }
1681
1682         /* initialize all ports */
1683         RTE_ETH_FOREACH_DEV(portid) {
1684                 /* skip ports that are not enabled */
1685                 if ((enabled_port_mask & (1 << portid)) == 0) {
1686                         RTE_LOG(INFO, VHOST_PORT,
1687                                 "Skipping disabled port %d\n", portid);
1688                         continue;
1689                 }
1690                 if (port_init(portid) != 0)
1691                         rte_exit(EXIT_FAILURE,
1692                                 "Cannot initialize network ports\n");
1693         }
1694
1695         /* Enable stats if the user option is set. */
1696         if (enable_stats) {
1697                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1698                                         print_stats, NULL);
1699                 if (ret < 0)
1700                         rte_exit(EXIT_FAILURE,
1701                                 "Cannot create print-stats thread\n");
1702         }
1703
1704         /* Launch all data cores. */
1705         RTE_LCORE_FOREACH_WORKER(lcore_id)
1706                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1707
1708         if (client_mode)
1709                 flags |= RTE_VHOST_USER_CLIENT;
1710
1711         /* Register vhost user driver to handle vhost messages. */
1712         for (i = 0; i < nb_sockets; i++) {
1713                 char *file = socket_files + i * PATH_MAX;
1714
1715                 if (async_vhost_driver)
1716                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1717
1718                 ret = rte_vhost_driver_register(file, flags);
1719                 if (ret != 0) {
1720                         unregister_drivers(i);
1721                         rte_exit(EXIT_FAILURE,
1722                                 "vhost driver register failure.\n");
1723                 }
1724
1725                 if (builtin_net_driver)
1726                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1727
1728                 if (mergeable == 0) {
1729                         rte_vhost_driver_disable_features(file,
1730                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1731                 }
1732
1733                 if (enable_tx_csum == 0) {
1734                         rte_vhost_driver_disable_features(file,
1735                                 1ULL << VIRTIO_NET_F_CSUM);
1736                 }
1737
1738                 if (enable_tso == 0) {
1739                         rte_vhost_driver_disable_features(file,
1740                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1741                         rte_vhost_driver_disable_features(file,
1742                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1743                         rte_vhost_driver_disable_features(file,
1744                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1745                         rte_vhost_driver_disable_features(file,
1746                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1747                 }
1748
1749                 if (promiscuous) {
1750                         rte_vhost_driver_enable_features(file,
1751                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1752                 }
1753
1754                 ret = rte_vhost_driver_callback_register(file,
1755                         &virtio_net_device_ops);
1756                 if (ret != 0) {
1757                         rte_exit(EXIT_FAILURE,
1758                                 "failed to register vhost driver callbacks.\n");
1759                 }
1760
1761                 if (rte_vhost_driver_start(file) < 0) {
1762                         rte_exit(EXIT_FAILURE,
1763                                 "failed to start vhost driver.\n");
1764                 }
1765         }
1766
1767         RTE_LCORE_FOREACH_WORKER(lcore_id)
1768                 rte_eal_wait_lcore(lcore_id);
1769
1770         return 0;
1771
1772 }