dpdk.git: examples/vhost/main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
29
30 #include "main.h"
31
32 #ifndef MAX_QUEUES
33 #define MAX_QUEUES 128
34 #endif
35
36 /* the maximum number of external ports supported */
37 #define MAX_SUP_PORTS 1
38
39 #define MBUF_CACHE_SIZE 128
40 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
41
42 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
43
44 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
45 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
46
47 #define JUMBO_FRAME_MAX_SIZE    0x2600
48 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
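/*
 * Derivation: JUMBO_FRAME_MAX_SIZE is 0x2600 (9728 bytes), so MAX_MTU is
 * 9728 - (14-byte Ethernet header + 4-byte CRC) = 9710 bytes.
 */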
49
50 /* State of virtio device. */
51 #define DEVICE_MAC_LEARNING 0
52 #define DEVICE_RX                       1
53 #define DEVICE_SAFE_REMOVE      2
54
55 /* Configurable number of RX/TX ring descriptors */
56 #define RTE_TEST_RX_DESC_DEFAULT 1024
57 #define RTE_TEST_TX_DESC_DEFAULT 512
58
59 #define INVALID_PORT_ID 0xFF
60 #define INVALID_DMA_ID -1
61
62 #define DMA_RING_SIZE 4096
63
64 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
65 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
66 static int dma_count;
67
68 /* mask of enabled ports */
69 static uint32_t enabled_port_mask = 0;
70
71 /* Promiscuous mode */
72 static uint32_t promiscuous;
73
74 /* number of devices/queues to support */
75 static uint32_t num_queues = 0;
76 static uint32_t num_devices;
77
78 static struct rte_mempool *mbuf_pool;
79 static int mergeable;
80
81 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
82 typedef enum {
83         VM2VM_DISABLED = 0,
84         VM2VM_SOFTWARE = 1,
85         VM2VM_HARDWARE = 2,
86         VM2VM_LAST
87 } vm2vm_type;
88 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
89
90 /* Enable stats. */
91 static uint32_t enable_stats = 0;
92 /* Enable retries on RX. */
93 static uint32_t enable_retry = 1;
94
95 /* Disable TX checksum offload */
96 static uint32_t enable_tx_csum;
97
98 /* Disable TSO offload */
99 static uint32_t enable_tso;
100
101 static int client_mode;
102
103 static int builtin_net_driver;
104
105 /* Specify timeout (in useconds) between retries on RX. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
109
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
113
114 /* empty VMDq configuration structure. Filled in programmatically */
115 static struct rte_eth_conf vmdq_conf_default = {
116         .rxmode = {
117                 .mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
118                 .split_hdr_size = 0,
119                 /*
120                  * VLAN strip is necessary for 1G NICs such as the I350;
121                  * it fixes a bug where IPv4 forwarding in the guest cannot
122                  * forward packets from one virtio device to another.
123                  */
124                 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
125         },
126
127         .txmode = {
128                 .mq_mode = RTE_ETH_MQ_TX_NONE,
129                 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
130                              RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
131                              RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
132                              RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
133                              RTE_ETH_TX_OFFLOAD_TCP_TSO),
134         },
135         .rx_adv_conf = {
136                 /*
137                  * should be overridden separately in code with
138                  * appropriate values
139                  */
140                 .vmdq_rx_conf = {
141                         .nb_queue_pools = RTE_ETH_8_POOLS,
142                         .enable_default_pool = 0,
143                         .default_pool = 0,
144                         .nb_pool_maps = 0,
145                         .pool_map = {{0, 0},},
146                 },
147         },
148 };
149
150
151 static unsigned lcore_ids[RTE_MAX_LCORE];
152 static uint16_t ports[RTE_MAX_ETHPORTS];
153 static unsigned num_ports = 0; /**< The number of ports specified in command line */
154 static uint16_t num_pf_queues, num_vmdq_queues;
155 static uint16_t vmdq_pool_base, vmdq_queue_base;
156 static uint16_t queues_per_pool;
157
158 const uint16_t vlan_tags[] = {
159         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
160         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
161         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
162         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
163         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
164         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
165         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
166         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
167 };
168
169 /* ethernet addresses of ports */
170 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
171
172 static struct vhost_dev_tailq_list vhost_dev_list =
173         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
174
175 static struct lcore_info lcore_info[RTE_MAX_LCORE];
176
177 /* Used for queueing bursts of TX packets. */
178 struct mbuf_table {
179         unsigned len;
180         unsigned txq_id;
181         struct rte_mbuf *m_table[MAX_PKT_BURST];
182 };
183
184 struct vhost_bufftable {
185         uint32_t len;
186         uint64_t pre_tsc;
187         struct rte_mbuf *m_table[MAX_PKT_BURST];
188 };
189
190 /* TX queue for each data core. */
191 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
192
193 /*
194  * Vhost TX buffer for each data core.
195  * Every data core maintains a TX buffer for every vhost device,
196  * which is used for batch pkts enqueue for higher performance.
197  */
198 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
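/*
 * The buffer for a given (lcore, vhost device) pair lives at index
 * lcore_id * RTE_MAX_VHOST_DEVICE + vid; see drain_vhost() and
 * virtio_tx_local() below for the lookups.
 */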
199
200 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
201                                  / US_PER_S * BURST_TX_DRAIN_US)
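/*
 * The macro above converts BURST_TX_DRAIN_US into TSC cycles:
 * (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S is the cycles-per-microsecond
 * count rounded up, scaled by the ~100 us drain interval.
 */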
202
203 static inline bool
204 is_dma_configured(int16_t dev_id)
205 {
206         int i;
207
208         for (i = 0; i < dma_count; i++)
209                 if (dmas_id[i] == dev_id)
210                         return true;
211         return false;
212 }
213
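/*
 * open_dma() below parses the --dmas argument. Each entry has the form
 * txd<vid>@<dmadev-name>, with entries comma-separated inside brackets, e.g.
 * (hypothetical device names): --dmas [txd0@0000:00:04.0,txd1@0000:00:04.1].
 * Only the enqueue (VIRTIO_RXQ) direction is bound to a DMA channel here.
 */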
214 static inline int
215 open_dma(const char *value)
216 {
217         struct dma_for_vhost *dma_info = dma_bind;
218         char *input = strndup(value, strlen(value) + 1);
219         char *addrs = input;
220         char *ptrs[2];
221         char *start, *end, *substr;
222         int64_t vid;
223
224         struct rte_dma_info info;
225         struct rte_dma_conf dev_config = { .nb_vchans = 1 };
226         struct rte_dma_vchan_conf qconf = {
227                 .direction = RTE_DMA_DIR_MEM_TO_MEM,
228                 .nb_desc = DMA_RING_SIZE
229         };
230
231         int dev_id;
232         int ret = 0;
233         uint16_t i = 0;
234         char *dma_arg[RTE_MAX_VHOST_DEVICE];
235         int args_nr;
236
237         while (isblank(*addrs))
238                 addrs++;
239         if (*addrs == '\0') {
240                 ret = -1;
241                 goto out;
242         }
243
244         /* process DMA devices within bracket. */
245         addrs++;
246         substr = strtok(addrs, ";]");
247         if (!substr) {
248                 ret = -1;
249                 goto out;
250         }
251
252         args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
253         if (args_nr <= 0) {
254                 ret = -1;
255                 goto out;
256         }
257
258         while (i < args_nr) {
259                 char *arg_temp = dma_arg[i];
260                 uint8_t sub_nr;
261
262                 sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
263                 if (sub_nr != 2) {
264                         ret = -1;
265                         goto out;
266                 }
267
268                 start = strstr(ptrs[0], "txd");
269                 if (start == NULL) {
270                         ret = -1;
271                         goto out;
272                 }
273
274                 start += 3;
275                 vid = strtol(start, &end, 0);
276                 if (end == start) {
277                         ret = -1;
278                         goto out;
279                 }
280
281                 dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
282                 if (dev_id < 0) {
283                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to find DMA %s.\n", ptrs[1]);
284                         ret = -1;
285                         goto out;
286                 }
287
288                 /* DMA device is already configured, so skip */
289                 if (is_dma_configured(dev_id))
290                         goto done;
291
292                 if (rte_dma_info_get(dev_id, &info) != 0) {
293                         RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
294                         ret = -1;
295                         goto out;
296                 }
297
298                 if (info.max_vchans < 1) {
299                         RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
300                         ret = -1;
301                         goto out;
302                 }
303
304                 if (rte_dma_configure(dev_id, &dev_config) != 0) {
305                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to configure DMA %d.\n", dev_id);
306                         ret = -1;
307                         goto out;
308                 }
309
310                 /* Check the max desc supported by DMA device */
311                 rte_dma_info_get(dev_id, &info);
312                 if (info.nb_vchans != 1) {
313                         RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
314                                         dev_id);
315                         ret = -1;
316                         goto out;
317                 }
318
319                 qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
320
321                 if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
322                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to set up DMA %d.\n", dev_id);
323                         ret = -1;
324                         goto out;
325                 }
326
327                 if (rte_dma_start(dev_id) != 0) {
328                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to start DMA %d.\n", dev_id);
329                         ret = -1;
330                         goto out;
331                 }
332
333                 dmas_id[dma_count++] = dev_id;
334
335 done:
336                 (dma_info + vid)->dmas[VIRTIO_RXQ].dev_id = dev_id;
337                 i++;
338         }
339 out:
340         free(input);
341         return ret;
342 }
343
344 /*
345  * Builds up the correct configuration for VMDQ VLAN pool map
346  * according to the pool & queue limits.
347  */
348 static inline int
349 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
350 {
351         struct rte_eth_vmdq_rx_conf conf;
352         struct rte_eth_vmdq_rx_conf *def_conf =
353                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
354         unsigned i;
355
356         memset(&conf, 0, sizeof(conf));
357         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
358         conf.nb_pool_maps = num_devices;
359         conf.enable_loop_back = def_conf->enable_loop_back;
360         conf.rx_mode = def_conf->rx_mode;
361
362         for (i = 0; i < conf.nb_pool_maps; i++) {
363                 conf.pool_map[i].vlan_id = vlan_tags[i];
364                 conf.pool_map[i].pools = (1UL << i);
365         }
366
367         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
368         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
369                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
370         return 0;
371 }
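/*
 * For example, with num_devices = 8 the loop above maps pool 0 to VLAN 1000,
 * pool 1 to VLAN 1001, ..., pool 7 to VLAN 1007, and each pool_map entry sets
 * exactly one pool bit (1UL << i), so every vhost device gets its own VMDQ pool.
 */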
372
373 /*
374  * Initialises a given port using global settings, with the RX buffers
375  * coming from the global mbuf_pool.
376  */
377 static inline int
378 port_init(uint16_t port)
379 {
380         struct rte_eth_dev_info dev_info;
381         struct rte_eth_conf port_conf;
382         struct rte_eth_rxconf *rxconf;
383         struct rte_eth_txconf *txconf;
384         int16_t rx_rings, tx_rings;
385         uint16_t rx_ring_size, tx_ring_size;
386         int retval;
387         uint16_t q;
388
389         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
390         retval = rte_eth_dev_info_get(port, &dev_info);
391         if (retval != 0) {
392                 RTE_LOG(ERR, VHOST_PORT,
393                         "Error during getting device (port %u) info: %s\n",
394                         port, strerror(-retval));
395
396                 return retval;
397         }
398
399         rxconf = &dev_info.default_rxconf;
400         txconf = &dev_info.default_txconf;
401         rxconf->rx_drop_en = 1;
402
403         /* configure the number of supported virtio devices based on VMDQ limits */
404         num_devices = dev_info.max_vmdq_pools;
405
406         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
407         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
408
409         tx_rings = (uint16_t)rte_lcore_count();
410
411         if (mergeable) {
412                 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
413                         vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
414                 else
415                         vmdq_conf_default.rxmode.mtu = MAX_MTU;
416         }
417
418         /* Get port configuration. */
419         retval = get_eth_conf(&port_conf, num_devices);
420         if (retval < 0)
421                 return retval;
422         /* NIC queues are divided into pf queues and vmdq queues.  */
423         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
424         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
425         num_vmdq_queues = num_devices * queues_per_pool;
426         num_queues = num_pf_queues + num_vmdq_queues;
427         vmdq_queue_base = dev_info.vmdq_queue_base;
428         vmdq_pool_base  = dev_info.vmdq_pool_base;
429         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
430                 num_pf_queues, num_devices, queues_per_pool);
431
432         if (!rte_eth_dev_is_valid_port(port))
433                 return -1;
434
435         rx_rings = (uint16_t)dev_info.max_rx_queues;
436         if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
437                 port_conf.txmode.offloads |=
438                         RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
439         /* Configure ethernet device. */
440         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
441         if (retval != 0) {
442                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
443                         port, strerror(-retval));
444                 return retval;
445         }
446
447         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
448                 &tx_ring_size);
449         if (retval != 0) {
450                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
451                         "for port %u: %s.\n", port, strerror(-retval));
452                 return retval;
453         }
454         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
455                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
456                         "for Rx queues on port %u.\n", port);
457                 return -1;
458         }
459
460         /* Setup the queues. */
461         rxconf->offloads = port_conf.rxmode.offloads;
462         for (q = 0; q < rx_rings; q ++) {
463                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
464                                                 rte_eth_dev_socket_id(port),
465                                                 rxconf,
466                                                 mbuf_pool);
467                 if (retval < 0) {
468                         RTE_LOG(ERR, VHOST_PORT,
469                                 "Failed to setup rx queue %u of port %u: %s.\n",
470                                 q, port, strerror(-retval));
471                         return retval;
472                 }
473         }
474         txconf->offloads = port_conf.txmode.offloads;
475         for (q = 0; q < tx_rings; q ++) {
476                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
477                                                 rte_eth_dev_socket_id(port),
478                                                 txconf);
479                 if (retval < 0) {
480                         RTE_LOG(ERR, VHOST_PORT,
481                                 "Failed to setup tx queue %u of port %u: %s.\n",
482                                 q, port, strerror(-retval));
483                         return retval;
484                 }
485         }
486
487         /* Start the device. */
488         retval  = rte_eth_dev_start(port);
489         if (retval < 0) {
490                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
491                         port, strerror(-retval));
492                 return retval;
493         }
494
495         if (promiscuous) {
496                 retval = rte_eth_promiscuous_enable(port);
497                 if (retval != 0) {
498                         RTE_LOG(ERR, VHOST_PORT,
499                                 "Failed to enable promiscuous mode on port %u: %s\n",
500                                 port, rte_strerror(-retval));
501                         return retval;
502                 }
503         }
504
505         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
506         if (retval < 0) {
507                 RTE_LOG(ERR, VHOST_PORT,
508                         "Failed to get MAC address on port %u: %s\n",
509                         port, rte_strerror(-retval));
510                 return retval;
511         }
512
513         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
514         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
515                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
516                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
517
518         return 0;
519 }
520
521 /*
522  * Set socket file path.
523  */
524 static int
525 us_vhost_parse_socket_path(const char *q_arg)
526 {
527         char *old;
528
529         /* reject paths that would not fit in a PATH_MAX buffer */
530         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
531                 return -1;
532
533         old = socket_files;
534         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
535         if (socket_files == NULL) {
536                 free(old);
537                 return -1;
538         }
539
540         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
541         nb_sockets++;
542
543         return 0;
544 }
545
546 /*
547  * Parse the portmask provided at run time.
548  */
549 static int
550 parse_portmask(const char *portmask)
551 {
552         char *end = NULL;
553         unsigned long pm;
554
555         errno = 0;
556
557         /* parse hexadecimal string */
558         pm = strtoul(portmask, &end, 16);
559         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
560                 return 0;
561
562         return pm;
563
564 }
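/*
 * Example: "-p 0x1" enables port 0 only. A mask selecting more ports (e.g.
 * "-p 0x3") is parsed the same way, but us_vhost_parse_args() rejects more
 * than MAX_SUP_PORTS (1) enabled ports.
 */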
565
566 /*
567  * Parse num options at run time.
568  */
569 static int
570 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
571 {
572         char *end = NULL;
573         unsigned long num;
574
575         errno = 0;
576
577         /* parse unsigned int string */
578         num = strtoul(q_arg, &end, 10);
579         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
580                 return -1;
581
582         if (num > max_valid_value)
583                 return -1;
584
585         return num;
586
587 }
588
589 /*
590  * Display usage
591  */
592 static void
593 us_vhost_usage(const char *prgname)
594 {
595         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
596         "               --vm2vm [0|1|2]\n"
597         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
598         "               --socket-file <path>\n"
599         "               --nb-devices ND\n"
600         "               -p PORTMASK: Set mask for ports to be used by application\n"
601         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
602         "               --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
603         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Only takes effect if Rx retries are enabled\n"
604         "               --rx-retry-num [0-N]: the number of retries on Rx. Only takes effect if Rx retries are enabled\n"
605         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
606         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
607         "               --socket-file: The path of the socket file.\n"
608         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
609         "               --tso [0|1] disable/enable TCP segmentation offload.\n"
610         "               --client register the vhost-user socket in client mode.\n"
611         "               --dmas register DMA channels for specific vhost devices.\n",
613                prgname);
614 }
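/*
 * A typical invocation, assuming a meson build where the example is installed
 * as "dpdk-vhost" (binary name, core list and socket path are illustrative):
 *
 *   dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --client --mergeable 1 --stats 2
 */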
615
616 enum {
617 #define OPT_VM2VM               "vm2vm"
618         OPT_VM2VM_NUM = 256,
619 #define OPT_RX_RETRY            "rx-retry"
620         OPT_RX_RETRY_NUM,
621 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
622         OPT_RX_RETRY_DELAY_NUM,
623 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
624         OPT_RX_RETRY_NUMB_NUM,
625 #define OPT_MERGEABLE           "mergeable"
626         OPT_MERGEABLE_NUM,
627 #define OPT_STATS               "stats"
628         OPT_STATS_NUM,
629 #define OPT_SOCKET_FILE         "socket-file"
630         OPT_SOCKET_FILE_NUM,
631 #define OPT_TX_CSUM             "tx-csum"
632         OPT_TX_CSUM_NUM,
633 #define OPT_TSO                 "tso"
634         OPT_TSO_NUM,
635 #define OPT_CLIENT              "client"
636         OPT_CLIENT_NUM,
637 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
638         OPT_BUILTIN_NET_DRIVER_NUM,
639 #define OPT_DMAS                "dmas"
640         OPT_DMAS_NUM,
641 };
642
643 /*
644  * Parse the arguments given in the command line of the application.
645  */
646 static int
647 us_vhost_parse_args(int argc, char **argv)
648 {
649         int opt, ret;
650         int option_index;
651         unsigned i;
652         const char *prgname = argv[0];
653         static struct option long_option[] = {
654                 {OPT_VM2VM, required_argument,
655                                 NULL, OPT_VM2VM_NUM},
656                 {OPT_RX_RETRY, required_argument,
657                                 NULL, OPT_RX_RETRY_NUM},
658                 {OPT_RX_RETRY_DELAY, required_argument,
659                                 NULL, OPT_RX_RETRY_DELAY_NUM},
660                 {OPT_RX_RETRY_NUMB, required_argument,
661                                 NULL, OPT_RX_RETRY_NUMB_NUM},
662                 {OPT_MERGEABLE, required_argument,
663                                 NULL, OPT_MERGEABLE_NUM},
664                 {OPT_STATS, required_argument,
665                                 NULL, OPT_STATS_NUM},
666                 {OPT_SOCKET_FILE, required_argument,
667                                 NULL, OPT_SOCKET_FILE_NUM},
668                 {OPT_TX_CSUM, required_argument,
669                                 NULL, OPT_TX_CSUM_NUM},
670                 {OPT_TSO, required_argument,
671                                 NULL, OPT_TSO_NUM},
672                 {OPT_CLIENT, no_argument,
673                                 NULL, OPT_CLIENT_NUM},
674                 {OPT_BUILTIN_NET_DRIVER, no_argument,
675                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
676                 {OPT_DMAS, required_argument,
677                                 NULL, OPT_DMAS_NUM},
678                 {NULL, 0, 0, 0},
679         };
680
681         /* Parse command line */
682         while ((opt = getopt_long(argc, argv, "p:P",
683                         long_option, &option_index)) != EOF) {
684                 switch (opt) {
685                 /* Portmask */
686                 case 'p':
687                         enabled_port_mask = parse_portmask(optarg);
688                         if (enabled_port_mask == 0) {
689                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
690                                 us_vhost_usage(prgname);
691                                 return -1;
692                         }
693                         break;
694
695                 case 'P':
696                         promiscuous = 1;
697                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
698                                 RTE_ETH_VMDQ_ACCEPT_BROADCAST |
699                                 RTE_ETH_VMDQ_ACCEPT_MULTICAST;
700                         break;
701
702                 case OPT_VM2VM_NUM:
703                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
704                         if (ret == -1) {
705                                 RTE_LOG(INFO, VHOST_CONFIG,
706                                         "Invalid argument for "
707                                         "vm2vm [0|1|2]\n");
708                                 us_vhost_usage(prgname);
709                                 return -1;
710                         }
711                         vm2vm_mode = (vm2vm_type)ret;
712                         break;
713
714                 case OPT_RX_RETRY_NUM:
715                         ret = parse_num_opt(optarg, 1);
716                         if (ret == -1) {
717                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
718                                 us_vhost_usage(prgname);
719                                 return -1;
720                         }
721                         enable_retry = ret;
722                         break;
723
724                 case OPT_TX_CSUM_NUM:
725                         ret = parse_num_opt(optarg, 1);
726                         if (ret == -1) {
727                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
728                                 us_vhost_usage(prgname);
729                                 return -1;
730                         }
731                         enable_tx_csum = ret;
732                         break;
733
734                 case OPT_TSO_NUM:
735                         ret = parse_num_opt(optarg, 1);
736                         if (ret == -1) {
737                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
738                                 us_vhost_usage(prgname);
739                                 return -1;
740                         }
741                         enable_tso = ret;
742                         break;
743
744                 case OPT_RX_RETRY_DELAY_NUM:
745                         ret = parse_num_opt(optarg, INT32_MAX);
746                         if (ret == -1) {
747                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
748                                 us_vhost_usage(prgname);
749                                 return -1;
750                         }
751                         burst_rx_delay_time = ret;
752                         break;
753
754                 case OPT_RX_RETRY_NUMB_NUM:
755                         ret = parse_num_opt(optarg, INT32_MAX);
756                         if (ret == -1) {
757                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
758                                 us_vhost_usage(prgname);
759                                 return -1;
760                         }
761                         burst_rx_retry_num = ret;
762                         break;
763
764                 case OPT_MERGEABLE_NUM:
765                         ret = parse_num_opt(optarg, 1);
766                         if (ret == -1) {
767                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
768                                 us_vhost_usage(prgname);
769                                 return -1;
770                         }
771                         mergeable = !!ret;
772                         break;
773
774                 case OPT_STATS_NUM:
775                         ret = parse_num_opt(optarg, INT32_MAX);
776                         if (ret == -1) {
777                                 RTE_LOG(INFO, VHOST_CONFIG,
778                                         "Invalid argument for stats [0..N]\n");
779                                 us_vhost_usage(prgname);
780                                 return -1;
781                         }
782                         enable_stats = ret;
783                         break;
784
785                 /* Set socket file path. */
786                 case OPT_SOCKET_FILE_NUM:
787                         if (us_vhost_parse_socket_path(optarg) == -1) {
788                                 RTE_LOG(INFO, VHOST_CONFIG,
789                                 "Invalid argument for socket name (Max %d characters)\n",
790                                 PATH_MAX);
791                                 us_vhost_usage(prgname);
792                                 return -1;
793                         }
794                         break;
795
796                 case OPT_DMAS_NUM:
797                         if (open_dma(optarg) == -1) {
798                                 RTE_LOG(INFO, VHOST_CONFIG,
799                                         "Wrong DMA args\n");
800                                 us_vhost_usage(prgname);
801                                 return -1;
802                         }
803                         break;
804
805                 case OPT_CLIENT_NUM:
806                         client_mode = 1;
807                         break;
808
809                 case OPT_BUILTIN_NET_DRIVER_NUM:
810                         builtin_net_driver = 1;
811                         break;
812
813                 /* Invalid option - print options. */
814                 default:
815                         us_vhost_usage(prgname);
816                         return -1;
817                 }
818         }
819
820         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
821                 if (enabled_port_mask & (1 << i))
822                         ports[num_ports++] = i;
823         }
824
825         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
826                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
827                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
828                 return -1;
829         }
830
831         return 0;
832 }
833
834 /*
835  * Update the global variable num_ports and the ports array according to the
836  * number of ports on the system, and return the number of valid ports.
837  */
838 static unsigned check_ports_num(unsigned nb_ports)
839 {
840         unsigned valid_num_ports = num_ports;
841         unsigned portid;
842
843         if (num_ports > nb_ports) {
844                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
845                         num_ports, nb_ports);
846                 num_ports = nb_ports;
847         }
848
849         for (portid = 0; portid < num_ports; portid ++) {
850                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
851                         RTE_LOG(INFO, VHOST_PORT,
852                                 "\nSpecified port ID(%u) is not valid\n",
853                                 ports[portid]);
854                         ports[portid] = INVALID_PORT_ID;
855                         valid_num_ports--;
856                 }
857         }
858         return valid_num_ports;
859 }
860
861 static __rte_always_inline struct vhost_dev *
862 find_vhost_dev(struct rte_ether_addr *mac)
863 {
864         struct vhost_dev *vdev;
865
866         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
867                 if (vdev->ready == DEVICE_RX &&
868                     rte_is_same_ether_addr(mac, &vdev->mac_address))
869                         return vdev;
870         }
871
872         return NULL;
873 }
874
875 /*
876  * This function learns the MAC address of the device and registers it, along
877  * with a VLAN tag, with the VMDQ pool.
878  */
879 static int
880 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
881 {
882         struct rte_ether_hdr *pkt_hdr;
883         int i, ret;
884
885         /* Learn MAC address of guest device from packet */
886         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
887
888         if (find_vhost_dev(&pkt_hdr->src_addr)) {
889                 RTE_LOG(ERR, VHOST_DATA,
890                         "(%d) device is using a registered MAC!\n",
891                         vdev->vid);
892                 return -1;
893         }
894
895         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
896                 vdev->mac_address.addr_bytes[i] =
897                         pkt_hdr->src_addr.addr_bytes[i];
898
899         /* vlan_tag currently uses the device_id. */
900         vdev->vlan_tag = vlan_tags[vdev->vid];
901
902         /* Print out VMDQ registration info. */
903         RTE_LOG(INFO, VHOST_DATA,
904                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
905                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
906                 vdev->vlan_tag);
907
908         /* Register the MAC address. */
909         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
910                                 (uint32_t)vdev->vid + vmdq_pool_base);
911         if (ret)
912                 RTE_LOG(ERR, VHOST_DATA,
913                         "(%d) failed to add device MAC address to VMDQ\n",
914                         vdev->vid);
915
916         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
917
918         /* Set device as ready for RX. */
919         vdev->ready = DEVICE_RX;
920
921         return 0;
922 }
923
924 /*
925  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
926  * queue before disabling RX on the device.
927  */
928 static inline void
929 unlink_vmdq(struct vhost_dev *vdev)
930 {
931         unsigned i = 0;
932         unsigned rx_count;
933         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
934
935         if (vdev->ready == DEVICE_RX) {
936                 /* Clear MAC and VLAN settings. */
937                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
938                 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
939                         vdev->mac_address.addr_bytes[i] = 0;
940
941                 vdev->vlan_tag = 0;
942
943                 /*Clear out the receive buffers*/
944                 rx_count = rte_eth_rx_burst(ports[0],
945                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
946
947                 while (rx_count) {
948                         for (i = 0; i < rx_count; i++)
949                                 rte_pktmbuf_free(pkts_burst[i]);
950
951                         rx_count = rte_eth_rx_burst(ports[0],
952                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
953                 }
954
955                 vdev->ready = DEVICE_MAC_LEARNING;
956         }
957 }
958
959 static inline void
960 free_pkts(struct rte_mbuf **pkts, uint16_t n)
961 {
962         while (n--)
963                 rte_pktmbuf_free(pkts[n]);
964 }
965
966 static __rte_always_inline void
967 complete_async_pkts(struct vhost_dev *vdev)
968 {
969         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
970         uint16_t complete_count;
971         int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
972
973         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
974                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
975         if (complete_count) {
976                 free_pkts(p_cpl, complete_count);
977                 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
978         }
979
980 }
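/*
 * complete_async_pkts() above polls for enqueue copies that the DMA engine
 * has finished, frees the completed mbufs and decrements pkts_inflight,
 * which the device removal path (destroy_device()) relies on to drain safely.
 */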
981
982 static __rte_always_inline void
983 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
984             struct rte_mbuf *m)
985 {
986         uint16_t ret;
987
988         if (builtin_net_driver) {
989                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
990         } else {
991                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
992         }
993
994         if (enable_stats) {
995                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
996                                 __ATOMIC_SEQ_CST);
997                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
998                                 __ATOMIC_SEQ_CST);
999                 src_vdev->stats.tx_total++;
1000                 src_vdev->stats.tx += ret;
1001         }
1002 }
1003
1004 static __rte_always_inline void
1005 drain_vhost(struct vhost_dev *vdev)
1006 {
1007         uint16_t ret;
1008         uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1009         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1010         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1011
1012         if (builtin_net_driver) {
1013                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
1014         } else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1015                 uint16_t enqueue_fail = 0;
1016                 int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1017
1018                 complete_async_pkts(vdev);
1019                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit, dma_id, 0);
1020                 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
1021
1022                 enqueue_fail = nr_xmit - ret;
1023                 if (enqueue_fail)
1024                         free_pkts(&m[ret], nr_xmit - ret);
1025         } else {
1026                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1027                                                 m, nr_xmit);
1028         }
1029
1030         if (enable_stats) {
1031                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1032                                 __ATOMIC_SEQ_CST);
1033                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1034                                 __ATOMIC_SEQ_CST);
1035         }
1036
1037         if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1038                 free_pkts(m, nr_xmit);
1039 }
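/*
 * Note on mbuf ownership in drain_vhost(): in the async path, successfully
 * submitted mbufs stay in flight until complete_async_pkts() frees them and
 * failed submissions are freed immediately, so only the synchronous paths
 * free the whole burst at the end.
 */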
1040
1041 static __rte_always_inline void
1042 drain_vhost_table(void)
1043 {
1044         uint16_t lcore_id = rte_lcore_id();
1045         struct vhost_bufftable *vhost_txq;
1046         struct vhost_dev *vdev;
1047         uint64_t cur_tsc;
1048
1049         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1050                 if (unlikely(vdev->remove == 1))
1051                         continue;
1052
1053                 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1054
1055                 cur_tsc = rte_rdtsc();
1056                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
1057                                 > MBUF_TABLE_DRAIN_TSC)) {
1058                         RTE_LOG_DP(DEBUG, VHOST_DATA,
1059                                 "Vhost TX queue drained after timeout with burst size %u\n",
1060                                 vhost_txq->len);
1061                         drain_vhost(vdev);
1062                         vhost_txq->len = 0;
1063                         vhost_txq->pre_tsc = cur_tsc;
1064                 }
1065         }
1066 }
1067
1068 /*
1069  * Check if the packet destination MAC address is for a local device. If so then put
1070  * the packet on that devices RX queue. If not then return.
1071  */
1072 static __rte_always_inline int
1073 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1074 {
1075         struct rte_ether_hdr *pkt_hdr;
1076         struct vhost_dev *dst_vdev;
1077         struct vhost_bufftable *vhost_txq;
1078         uint16_t lcore_id = rte_lcore_id();
1079         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1080
1081         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1082         if (!dst_vdev)
1083                 return -1;
1084
1085         if (vdev->vid == dst_vdev->vid) {
1086                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1087                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1088                         vdev->vid);
1089                 return 0;
1090         }
1091
1092         RTE_LOG_DP(DEBUG, VHOST_DATA,
1093                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
1094
1095         if (unlikely(dst_vdev->remove)) {
1096                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1097                         "(%d) device is marked for removal\n", dst_vdev->vid);
1098                 return 0;
1099         }
1100
1101         vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1102         vhost_txq->m_table[vhost_txq->len++] = m;
1103
1104         if (enable_stats) {
1105                 vdev->stats.tx_total++;
1106                 vdev->stats.tx++;
1107         }
1108
1109         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1110                 drain_vhost(dst_vdev);
1111                 vhost_txq->len = 0;
1112                 vhost_txq->pre_tsc = rte_rdtsc();
1113         }
1114         return 0;
1115 }
1116
1117 /*
1118  * Check if the destination MAC of a packet belongs to a local VM and, if so,
1119  * get its VLAN tag and the length offset to restore.
1120  */
1121 static __rte_always_inline int
1122 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1123         uint32_t *offset, uint16_t *vlan_tag)
1124 {
1125         struct vhost_dev *dst_vdev;
1126         struct rte_ether_hdr *pkt_hdr =
1127                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1128
1129         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1130         if (!dst_vdev)
1131                 return 0;
1132
1133         if (vdev->vid == dst_vdev->vid) {
1134                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1135                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1136                         vdev->vid);
1137                 return -1;
1138         }
1139
1140         /*
1141          * HW VLAN strip reduces the packet length by the length of the
1142          * VLAN tag, so the packet length must be restored by adding it
1143          * back.
1144          */
1145         *offset  = RTE_VLAN_HLEN;
1146         *vlan_tag = vlan_tags[vdev->vid];
1147
1148         RTE_LOG_DP(DEBUG, VHOST_DATA,
1149                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1150                 vdev->vid, dst_vdev->vid, *vlan_tag);
1151
1152         return 0;
1153 }
1154
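/*
 * virtio_tx_offload() prepares an LRO-received packet for TSO on transmit:
 * it fills in the l2/l3/l4 header lengths, zeroes the IPv4 header checksum
 * and seeds the TCP checksum field with the pseudo-header checksum, as the
 * TSO-capable NIC expects (rte_ipv4_phdr_cksum()/rte_ipv6_phdr_cksum()).
 */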
1155 static void virtio_tx_offload(struct rte_mbuf *m)
1156 {
1157         struct rte_net_hdr_lens hdr_lens;
1158         struct rte_ipv4_hdr *ipv4_hdr;
1159         struct rte_tcp_hdr *tcp_hdr;
1160         uint32_t ptype;
1161         void *l3_hdr;
1162
1163         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1164         m->l2_len = hdr_lens.l2_len;
1165         m->l3_len = hdr_lens.l3_len;
1166         m->l4_len = hdr_lens.l4_len;
1167
1168         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1169         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1170                 m->l2_len + m->l3_len);
1171
1172         m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1173         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1174                 m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1175                 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1176                 ipv4_hdr = l3_hdr;
1177                 ipv4_hdr->hdr_checksum = 0;
1178                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1179         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1180                 m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1181                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1182         }
1183 }
1184
1185 static __rte_always_inline void
1186 do_drain_mbuf_table(struct mbuf_table *tx_q)
1187 {
1188         uint16_t count;
1189
1190         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1191                                  tx_q->m_table, tx_q->len);
1192         if (unlikely(count < tx_q->len))
1193                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1194
1195         tx_q->len = 0;
1196 }
1197
1198 /*
1199  * This function routes the TX packet to the correct interface. This
1200  * may be a local device or the physical port.
1201  */
1202 static __rte_always_inline void
1203 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1204 {
1205         struct mbuf_table *tx_q;
1206         unsigned offset = 0;
1207         const uint16_t lcore_id = rte_lcore_id();
1208         struct rte_ether_hdr *nh;
1209
1210
1211         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1212         if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1213                 struct vhost_dev *vdev2;
1214
1215                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1216                         if (vdev2 != vdev)
1217                                 sync_virtio_xmit(vdev2, vdev, m);
1218                 }
1219                 goto queue2nic;
1220         }
1221
1222         /* Check if the destination is a local VM. */
1223         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1224                 return;
1225
1226         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1227                 if (unlikely(find_local_dest(vdev, m, &offset,
1228                                              &vlan_tag) != 0)) {
1229                         rte_pktmbuf_free(m);
1230                         return;
1231                 }
1232         }
1233
1234         RTE_LOG_DP(DEBUG, VHOST_DATA,
1235                 "(%d) TX: MAC address is external\n", vdev->vid);
1236
1237 queue2nic:
1238
1239         /* Add the packet to the port TX queue. */
1240         tx_q = &lcore_tx_queue[lcore_id];
1241
1242         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1243         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1244                 /* Guest has inserted the vlan tag. */
1245                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1246                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1247                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1248                         (vh->vlan_tci != vlan_tag_be))
1249                         vh->vlan_tci = vlan_tag_be;
1250         } else {
1251                 m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1252
1253                 /*
1254                  * Find the right seg to adjust the data len when offset is
1255                  * bigger than tail room size.
1256                  */
1257                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1258                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1259                                 m->data_len += offset;
1260                         else {
1261                                 struct rte_mbuf *seg = m;
1262
1263                                 while ((seg->next != NULL) &&
1264                                         (offset > rte_pktmbuf_tailroom(seg)))
1265                                         seg = seg->next;
1266
1267                                 seg->data_len += offset;
1268                         }
1269                         m->pkt_len += offset;
1270                 }
1271
1272                 m->vlan_tci = vlan_tag;
1273         }
1274
1275         if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1276                 virtio_tx_offload(m);
1277
1278         tx_q->m_table[tx_q->len++] = m;
1279         if (enable_stats) {
1280                 vdev->stats.tx_total++;
1281                 vdev->stats.tx++;
1282         }
1283
1284         if (unlikely(tx_q->len == MAX_PKT_BURST))
1285                 do_drain_mbuf_table(tx_q);
1286 }
1287
1288
1289 static __rte_always_inline void
1290 drain_mbuf_table(struct mbuf_table *tx_q)
1291 {
1292         static uint64_t prev_tsc;
1293         uint64_t cur_tsc;
1294
1295         if (tx_q->len == 0)
1296                 return;
1297
1298         cur_tsc = rte_rdtsc();
1299         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1300                 prev_tsc = cur_tsc;
1301
1302                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1303                         "TX queue drained after timeout with burst size %u\n",
1304                         tx_q->len);
1305                 do_drain_mbuf_table(tx_q);
1306         }
1307 }
1308
1309 static __rte_always_inline void
1310 drain_eth_rx(struct vhost_dev *vdev)
1311 {
1312         uint16_t rx_count, enqueue_count;
1313         struct rte_mbuf *pkts[MAX_PKT_BURST];
1314
1315         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1316                                     pkts, MAX_PKT_BURST);
1317
1318         if (!rx_count)
1319                 return;
1320
1321         /*
1322          * When "enable_retry" is set, wait and retry when there are not
1323          * enough free slots in the queue to hold @rx_count packets, to
1324          * reduce packet loss.
1325          */
1326         if (enable_retry &&
1327             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1328                         VIRTIO_RXQ))) {
1329                 uint32_t retry;
1330
1331                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1332                         rte_delay_us(burst_rx_delay_time);
1333                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1334                                         VIRTIO_RXQ))
1335                                 break;
1336                 }
1337         }
1338
1339         if (builtin_net_driver) {
1340                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1341                                                 pkts, rx_count);
1342         } else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1343                 uint16_t enqueue_fail = 0;
1344                 int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1345
1346                 complete_async_pkts(vdev);
1347                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1348                                         VIRTIO_RXQ, pkts, rx_count, dma_id, 0);
1349                 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1350
1351                 enqueue_fail = rx_count - enqueue_count;
1352                 if (enqueue_fail)
1353                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1354
1355         } else {
1356                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1357                                                 pkts, rx_count);
1358         }
1359
1360         if (enable_stats) {
1361                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1362                                 __ATOMIC_SEQ_CST);
1363                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1364                                 __ATOMIC_SEQ_CST);
1365         }
1366
1367         if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1368                 free_pkts(pkts, rx_count);
1369 }
1370
1371 static __rte_always_inline void
1372 drain_virtio_tx(struct vhost_dev *vdev)
1373 {
1374         struct rte_mbuf *pkts[MAX_PKT_BURST];
1375         uint16_t count;
1376         uint16_t i;
1377
1378         if (builtin_net_driver) {
1379                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1380                                         pkts, MAX_PKT_BURST);
1381         } else {
1382                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1383                                         mbuf_pool, pkts, MAX_PKT_BURST);
1384         }
1385
1386         /* setup VMDq for the first packet */
1387         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1388                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1389                         free_pkts(pkts, count);
1390         }
1391
1392         for (i = 0; i < count; ++i)
1393                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1394 }
1395
1396 /*
1397  * Main function of vhost-switch. It basically does:
1398  *
1399  * for each vhost device {
1400  *    - drain_eth_rx()
1401  *
1402  *      Which drains the host eth Rx queue linked to the vhost device,
1403  *      and delivers all of the packets to the guest virtio Rx ring
1404  *      associated with this vhost device.
1405  *
1406  *    - drain_virtio_tx()
1407  *
1408  *      Which drains the guest virtio Tx queue and delivers all of the
1409  *      packets to the target, which could be another vhost device, or the
1410  *      physical eth dev. The route is done in function "virtio_tx_route".
1411  * }
1412  */
1413 static int
1414 switch_worker(void *arg __rte_unused)
1415 {
1416         unsigned i;
1417         unsigned lcore_id = rte_lcore_id();
1418         struct vhost_dev *vdev;
1419         struct mbuf_table *tx_q;
1420
1421         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1422
1423         tx_q = &lcore_tx_queue[lcore_id];
1424         for (i = 0; i < rte_lcore_count(); i++) {
1425                 if (lcore_ids[i] == lcore_id) {
1426                         tx_q->txq_id = i;
1427                         break;
1428                 }
1429         }
1430
1431         while(1) {
1432                 drain_mbuf_table(tx_q);
1433                 drain_vhost_table();
1434                 /*
1435                  * If requested, inform the configuration core that this core
1436                  * has left the linked list and no longer uses any device.
1437                  */
1438                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1439                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1440
1441                 /*
1442                  * Process vhost devices
1443                  */
1444                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1445                               lcore_vdev_entry) {
1446                         if (unlikely(vdev->remove)) {
1447                                 unlink_vmdq(vdev);
1448                                 vdev->ready = DEVICE_SAFE_REMOVE;
1449                                 continue;
1450                         }
1451
1452                         if (likely(vdev->ready == DEVICE_RX))
1453                                 drain_eth_rx(vdev);
1454
1455                         if (likely(!vdev->remove))
1456                                 drain_virtio_tx(vdev);
1457                 }
1458         }
1459
1460         return 0;
1461 }
1462
1463 /*
1464  * Remove a device from its data core's linked list and from the main
1465  * linked list. Synchronization occurs through the use of the lcore
1466  * dev_removal_flag. The device is made volatile to prevent re-ordering of
1467  * dev->remove = 1, which could cause an infinite loop in the rte_pause loop.
1468  */
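/*
 * The removal handshake implemented below proceeds in three steps:
 *   1. Set vdev->remove and spin until the owning worker marks the device
 *      DEVICE_SAFE_REMOVE in switch_worker().
 *   2. Unlink the device from its per-lcore list and from the global list.
 *   3. Raise REQUEST_DEV_REMOVAL on every worker lcore and wait for each
 *      one to answer with ACK_DEV_REMOVAL before freeing the device.
 */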
1469 static void
1470 destroy_device(int vid)
1471 {
1472         struct vhost_dev *vdev = NULL;
1473         int lcore;
1474         uint16_t i;
1475
1476         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1477                 if (vdev->vid == vid)
1478                         break;
1479         }
1480         if (!vdev)
1481                 return;
1482         /* Set the remove flag. */
1483         vdev->remove = 1;
1484         while (vdev->ready != DEVICE_SAFE_REMOVE) {
1485                 rte_pause();
1486         }
1487
1488         for (i = 0; i < RTE_MAX_LCORE; i++)
1489                 rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1490
1491         if (builtin_net_driver)
1492                 vs_vhost_net_remove(vdev);
1493
1494         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1495                      lcore_vdev_entry);
1496         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1497
1498
1499         /* Set the dev_removal_flag on each lcore. */
1500         RTE_LCORE_FOREACH_WORKER(lcore)
1501                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1502
1503         /*
1504          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL,
1505          * we can be sure that it can no longer access the device removed
1506          * from the linked lists and that the device is no longer in use.
1507          */
1508         RTE_LCORE_FOREACH_WORKER(lcore) {
1509                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1510                         rte_pause();
1511         }
1512
1513         lcore_info[vdev->coreid].device_num--;
1514
1515         RTE_LOG(INFO, VHOST_DATA,
1516                 "(%d) device has been removed from data core\n",
1517                 vdev->vid);
1518
1519         if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1520                 uint16_t n_pkt = 0;
1521                 int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1522                 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1523
1524                 while (vdev->pkts_inflight) {
1525                         n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1526                                                 m_cpl, vdev->pkts_inflight, dma_id, 0);
1527                         free_pkts(m_cpl, n_pkt);
1528                         __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1529                 }
1530
1531                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1532                 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1533         }
1534
1535         rte_free(vdev);
1536 }
1537
1538 /*
1539  * A new device is added to a data core. The device is first added to the
1540  * main linked list and then allocated to a specific data core.
1541  */
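/*
 * The device is assigned to the worker lcore that currently hosts the
 * fewest devices (see the RTE_LCORE_FOREACH_WORKER scan below), and guest
 * notifications are disabled on both rings since the data cores poll them.
 */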
1542 static int
1543 new_device(int vid)
1544 {
1545         int lcore, core_add = 0;
1546         uint16_t i;
1547         uint32_t device_num_min = num_devices;
1548         struct vhost_dev *vdev;
1549         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1550         if (vdev == NULL) {
1551                 RTE_LOG(INFO, VHOST_DATA,
1552                         "(%d) couldn't allocate memory for vhost dev\n",
1553                         vid);
1554                 return -1;
1555         }
1556         vdev->vid = vid;
1557
1558         for (i = 0; i < RTE_MAX_LCORE; i++) {
1559                 vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1560                         = rte_zmalloc("vhost bufftable",
1561                                 sizeof(struct vhost_bufftable),
1562                                 RTE_CACHE_LINE_SIZE);
1563
1564                 if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1565                         RTE_LOG(INFO, VHOST_DATA,
1566                           "(%d) couldn't allocate memory for vhost TX\n", vid);
                        /* Avoid leaking on this error path: release the
                         * buffers already allocated for this device.
                         */
                        while (i--)
                                rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
                        rte_free(vdev);
1567                         return -1;
1568                 }
1569         }
1570
1571         if (builtin_net_driver)
1572                 vs_vhost_net_setup(vdev);
1573
1574         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1575         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1576
1577         /* Reset ready flag. */
1578         vdev->ready = DEVICE_MAC_LEARNING;
1579         vdev->remove = 0;
1580
1581         /* Find a suitable lcore to add the device. */
1582         RTE_LCORE_FOREACH_WORKER(lcore) {
1583                 if (lcore_info[lcore].device_num < device_num_min) {
1584                         device_num_min = lcore_info[lcore].device_num;
1585                         core_add = lcore;
1586                 }
1587         }
1588         vdev->coreid = core_add;
1589
1590         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1591                           lcore_vdev_entry);
1592         lcore_info[vdev->coreid].device_num++;
1593
1594         /* Disable notifications. */
1595         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1596         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1597
1598         RTE_LOG(INFO, VHOST_DATA,
1599                 "(%d) device has been added to data core %d\n",
1600                 vid, vdev->coreid);
1601
1602         if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1603                 int ret;
1604
1605                 ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1606                 if (ret == 0)
1607                         dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = true;
1608                 return ret;
1609         }
1610
1611         return 0;
1612 }
1613
1614 static int
1615 vring_state_changed(int vid, uint16_t queue_id, int enable)
1616 {
1617         struct vhost_dev *vdev = NULL;
1618
1619         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1620                 if (vdev->vid == vid)
1621                         break;
1622         }
1623         if (!vdev)
1624                 return -1;
1625
1626         if (queue_id != VIRTIO_RXQ)
1627                 return 0;
1628
1629         if (dma_bind[vid].dmas[queue_id].async_enabled) {
1630                 if (!enable) {
1631                         uint16_t n_pkt = 0;
1632                         int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1633                         struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1634
1635                         while (vdev->pkts_inflight) {
1636                                 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1637                                                         m_cpl, vdev->pkts_inflight, dma_id, 0);
1638                                 free_pkts(m_cpl, n_pkt);
1639                                 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1640                         }
1641                 }
1642         }
1643
1644         return 0;
1645 }
1646
1647 /*
1648  * These callbacks allow devices to be added to the data core when
1649  * the configuration has fully completed.
1650  */
1651 static const struct rte_vhost_device_ops virtio_net_device_ops =
1652 {
1653         .new_device =  new_device,
1654         .destroy_device = destroy_device,
1655         .vring_state_changed = vring_state_changed,
1656 };
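/*
 * These ops are hooked up per vhost-user socket in main(), essentially
 * (feature tweaks omitted):
 *
 *     rte_vhost_driver_register(file, flags);
 *     rte_vhost_driver_callback_register(file, &virtio_net_device_ops);
 *     rte_vhost_driver_start(file);
 *
 * new_device()/destroy_device() are then invoked from the vhost-user
 * message handling context when a frontend connection becomes ready or
 * goes away.
 */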
1657
1658 /*
1659  * This thread wakes up periodically to print statistics if the user has
1660  * enabled them.
1661  */
1662 static void *
1663 print_stats(__rte_unused void *arg)
1664 {
1665         struct vhost_dev *vdev;
1666         uint64_t tx_dropped, rx_dropped;
1667         uint64_t tx, tx_total, rx, rx_total;
1668         const char clr[] = { 27, '[', '2', 'J', '\0' };
1669         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1670
1671         while (1) {
1672                 sleep(enable_stats);
1673
1674                 /* Clear screen and move to top left */
1675                 printf("%s%s\n", clr, top_left);
1676                 printf("Device statistics =================================\n");
1677
1678                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1679                         tx_total   = vdev->stats.tx_total;
1680                         tx         = vdev->stats.tx;
1681                         tx_dropped = tx_total - tx;
1682
1683                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1684                                 __ATOMIC_SEQ_CST);
1685                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1686                                 __ATOMIC_SEQ_CST);
1687                         rx_dropped = rx_total - rx;
1688
1689                         printf("Statistics for device %d\n"
1690                                 "-----------------------\n"
1691                                 "TX total:              %" PRIu64 "\n"
1692                                 "TX dropped:            %" PRIu64 "\n"
1693                                 "TX successful:         %" PRIu64 "\n"
1694                                 "RX total:              %" PRIu64 "\n"
1695                                 "RX dropped:            %" PRIu64 "\n"
1696                                 "RX successful:         %" PRIu64 "\n",
1697                                 vdev->vid,
1698                                 tx_total, tx_dropped, tx,
1699                                 rx_total, rx_dropped, rx);
1700                 }
1701
1702                 printf("===================================================\n");
1703
1704                 fflush(stdout);
1705         }
1706
1707         return NULL;
1708 }
1709
1710 static void
1711 unregister_drivers(int socket_num)
1712 {
1713         int i, ret;
1714
1715         for (i = 0; i < socket_num; i++) {
1716                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1717                 if (ret != 0)
1718                         RTE_LOG(ERR, VHOST_CONFIG,
1719                                 "Failed to unregister vhost driver for %s.\n",
1720                                 socket_files + i * PATH_MAX);
1721         }
1722 }
1723
1724 /* When we receive an INT signal, unregister the vhost driver. */
1725 static void
1726 sigint_handler(__rte_unused int signum)
1727 {
1728         /* Unregister vhost driver. */
1729         unregister_drivers(nb_sockets);
1730
1731         exit(0);
1732 }
1733
1734 /*
1735  * While creating an mbuf pool, one key thing is to figure out how
1736  * many mbuf entries are enough for our use. FYI, here are some
1737  * guidelines:
1738  *
1739  * - Each Rx queue reserves @nr_rx_desc mbufs at the queue setup stage.
1740  *
1741  * - For each switch core (a CPU core that does the packet switching),
1742  *   we also need to reserve some mbufs for receiving the packets from
1743  *   the virtio Tx queue. How many are enough depends on the usage; it
1744  *   is normally a simple calculation like the following:
1745  *
1746  *       MAX_PKT_BURST * max packet size / mbuf size
1747  *
1748  *   So we definitely need to allocate more mbufs when TSO is enabled.
1749  *
1750  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1751  *   mbufs for receiving the packets from the physical NIC device.
1752  *
1753  * - We also need to make sure that, for each switch core, we have
1754  *   allocated enough mbufs to fill up the mbuf cache.
1755  */
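/*
 * A worked example of the per-core term above (illustrative only; it
 * assumes MAX_PKT_BURST is 32 and mbuf_size is MBUF_DATA_SIZE, i.e.
 * 2048 bytes of data room plus 128 bytes of headroom = 2176):
 *
 *     mergeable buffers enabled            ->  mtu = 9000
 *     (9000 + 2176) * 32 / (2176 - 128)    ~= 174 mbufs for virtio Tx drain
 *     + 1024 (RTE_TEST_RX_DESC_DEFAULT)    ~= 1198 mbufs per switch core
 *
 * With TSO enabled (mtu = 64K) the first term alone grows to roughly
 * 1058 mbufs, which is why more mbufs must be allocated in that case.
 */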
1756 static void
1757 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1758         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1759 {
1760         uint32_t nr_mbufs;
1761         uint32_t nr_mbufs_per_core;
1762         uint32_t mtu = 1500;
1763
1764         if (mergeable)
1765                 mtu = 9000;
1766         if (enable_tso)
1767                 mtu = 64 * 1024;
1768
1769         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1770                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1771         nr_mbufs_per_core += nr_rx_desc;
1772         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1773
1774         nr_mbufs  = nr_queues * nr_rx_desc;
1775         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1776         nr_mbufs *= nr_port;
1777
1778         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1779                                             nr_mbuf_cache, 0, mbuf_size,
1780                                             rte_socket_id());
1781         if (mbuf_pool == NULL)
1782                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1783 }
1784
1785 static void
1786 reset_dma(void)
1787 {
1788         int i;
1789
1790         for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1791                 int j;
1792
1793                 for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1794                         dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1795                         dma_bind[i].dmas[j].async_enabled = false;
1796                 }
1797         }
1798
1799         for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1800                 dmas_id[i] = INVALID_DMA_ID;
1801 }
1802
1803 /*
1804  * Main function: does initialisation and calls the per-lcore functions.
1805  */
1806 int
1807 main(int argc, char *argv[])
1808 {
1809         unsigned lcore_id, core_id = 0;
1810         unsigned nb_ports, valid_num_ports;
1811         int ret, i;
1812         uint16_t portid;
1813         static pthread_t tid;
1814         uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1815
1816         signal(SIGINT, sigint_handler);
1817
1818         /* init EAL */
1819         ret = rte_eal_init(argc, argv);
1820         if (ret < 0)
1821                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1822         argc -= ret;
1823         argv += ret;
1824
1825         /* initialize dma structures */
1826         reset_dma();
1827
1828         /* parse app arguments */
1829         ret = us_vhost_parse_args(argc, argv);
1830         if (ret < 0)
1831                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1832
1833         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1834                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1835
1836                 if (rte_lcore_is_enabled(lcore_id))
1837                         lcore_ids[core_id++] = lcore_id;
1838         }
1839
1840         if (rte_lcore_count() > RTE_MAX_LCORE)
1841                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1842
1843         /* Get the number of physical ports. */
1844         nb_ports = rte_eth_dev_count_avail();
1845
1846         /*
1847          * Update the global var NUM_PORTS and the global array PORTS, and get
1848          * the value of VALID_NUM_PORTS according to the number of system ports.
1849          */
1850         valid_num_ports = check_ports_num(nb_ports);
1851
1852         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1853                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1854                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1855                 return -1;
1856         }
1857
1858         /*
1859          * FIXME: here we are trying to allocate mbufs big enough for
1860          * @MAX_QUEUES, but the truth is we're never going to use that
1861          * many queues here. We probably should only do allocation for
1862          * those queues we are going to use.
1863          */
1864         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1865                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1866
1867         if (vm2vm_mode == VM2VM_HARDWARE) {
1868                 /* Enable VT loop back to let L2 switch to do it. */
1869                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1870                 RTE_LOG(DEBUG, VHOST_CONFIG,
1871                         "Enable loop back for L2 switch in vmdq.\n");
1872         }
1873
1874         /* initialize all ports */
1875         RTE_ETH_FOREACH_DEV(portid) {
1876                 /* skip ports that are not enabled */
1877                 if ((enabled_port_mask & (1 << portid)) == 0) {
1878                         RTE_LOG(INFO, VHOST_PORT,
1879                                 "Skipping disabled port %d\n", portid);
1880                         continue;
1881                 }
1882                 if (port_init(portid) != 0)
1883                         rte_exit(EXIT_FAILURE,
1884                                 "Cannot initialize network ports\n");
1885         }
1886
1887         /* Enable stats if the user option is set. */
1888         if (enable_stats) {
1889                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1890                                         print_stats, NULL);
1891                 if (ret < 0)
1892                         rte_exit(EXIT_FAILURE,
1893                                 "Cannot create print-stats thread\n");
1894         }
1895
1896         /* Launch all data cores. */
1897         RTE_LCORE_FOREACH_WORKER(lcore_id)
1898                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1899
1900         if (client_mode)
1901                 flags |= RTE_VHOST_USER_CLIENT;
1902
1903         for (i = 0; i < dma_count; i++) {
1904                 if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1905                         RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
1906                         rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
1907                 }
1908         }
1909
1910         /* Register vhost user driver to handle vhost messages. */
1911         for (i = 0; i < nb_sockets; i++) {
1912                 char *file = socket_files + i * PATH_MAX;
1913
1914                 if (dma_count)
1915                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1916
1917                 ret = rte_vhost_driver_register(file, flags);
1918                 if (ret != 0) {
1919                         unregister_drivers(i);
1920                         rte_exit(EXIT_FAILURE,
1921                                 "vhost driver register failure.\n");
1922                 }
1923
1924                 if (builtin_net_driver)
1925                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1926
1927                 if (mergeable == 0) {
1928                         rte_vhost_driver_disable_features(file,
1929                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1930                 }
1931
1932                 if (enable_tx_csum == 0) {
1933                         rte_vhost_driver_disable_features(file,
1934                                 1ULL << VIRTIO_NET_F_CSUM);
1935                 }
1936
1937                 if (enable_tso == 0) {
1938                         rte_vhost_driver_disable_features(file,
1939                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1940                         rte_vhost_driver_disable_features(file,
1941                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1942                         rte_vhost_driver_disable_features(file,
1943                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1944                         rte_vhost_driver_disable_features(file,
1945                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1946                 }
1947
1948                 if (promiscuous) {
1949                         rte_vhost_driver_enable_features(file,
1950                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1951                 }
1952
1953                 ret = rte_vhost_driver_callback_register(file,
1954                         &virtio_net_device_ops);
1955                 if (ret != 0) {
1956                         rte_exit(EXIT_FAILURE,
1957                                 "failed to register vhost driver callbacks.\n");
1958                 }
1959
1960                 if (rte_vhost_driver_start(file) < 0) {
1961                         rte_exit(EXIT_FAILURE,
1962                                 "failed to start vhost driver.\n");
1963                 }
1964         }
1965
1966         RTE_LCORE_FOREACH_WORKER(lcore_id)
1967                 rte_eal_wait_lcore(lcore_id);
1968
1969         /* clean up the EAL */
1970         rte_eal_cleanup();
1971
1972         return 0;
1973 }