1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
29
30 #include "main.h"
31
32 #ifndef MAX_QUEUES
33 #define MAX_QUEUES 128
34 #endif
35
36 /* the maximum number of external ports supported */
37 #define MAX_SUP_PORTS 1
38
39 #define MBUF_CACHE_SIZE 128
40 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
41
42 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
43
44 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
45 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
46
47 #define JUMBO_FRAME_MAX_SIZE    0x2600
48 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
49
50 /* State of virtio device. */
51 #define DEVICE_MAC_LEARNING 0
52 #define DEVICE_RX                       1
53 #define DEVICE_SAFE_REMOVE      2
54
55 /* Configurable number of RX/TX ring descriptors */
56 #define RTE_TEST_RX_DESC_DEFAULT 1024
57 #define RTE_TEST_TX_DESC_DEFAULT 512
58
59 #define INVALID_PORT_ID 0xFF
60 #define INVALID_DMA_ID -1
61
62 #define DMA_RING_SIZE 4096
63
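/*
 * dma_bind maps each vhost device (by vid) to the DMA channel used for
 * async enqueue; dmas_id records the IDs of the DMA devices configured
 * so far and dma_count how many of them are in use.
 */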
64 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
65 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
66 static int dma_count;
67
68 /* mask of enabled ports */
69 static uint32_t enabled_port_mask = 0;
70
71 /* Promiscuous mode */
72 static uint32_t promiscuous;
73
74 /* Number of devices/queues to support */
75 static uint32_t num_queues = 0;
76 static uint32_t num_devices;
77
78 static struct rte_mempool *mbuf_pool;
79 static int mergeable;
80
81 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
82 typedef enum {
83         VM2VM_DISABLED = 0,
84         VM2VM_SOFTWARE = 1,
85         VM2VM_HARDWARE = 2,
86         VM2VM_LAST
87 } vm2vm_type;
88 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
89
90 /* Enable stats. */
91 static uint32_t enable_stats = 0;
92 /* Enable retries on RX. */
93 static uint32_t enable_retry = 1;
94
95 /* Enable Tx checksum offload (disabled by default). */
96 static uint32_t enable_tx_csum;
97
98 /* Enable TSO offload (disabled by default). */
99 static uint32_t enable_tso;
100
101 static int client_mode;
102
103 static int builtin_net_driver;
104
105 /* Specify the timeout (in microseconds) between retries on Rx. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
109
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
113
114 /* empty VMDq configuration structure. Filled in programmatically */
115 static struct rte_eth_conf vmdq_conf_default = {
116         .rxmode = {
117                 .mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
118                 .split_hdr_size = 0,
119                 /*
120                  * VLAN strip is necessary for 1G NICs such as the I350;
121                  * it fixes a bug where IPv4 forwarding in the guest cannot
122                  * forward packets from one virtio dev to another virtio dev.
123                  */
124                 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
125         },
126
127         .txmode = {
128                 .mq_mode = RTE_ETH_MQ_TX_NONE,
129                 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
130                              RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
131                              RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
132                              RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
133                              RTE_ETH_TX_OFFLOAD_TCP_TSO),
134         },
135         .rx_adv_conf = {
136                 /*
137                  * should be overridden separately in code with
138                  * appropriate values
139                  */
140                 .vmdq_rx_conf = {
141                         .nb_queue_pools = RTE_ETH_8_POOLS,
142                         .enable_default_pool = 0,
143                         .default_pool = 0,
144                         .nb_pool_maps = 0,
145                         .pool_map = {{0, 0},},
146                 },
147         },
148 };
149
150
151 static unsigned lcore_ids[RTE_MAX_LCORE];
152 static uint16_t ports[RTE_MAX_ETHPORTS];
153 static unsigned num_ports = 0; /**< The number of ports specified in command line */
154 static uint16_t num_pf_queues, num_vmdq_queues;
155 static uint16_t vmdq_pool_base, vmdq_queue_base;
156 static uint16_t queues_per_pool;
157
158 const uint16_t vlan_tags[] = {
159         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
160         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
161         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
162         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
163         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
164         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
165         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
166         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
167 };
168
169 /* ethernet addresses of ports */
170 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
171
172 static struct vhost_dev_tailq_list vhost_dev_list =
173         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
174
175 static struct lcore_info lcore_info[RTE_MAX_LCORE];
176
177 /* Used for queueing bursts of TX packets. */
178 struct mbuf_table {
179         unsigned len;
180         unsigned txq_id;
181         struct rte_mbuf *m_table[MAX_PKT_BURST];
182 };
183
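/*
 * Tx buffer for one (lcore, vhost device) pair. pre_tsc records the TSC
 * timestamp of the last drain, so that buffered packets are flushed once
 * MBUF_TABLE_DRAIN_TSC cycles have elapsed.
 */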
184 struct vhost_bufftable {
185         uint32_t len;
186         uint64_t pre_tsc;
187         struct rte_mbuf *m_table[MAX_PKT_BURST];
188 };
189
190 /* TX queue for each data core. */
191 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
192
193 /*
194  * Vhost TX buffer for each data core.
195  * Every data core maintains a TX buffer for every vhost device,
196  * which is used for batch pkts enqueue for higher performance.
197  */
198 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
199
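/* Number of TSC cycles corresponding to BURST_TX_DRAIN_US (rounded up). */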
200 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
201                                  / US_PER_S * BURST_TX_DRAIN_US)
202
203 static inline bool
204 is_dma_configured(int16_t dev_id)
205 {
206         int i;
207
208         for (i = 0; i < dma_count; i++)
209                 if (dmas_id[i] == dev_id)
210                         return true;
211         return false;
212 }
213
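/*
 * Parse the --dmas argument and configure the requested DMA devices.
 * Based on the parsing below, the expected format is a bracketed,
 * comma-separated list of txd<vid>@<DMA device name> entries, for
 * example (illustrative only): --dmas [txd0@0000:00:04.0,txd1@0000:00:04.1]
 */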
214 static inline int
215 open_dma(const char *value)
216 {
217         struct dma_for_vhost *dma_info = dma_bind;
218         char *input = strndup(value, strlen(value) + 1);
219         char *addrs = input;
220         char *ptrs[2];
221         char *start, *end, *substr;
222         int64_t vid;
223
224         struct rte_dma_info info;
225         struct rte_dma_conf dev_config = { .nb_vchans = 1 };
226         struct rte_dma_vchan_conf qconf = {
227                 .direction = RTE_DMA_DIR_MEM_TO_MEM,
228                 .nb_desc = DMA_RING_SIZE
229         };
230
231         int dev_id;
232         int ret = 0;
233         uint16_t i = 0;
234         char *dma_arg[RTE_MAX_VHOST_DEVICE];
235         int args_nr;
236
237         while (isblank(*addrs))
238                 addrs++;
239         if (*addrs == '\0') {
240                 ret = -1;
241                 goto out;
242         }
243
244         /* Process the DMA devices listed within the brackets. */
245         addrs++;
246         substr = strtok(addrs, ";]");
247         if (!substr) {
248                 ret = -1;
249                 goto out;
250         }
251
252         args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
253         if (args_nr <= 0) {
254                 ret = -1;
255                 goto out;
256         }
257
258         while (i < args_nr) {
259                 char *arg_temp = dma_arg[i];
260                 uint8_t sub_nr;
261
262                 sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
263                 if (sub_nr != 2) {
264                         ret = -1;
265                         goto out;
266                 }
267
268                 start = strstr(ptrs[0], "txd");
269                 if (start == NULL) {
270                         ret = -1;
271                         goto out;
272                 }
273
274                 start += 3;
275                 vid = strtol(start, &end, 0);
276                 if (end == start) {
277                         ret = -1;
278                         goto out;
279                 }
280
281                 dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
282                 if (dev_id < 0) {
283                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to find DMA %s.\n", ptrs[1]);
284                         ret = -1;
285                         goto out;
286                 }
287
288                 /* DMA device is already configured, so skip */
289                 if (is_dma_configured(dev_id))
290                         goto done;
291
292                 if (rte_dma_info_get(dev_id, &info) != 0) {
293                         RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
294                         ret = -1;
295                         goto out;
296                 }
297
298                 if (info.max_vchans < 1) {
299                         RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
300                         ret = -1;
301                         goto out;
302                 }
303
304                 if (rte_dma_configure(dev_id, &dev_config) != 0) {
305                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to configure DMA %d.\n", dev_id);
306                         ret = -1;
307                         goto out;
308                 }
309
310                 /* Check the max desc supported by DMA device */
311                 rte_dma_info_get(dev_id, &info);
312                 if (info.nb_vchans != 1) {
313                         RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
314                                         dev_id);
315                         ret = -1;
316                         goto out;
317                 }
318
319                 qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
320
321                 if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
322                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to set up DMA %d.\n", dev_id);
323                         ret = -1;
324                         goto out;
325                 }
326
327                 if (rte_dma_start(dev_id) != 0) {
328                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to start DMA %d.\n", dev_id);
329                         ret = -1;
330                         goto out;
331                 }
332
333                 dmas_id[dma_count++] = dev_id;
334
335 done:
336                 (dma_info + vid)->dmas[VIRTIO_RXQ].dev_id = dev_id;
337                 i++;
338         }
339 out:
340         free(input);
341         return ret;
342 }
343
344 /*
345  * Builds up the correct configuration for VMDQ VLAN pool map
346  * according to the pool & queue limits.
347  */
348 static inline int
349 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
350 {
351         struct rte_eth_vmdq_rx_conf conf;
352         struct rte_eth_vmdq_rx_conf *def_conf =
353                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
354         unsigned i;
355
356         memset(&conf, 0, sizeof(conf));
357         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
358         conf.nb_pool_maps = num_devices;
359         conf.enable_loop_back = def_conf->enable_loop_back;
360         conf.rx_mode = def_conf->rx_mode;
361
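        /* Map one VLAN tag to each VMDq pool: vlan_tags[i] -> pool i. */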
362         for (i = 0; i < conf.nb_pool_maps; i++) {
363                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
364                 conf.pool_map[i].pools = (1UL << i);
365         }
366
367         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
368         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
369                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
370         return 0;
371 }
372
373 /*
374  * Initialises a given port using global settings and with the rx buffers
375  * coming from the mbuf_pool passed as parameter
376  */
377 static inline int
378 port_init(uint16_t port)
379 {
380         struct rte_eth_dev_info dev_info;
381         struct rte_eth_conf port_conf;
382         struct rte_eth_rxconf *rxconf;
383         struct rte_eth_txconf *txconf;
384         int16_t rx_rings, tx_rings;
385         uint16_t rx_ring_size, tx_ring_size;
386         int retval;
387         uint16_t q;
388
389         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
390         retval = rte_eth_dev_info_get(port, &dev_info);
391         if (retval != 0) {
392                 RTE_LOG(ERR, VHOST_PORT,
393                         "Error during getting device (port %u) info: %s\n",
394                         port, strerror(-retval));
395
396                 return retval;
397         }
398
399         rxconf = &dev_info.default_rxconf;
400         txconf = &dev_info.default_txconf;
401         rxconf->rx_drop_en = 1;
402
403         /* Configure the number of supported virtio devices based on VMDq limits. */
404         num_devices = dev_info.max_vmdq_pools;
405
406         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
407         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
408
409         tx_rings = (uint16_t)rte_lcore_count();
410
411         if (mergeable) {
412                 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
413                         vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
414                 else
415                         vmdq_conf_default.rxmode.mtu = MAX_MTU;
416         }
417
418         /* Get port configuration. */
419         retval = get_eth_conf(&port_conf, num_devices);
420         if (retval < 0)
421                 return retval;
422         /* NIC queues are divided into pf queues and vmdq queues.  */
423         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
424         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
425         num_vmdq_queues = num_devices * queues_per_pool;
426         num_queues = num_pf_queues + num_vmdq_queues;
427         vmdq_queue_base = dev_info.vmdq_queue_base;
428         vmdq_pool_base  = dev_info.vmdq_pool_base;
429         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
430                 num_pf_queues, num_devices, queues_per_pool);
431
432         if (!rte_eth_dev_is_valid_port(port))
433                 return -1;
434
435         rx_rings = (uint16_t)dev_info.max_rx_queues;
436         if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
437                 port_conf.txmode.offloads |=
438                         RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
439         /* Configure ethernet device. */
440         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
441         if (retval != 0) {
442                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
443                         port, strerror(-retval));
444                 return retval;
445         }
446
447         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
448                 &tx_ring_size);
449         if (retval != 0) {
450                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
451                         "for port %u: %s.\n", port, strerror(-retval));
452                 return retval;
453         }
454         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
455                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
456                         "for Rx queues on port %u.\n", port);
457                 return -1;
458         }
459
460         /* Setup the queues. */
461         rxconf->offloads = port_conf.rxmode.offloads;
462         for (q = 0; q < rx_rings; q ++) {
463                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
464                                                 rte_eth_dev_socket_id(port),
465                                                 rxconf,
466                                                 mbuf_pool);
467                 if (retval < 0) {
468                         RTE_LOG(ERR, VHOST_PORT,
469                                 "Failed to setup rx queue %u of port %u: %s.\n",
470                                 q, port, strerror(-retval));
471                         return retval;
472                 }
473         }
474         txconf->offloads = port_conf.txmode.offloads;
475         for (q = 0; q < tx_rings; q ++) {
476                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
477                                                 rte_eth_dev_socket_id(port),
478                                                 txconf);
479                 if (retval < 0) {
480                         RTE_LOG(ERR, VHOST_PORT,
481                                 "Failed to setup tx queue %u of port %u: %s.\n",
482                                 q, port, strerror(-retval));
483                         return retval;
484                 }
485         }
486
487         /* Start the device. */
488         retval  = rte_eth_dev_start(port);
489         if (retval < 0) {
490                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
491                         port, strerror(-retval));
492                 return retval;
493         }
494
495         if (promiscuous) {
496                 retval = rte_eth_promiscuous_enable(port);
497                 if (retval != 0) {
498                         RTE_LOG(ERR, VHOST_PORT,
499                                 "Failed to enable promiscuous mode on port %u: %s\n",
500                                 port, rte_strerror(-retval));
501                         return retval;
502                 }
503         }
504
505         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
506         if (retval < 0) {
507                 RTE_LOG(ERR, VHOST_PORT,
508                         "Failed to get MAC address on port %u: %s\n",
509                         port, rte_strerror(-retval));
510                 return retval;
511         }
512
513         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
514         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
515                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
516                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
517
518         return 0;
519 }
520
521 /*
522  * Set socket file path.
523  */
524 static int
525 us_vhost_parse_socket_path(const char *q_arg)
526 {
527         char *old;
528
529         /* Reject paths that do not fit (including the NUL) in a PATH_MAX-sized slot. */
530         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
531                 return -1;
532
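        /* Grow the flat array of PATH_MAX-sized slots by one and append the new path. */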
533         old = socket_files;
534         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
535         if (socket_files == NULL) {
536                 free(old);
537                 return -1;
538         }
539
540         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
541         nb_sockets++;
542
543         return 0;
544 }
545
546 /*
547  * Parse the portmask provided at run time.
548  */
549 static int
550 parse_portmask(const char *portmask)
551 {
552         char *end = NULL;
553         unsigned long pm;
554
555         errno = 0;
556
557         /* parse hexadecimal string */
558         pm = strtoul(portmask, &end, 16);
559         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
560                 return 0;
561
562         return pm;
563
564 }
565
566 /*
567  * Parse num options at run time.
568  */
569 static int
570 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
571 {
572         char *end = NULL;
573         unsigned long num;
574
575         errno = 0;
576
577         /* parse unsigned int string */
578         num = strtoul(q_arg, &end, 10);
579         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
580                 return -1;
581
582         if (num > max_valid_value)
583                 return -1;
584
585         return num;
586
587 }
588
589 /*
590  * Display usage
591  */
592 static void
593 us_vhost_usage(const char *prgname)
594 {
595         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
596         "               --vm2vm [0|1|2]\n"
597         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
598         "               --socket-file <path>\n"
599         "               --nb-devices ND\n"
600         "               -p PORTMASK: Set mask for ports to be used by application\n"
601         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
602         "               --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
603         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. This takes effect only if Rx retries are enabled\n"
604         "               --rx-retry-num [0-N]: the number of retries on Rx. This takes effect only if Rx retries are enabled\n"
605         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
606         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
607         "               --socket-file: The path of the socket file.\n"
608         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
609         "               --tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
610         "               --client register vhost-user sockets in client mode.\n"
611         "               --dmas register a DMA channel for a specific vhost device.\n",
612                prgname);
613 }
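
/*
 * Example invocation (illustrative only; EAL options, the port mask and
 * the socket path depend on the target system):
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --mergeable 1 --stats 1
 */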
614
615 enum {
616 #define OPT_VM2VM               "vm2vm"
617         OPT_VM2VM_NUM = 256,
618 #define OPT_RX_RETRY            "rx-retry"
619         OPT_RX_RETRY_NUM,
620 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
621         OPT_RX_RETRY_DELAY_NUM,
622 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
623         OPT_RX_RETRY_NUMB_NUM,
624 #define OPT_MERGEABLE           "mergeable"
625         OPT_MERGEABLE_NUM,
626 #define OPT_STATS               "stats"
627         OPT_STATS_NUM,
628 #define OPT_SOCKET_FILE         "socket-file"
629         OPT_SOCKET_FILE_NUM,
630 #define OPT_TX_CSUM             "tx-csum"
631         OPT_TX_CSUM_NUM,
632 #define OPT_TSO                 "tso"
633         OPT_TSO_NUM,
634 #define OPT_CLIENT              "client"
635         OPT_CLIENT_NUM,
636 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
637         OPT_BUILTIN_NET_DRIVER_NUM,
638 #define OPT_DMAS                "dmas"
639         OPT_DMAS_NUM,
640 };
641
642 /*
643  * Parse the arguments given in the command line of the application.
644  */
645 static int
646 us_vhost_parse_args(int argc, char **argv)
647 {
648         int opt, ret;
649         int option_index;
650         unsigned i;
651         const char *prgname = argv[0];
652         static struct option long_option[] = {
653                 {OPT_VM2VM, required_argument,
654                                 NULL, OPT_VM2VM_NUM},
655                 {OPT_RX_RETRY, required_argument,
656                                 NULL, OPT_RX_RETRY_NUM},
657                 {OPT_RX_RETRY_DELAY, required_argument,
658                                 NULL, OPT_RX_RETRY_DELAY_NUM},
659                 {OPT_RX_RETRY_NUMB, required_argument,
660                                 NULL, OPT_RX_RETRY_NUMB_NUM},
661                 {OPT_MERGEABLE, required_argument,
662                                 NULL, OPT_MERGEABLE_NUM},
663                 {OPT_STATS, required_argument,
664                                 NULL, OPT_STATS_NUM},
665                 {OPT_SOCKET_FILE, required_argument,
666                                 NULL, OPT_SOCKET_FILE_NUM},
667                 {OPT_TX_CSUM, required_argument,
668                                 NULL, OPT_TX_CSUM_NUM},
669                 {OPT_TSO, required_argument,
670                                 NULL, OPT_TSO_NUM},
671                 {OPT_CLIENT, no_argument,
672                                 NULL, OPT_CLIENT_NUM},
673                 {OPT_BUILTIN_NET_DRIVER, no_argument,
674                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
675                 {OPT_DMAS, required_argument,
676                                 NULL, OPT_DMAS_NUM},
677                 {NULL, 0, 0, 0},
678         };
679
680         /* Parse command line */
681         while ((opt = getopt_long(argc, argv, "p:P",
682                         long_option, &option_index)) != EOF) {
683                 switch (opt) {
684                 /* Portmask */
685                 case 'p':
686                         enabled_port_mask = parse_portmask(optarg);
687                         if (enabled_port_mask == 0) {
688                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
689                                 us_vhost_usage(prgname);
690                                 return -1;
691                         }
692                         break;
693
694                 case 'P':
695                         promiscuous = 1;
696                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
697                                 RTE_ETH_VMDQ_ACCEPT_BROADCAST |
698                                 RTE_ETH_VMDQ_ACCEPT_MULTICAST;
699                         break;
700
701                 case OPT_VM2VM_NUM:
702                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
703                         if (ret == -1) {
704                                 RTE_LOG(INFO, VHOST_CONFIG,
705                                         "Invalid argument for "
706                                         "vm2vm [0|1|2]\n");
707                                 us_vhost_usage(prgname);
708                                 return -1;
709                         }
710                         vm2vm_mode = (vm2vm_type)ret;
711                         break;
712
713                 case OPT_RX_RETRY_NUM:
714                         ret = parse_num_opt(optarg, 1);
715                         if (ret == -1) {
716                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
717                                 us_vhost_usage(prgname);
718                                 return -1;
719                         }
720                         enable_retry = ret;
721                         break;
722
723                 case OPT_TX_CSUM_NUM:
724                         ret = parse_num_opt(optarg, 1);
725                         if (ret == -1) {
726                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
727                                 us_vhost_usage(prgname);
728                                 return -1;
729                         }
730                         enable_tx_csum = ret;
731                         break;
732
733                 case OPT_TSO_NUM:
734                         ret = parse_num_opt(optarg, 1);
735                         if (ret == -1) {
736                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
737                                 us_vhost_usage(prgname);
738                                 return -1;
739                         }
740                         enable_tso = ret;
741                         break;
742
743                 case OPT_RX_RETRY_DELAY_NUM:
744                         ret = parse_num_opt(optarg, INT32_MAX);
745                         if (ret == -1) {
746                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
747                                 us_vhost_usage(prgname);
748                                 return -1;
749                         }
750                         burst_rx_delay_time = ret;
751                         break;
752
753                 case OPT_RX_RETRY_NUMB_NUM:
754                         ret = parse_num_opt(optarg, INT32_MAX);
755                         if (ret == -1) {
756                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
757                                 us_vhost_usage(prgname);
758                                 return -1;
759                         }
760                         burst_rx_retry_num = ret;
761                         break;
762
763                 case OPT_MERGEABLE_NUM:
764                         ret = parse_num_opt(optarg, 1);
765                         if (ret == -1) {
766                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
767                                 us_vhost_usage(prgname);
768                                 return -1;
769                         }
770                         mergeable = !!ret;
771                         break;
772
773                 case OPT_STATS_NUM:
774                         ret = parse_num_opt(optarg, INT32_MAX);
775                         if (ret == -1) {
776                                 RTE_LOG(INFO, VHOST_CONFIG,
777                                         "Invalid argument for stats [0..N]\n");
778                                 us_vhost_usage(prgname);
779                                 return -1;
780                         }
781                         enable_stats = ret;
782                         break;
783
784                 /* Set socket file path. */
785                 case OPT_SOCKET_FILE_NUM:
786                         if (us_vhost_parse_socket_path(optarg) == -1) {
787                                 RTE_LOG(INFO, VHOST_CONFIG,
788                                 "Invalid argument for socket name (Max %d characters)\n",
789                                 PATH_MAX);
790                                 us_vhost_usage(prgname);
791                                 return -1;
792                         }
793                         break;
794
795                 case OPT_DMAS_NUM:
796                         if (open_dma(optarg) == -1) {
797                                 RTE_LOG(INFO, VHOST_CONFIG,
798                                         "Wrong DMA args\n");
799                                 us_vhost_usage(prgname);
800                                 return -1;
801                         }
802                         break;
803
804                 case OPT_CLIENT_NUM:
805                         client_mode = 1;
806                         break;
807
808                 case OPT_BUILTIN_NET_DRIVER_NUM:
809                         builtin_net_driver = 1;
810                         break;
811
812                 /* Invalid option - print options. */
813                 default:
814                         us_vhost_usage(prgname);
815                         return -1;
816                 }
817         }
818
819         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
820                 if (enabled_port_mask & (1 << i))
821                         ports[num_ports++] = i;
822         }
823
824         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
825                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
826                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
827                 return -1;
828         }
829
830         return 0;
831 }
832
833 /*
834  * Update the global variable num_ports and the ports[] array according to the
835  * number of ports in the system, and return the number of valid ports.
836  */
837 static unsigned check_ports_num(unsigned nb_ports)
838 {
839         unsigned valid_num_ports = num_ports;
840         unsigned portid;
841
842         if (num_ports > nb_ports) {
843                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
844                         num_ports, nb_ports);
845                 num_ports = nb_ports;
846         }
847
848         for (portid = 0; portid < num_ports; portid ++) {
849                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
850                         RTE_LOG(INFO, VHOST_PORT,
851                                 "\nSpecified port ID(%u) is not valid\n",
852                                 ports[portid]);
853                         ports[portid] = INVALID_PORT_ID;
854                         valid_num_ports--;
855                 }
856         }
857         return valid_num_ports;
858 }
859
860 static __rte_always_inline struct vhost_dev *
861 find_vhost_dev(struct rte_ether_addr *mac)
862 {
863         struct vhost_dev *vdev;
864
865         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
866                 if (vdev->ready == DEVICE_RX &&
867                     rte_is_same_ether_addr(mac, &vdev->mac_address))
868                         return vdev;
869         }
870
871         return NULL;
872 }
873
874 /*
875  * This function learns the MAC address of the device and registers it, along
876  * with a VLAN tag, with a VMDq pool.
877  */
878 static int
879 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
880 {
881         struct rte_ether_hdr *pkt_hdr;
882         int i, ret;
883
884         /* Learn MAC address of guest device from packet */
885         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
886
887         if (find_vhost_dev(&pkt_hdr->src_addr)) {
888                 RTE_LOG(ERR, VHOST_DATA,
889                         "(%d) device is using a registered MAC!\n",
890                         vdev->vid);
891                 return -1;
892         }
893
894         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
895                 vdev->mac_address.addr_bytes[i] =
896                         pkt_hdr->src_addr.addr_bytes[i];
897
898         /* vlan_tag currently uses the device_id. */
899         vdev->vlan_tag = vlan_tags[vdev->vid];
900
901         /* Print out VMDQ registration info. */
902         RTE_LOG(INFO, VHOST_DATA,
903                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
904                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
905                 vdev->vlan_tag);
906
907         /* Register the MAC address. */
908         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
909                                 (uint32_t)vdev->vid + vmdq_pool_base);
910         if (ret)
911                 RTE_LOG(ERR, VHOST_DATA,
912                         "(%d) failed to add device MAC address to VMDQ\n",
913                         vdev->vid);
914
915         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
916
917         /* Set device as ready for RX. */
918         vdev->ready = DEVICE_RX;
919
920         return 0;
921 }
922
923 /*
924  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
925  * queue before disabling RX on the device.
926  */
927 static inline void
928 unlink_vmdq(struct vhost_dev *vdev)
929 {
930         unsigned i = 0;
931         unsigned rx_count;
932         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
933
934         if (vdev->ready == DEVICE_RX) {
935                 /* Clear MAC and VLAN settings. */
936                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
937                 for (i = 0; i < 6; i++)
938                         vdev->mac_address.addr_bytes[i] = 0;
939
940                 vdev->vlan_tag = 0;
941
942                 /* Clear out the receive buffers. */
943                 rx_count = rte_eth_rx_burst(ports[0],
944                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
945
946                 while (rx_count) {
947                         for (i = 0; i < rx_count; i++)
948                                 rte_pktmbuf_free(pkts_burst[i]);
949
950                         rx_count = rte_eth_rx_burst(ports[0],
951                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
952                 }
953
954                 vdev->ready = DEVICE_MAC_LEARNING;
955         }
956 }
957
958 static inline void
959 free_pkts(struct rte_mbuf **pkts, uint16_t n)
960 {
961         while (n--)
962                 rte_pktmbuf_free(pkts[n]);
963 }
964
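/*
 * Poll the vhost async channel for enqueue copies that the DMA device has
 * completed, free the source mbufs and decrease the in-flight counter.
 */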
965 static __rte_always_inline void
966 complete_async_pkts(struct vhost_dev *vdev)
967 {
968         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
969         uint16_t complete_count;
970         int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
971
972         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
973                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
974         if (complete_count) {
975                 free_pkts(p_cpl, complete_count);
976                 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
977         }
978
979 }
980
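/*
 * Synchronously enqueue a single packet into another vhost device's Rx ring;
 * used below to replicate broadcast frames to all other vhost devices.
 */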
981 static __rte_always_inline void
982 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
983             struct rte_mbuf *m)
984 {
985         uint16_t ret;
986
987         if (builtin_net_driver) {
988                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
989         } else {
990                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
991         }
992
993         if (enable_stats) {
994                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
995                                 __ATOMIC_SEQ_CST);
996                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
997                                 __ATOMIC_SEQ_CST);
998                 src_vdev->stats.tx_total++;
999                 src_vdev->stats.tx += ret;
1000         }
1001 }
1002
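/*
 * Flush this lcore's Tx buffer for a vhost device, using the builtin net
 * driver, the async (DMA-accelerated) enqueue path or the plain sync
 * enqueue path, depending on the configuration.
 */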
1003 static __rte_always_inline void
1004 drain_vhost(struct vhost_dev *vdev)
1005 {
1006         uint16_t ret;
1007         uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1008         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1009         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1010
1011         if (builtin_net_driver) {
1012                 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
1013         } else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1014                 uint16_t enqueue_fail = 0;
1015                 int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1016
1017                 complete_async_pkts(vdev);
1018                 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit, dma_id, 0);
1019                 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
1020
1021                 enqueue_fail = nr_xmit - ret;
1022                 if (enqueue_fail)
1023                         free_pkts(&m[ret], nr_xmit - ret);
1024         } else {
1025                 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1026                                                 m, nr_xmit);
1027         }
1028
1029         if (enable_stats) {
1030                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1031                                 __ATOMIC_SEQ_CST);
1032                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1033                                 __ATOMIC_SEQ_CST);
1034         }
1035
1036         if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1037                 free_pkts(m, nr_xmit);
1038 }
1039
1040 static __rte_always_inline void
1041 drain_vhost_table(void)
1042 {
1043         uint16_t lcore_id = rte_lcore_id();
1044         struct vhost_bufftable *vhost_txq;
1045         struct vhost_dev *vdev;
1046         uint64_t cur_tsc;
1047
1048         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1049                 if (unlikely(vdev->remove == 1))
1050                         continue;
1051
1052                 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1053
1054                 cur_tsc = rte_rdtsc();
1055                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
1056                                 > MBUF_TABLE_DRAIN_TSC)) {
1057                         RTE_LOG_DP(DEBUG, VHOST_DATA,
1058                                 "Vhost TX queue drained after timeout with burst size %u\n",
1059                                 vhost_txq->len);
1060                         drain_vhost(vdev);
1061                         vhost_txq->len = 0;
1062                         vhost_txq->pre_tsc = cur_tsc;
1063                 }
1064         }
1065 }
1066
1067 /*
1068  * Check if the packet destination MAC address is for a local device. If so, put
1069  * the packet on that device's Rx queue. If not, return.
1070  */
1071 static __rte_always_inline int
1072 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1073 {
1074         struct rte_ether_hdr *pkt_hdr;
1075         struct vhost_dev *dst_vdev;
1076         struct vhost_bufftable *vhost_txq;
1077         uint16_t lcore_id = rte_lcore_id();
1078         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1079
1080         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1081         if (!dst_vdev)
1082                 return -1;
1083
1084         if (vdev->vid == dst_vdev->vid) {
1085                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1086                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1087                         vdev->vid);
1088                 return 0;
1089         }
1090
1091         RTE_LOG_DP(DEBUG, VHOST_DATA,
1092                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
1093
1094         if (unlikely(dst_vdev->remove)) {
1095                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1096                         "(%d) device is marked for removal\n", dst_vdev->vid);
1097                 return 0;
1098         }
1099
1100         vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1101         vhost_txq->m_table[vhost_txq->len++] = m;
1102
1103         if (enable_stats) {
1104                 vdev->stats.tx_total++;
1105                 vdev->stats.tx++;
1106         }
1107
1108         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1109                 drain_vhost(dst_vdev);
1110                 vhost_txq->len = 0;
1111                 vhost_txq->pre_tsc = rte_rdtsc();
1112         }
1113         return 0;
1114 }
1115
1116 /*
1117  * Check if the destination MAC of a packet belongs to a local VM,
1118  * and if so get its VLAN tag and length offset.
1119  */
1120 static __rte_always_inline int
1121 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1122         uint32_t *offset, uint16_t *vlan_tag)
1123 {
1124         struct vhost_dev *dst_vdev;
1125         struct rte_ether_hdr *pkt_hdr =
1126                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1127
1128         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1129         if (!dst_vdev)
1130                 return 0;
1131
1132         if (vdev->vid == dst_vdev->vid) {
1133                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1134                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1135                         vdev->vid);
1136                 return -1;
1137         }
1138
1139         /*
1140          * HW VLAN strip reduces the packet length by the length of
1141          * the VLAN tag, so the packet length needs to be restored
1142          * by adding it back.
1143          */
1144         *offset  = RTE_VLAN_HLEN;
1145         *vlan_tag = vlan_tags[vdev->vid];
1146
1147         RTE_LOG_DP(DEBUG, VHOST_DATA,
1148                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1149                 vdev->vid, dst_vdev->vid, *vlan_tag);
1150
1151         return 0;
1152 }
1153
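/*
 * Prepare TSO metadata for a packet received with LRO: parse the header
 * lengths, request IP checksum offload for IPv4 and seed the TCP checksum
 * with the pseudo-header checksum required when RTE_MBUF_F_TX_TCP_SEG is set.
 */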
1154 static void virtio_tx_offload(struct rte_mbuf *m)
1155 {
1156         struct rte_net_hdr_lens hdr_lens;
1157         struct rte_ipv4_hdr *ipv4_hdr;
1158         struct rte_tcp_hdr *tcp_hdr;
1159         uint32_t ptype;
1160         void *l3_hdr;
1161
1162         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1163         m->l2_len = hdr_lens.l2_len;
1164         m->l3_len = hdr_lens.l3_len;
1165         m->l4_len = hdr_lens.l4_len;
1166
1167         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1168         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1169                 m->l2_len + m->l3_len);
1170
1171         m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1172         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1173                 m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1174                 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1175                 ipv4_hdr = l3_hdr;
1176                 ipv4_hdr->hdr_checksum = 0;
1177                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1178         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1179                 m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1180                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1181         }
1182 }
1183
1184 static __rte_always_inline void
1185 do_drain_mbuf_table(struct mbuf_table *tx_q)
1186 {
1187         uint16_t count;
1188
1189         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1190                                  tx_q->m_table, tx_q->len);
1191         if (unlikely(count < tx_q->len))
1192                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1193
1194         tx_q->len = 0;
1195 }
1196
1197 /*
1198  * This function routes the TX packet to the correct interface. This
1199  * may be a local device or the physical port.
1200  */
1201 static __rte_always_inline void
1202 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1203 {
1204         struct mbuf_table *tx_q;
1205         unsigned offset = 0;
1206         const uint16_t lcore_id = rte_lcore_id();
1207         struct rte_ether_hdr *nh;
1208
1209
1210         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1211         if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1212                 struct vhost_dev *vdev2;
1213
1214                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1215                         if (vdev2 != vdev)
1216                                 sync_virtio_xmit(vdev2, vdev, m);
1217                 }
1218                 goto queue2nic;
1219         }
1220
1221         /* Check if the destination is a local VM. */
1222         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1223                 return;
1224
1225         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1226                 if (unlikely(find_local_dest(vdev, m, &offset,
1227                                              &vlan_tag) != 0)) {
1228                         rte_pktmbuf_free(m);
1229                         return;
1230                 }
1231         }
1232
1233         RTE_LOG_DP(DEBUG, VHOST_DATA,
1234                 "(%d) TX: MAC address is external\n", vdev->vid);
1235
1236 queue2nic:
1237
1238         /* Add packet to the port Tx queue. */
1239         tx_q = &lcore_tx_queue[lcore_id];
1240
1241         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1242         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1243                 /* Guest has inserted the vlan tag. */
1244                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1245                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1246                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1247                         (vh->vlan_tci != vlan_tag_be))
1248                         vh->vlan_tci = vlan_tag_be;
1249         } else {
1250                 m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1251
1252                 /*
1253                  * Find the right seg to adjust the data len when offset is
1254                  * bigger than tail room size.
1255                  */
1256                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1257                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1258                                 m->data_len += offset;
1259                         else {
1260                                 struct rte_mbuf *seg = m;
1261
1262                                 while ((seg->next != NULL) &&
1263                                         (offset > rte_pktmbuf_tailroom(seg)))
1264                                         seg = seg->next;
1265
1266                                 seg->data_len += offset;
1267                         }
1268                         m->pkt_len += offset;
1269                 }
1270
1271                 m->vlan_tci = vlan_tag;
1272         }
1273
1274         if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1275                 virtio_tx_offload(m);
1276
1277         tx_q->m_table[tx_q->len++] = m;
1278         if (enable_stats) {
1279                 vdev->stats.tx_total++;
1280                 vdev->stats.tx++;
1281         }
1282
1283         if (unlikely(tx_q->len == MAX_PKT_BURST))
1284                 do_drain_mbuf_table(tx_q);
1285 }
1286
1287
1288 static __rte_always_inline void
1289 drain_mbuf_table(struct mbuf_table *tx_q)
1290 {
1291         static uint64_t prev_tsc;
1292         uint64_t cur_tsc;
1293
1294         if (tx_q->len == 0)
1295                 return;
1296
1297         cur_tsc = rte_rdtsc();
1298         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1299                 prev_tsc = cur_tsc;
1300
1301                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1302                         "TX queue drained after timeout with burst size %u\n",
1303                         tx_q->len);
1304                 do_drain_mbuf_table(tx_q);
1305         }
1306 }
1307
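/*
 * Receive a burst from the VMDq Rx queue bound to this vhost device and
 * enqueue the packets into the guest's Rx virtqueue (sync or async path).
 */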
1308 static __rte_always_inline void
1309 drain_eth_rx(struct vhost_dev *vdev)
1310 {
1311         uint16_t rx_count, enqueue_count;
1312         struct rte_mbuf *pkts[MAX_PKT_BURST];
1313
1314         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1315                                     pkts, MAX_PKT_BURST);
1316
1317         if (!rx_count)
1318                 return;
1319
1320         /*
1321          * When "enable_retry" is set, wait and retry when there are not
1322          * enough free slots in the queue to hold @rx_count packets,
1323          * in order to diminish packet loss.
1324          */
1325         if (enable_retry &&
1326             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1327                         VIRTIO_RXQ))) {
1328                 uint32_t retry;
1329
1330                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1331                         rte_delay_us(burst_rx_delay_time);
1332                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1333                                         VIRTIO_RXQ))
1334                                 break;
1335                 }
1336         }
1337
1338         if (builtin_net_driver) {
1339                 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1340                                                 pkts, rx_count);
1341         } else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1342                 uint16_t enqueue_fail = 0;
1343                 int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1344
1345                 complete_async_pkts(vdev);
1346                 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1347                                         VIRTIO_RXQ, pkts, rx_count, dma_id, 0);
1348                 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1349
1350                 enqueue_fail = rx_count - enqueue_count;
1351                 if (enqueue_fail)
1352                         free_pkts(&pkts[enqueue_count], enqueue_fail);
1353
1354         } else {
1355                 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1356                                                 pkts, rx_count);
1357         }
1358
1359         if (enable_stats) {
1360                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1361                                 __ATOMIC_SEQ_CST);
1362                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1363                                 __ATOMIC_SEQ_CST);
1364         }
1365
1366         if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1367                 free_pkts(pkts, rx_count);
1368 }
1369
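/*
 * Dequeue a burst from the guest's Tx virtqueue and route each packet to
 * another vhost device or to the physical port.
 */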
1370 static __rte_always_inline void
1371 drain_virtio_tx(struct vhost_dev *vdev)
1372 {
1373         struct rte_mbuf *pkts[MAX_PKT_BURST];
1374         uint16_t count;
1375         uint16_t i;
1376
1377         if (builtin_net_driver) {
1378                 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1379                                         pkts, MAX_PKT_BURST);
1380         } else {
1381                 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1382                                         mbuf_pool, pkts, MAX_PKT_BURST);
1383         }
1384
1385         /* setup VMDq for the first packet */
1386         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1387                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1388                         free_pkts(pkts, count);
1389         }
1390
1391         for (i = 0; i < count; ++i)
1392                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1393 }
1394
1395 /*
1396  * Main function of vhost-switch. It basically does:
1397  *
1398  * for each vhost device {
1399  *    - drain_eth_rx()
1400  *
1401  *      Which drains the host eth Rx queue linked to the vhost device,
1402  *      and delivers all of the packets to the guest virtio Rx ring
1403  *      associated with this vhost device.
1404  *
1405  *    - drain_virtio_tx()
1406  *
1407  *      Which drains the guest virtio Tx queue and delivers all of the
1408  *      packets to their target, which could be another vhost device or
1409  *      the physical eth dev. The routing is done in "virtio_tx_route".
1410  * }
1411  */
1412 static int
1413 switch_worker(void *arg __rte_unused)
1414 {
1415         unsigned i;
1416         unsigned lcore_id = rte_lcore_id();
1417         struct vhost_dev *vdev;
1418         struct mbuf_table *tx_q;
1419
1420         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1421
1422         tx_q = &lcore_tx_queue[lcore_id];
1423         for (i = 0; i < rte_lcore_count(); i++) {
1424                 if (lcore_ids[i] == lcore_id) {
1425                         tx_q->txq_id = i;
1426                         break;
1427                 }
1428         }
1429
1430         while(1) {
1431                 drain_mbuf_table(tx_q);
1432                 drain_vhost_table();
1433                 /*
1434                  * Inform the configuration core that we have exited the
1435                  * linked list and that no devices are in use if requested.
1436                  */
1437                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1438                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1439
1440                 /*
1441                  * Process vhost devices
1442                  */
1443                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1444                               lcore_vdev_entry) {
1445                         if (unlikely(vdev->remove)) {
1446                                 unlink_vmdq(vdev);
1447                                 vdev->ready = DEVICE_SAFE_REMOVE;
1448                                 continue;
1449                         }
1450
1451                         if (likely(vdev->ready == DEVICE_RX))
1452                                 drain_eth_rx(vdev);
1453
1454                         if (likely(!vdev->remove))
1455                                 drain_virtio_tx(vdev);
1456                 }
1457         }
1458
1459         return 0;
1460 }
1461
1462 /*
1463  * Remove a device from the specific data core linked list and from the
1464  * main linked list. Synchronization occurs through the use of the
1465  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1466  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1467  */
1468 static void
1469 destroy_device(int vid)
1470 {
1471         struct vhost_dev *vdev = NULL;
1472         int lcore;
1473         uint16_t i;
1474
1475         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1476                 if (vdev->vid == vid)
1477                         break;
1478         }
1479         if (!vdev)
1480                 return;
	/* Set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}
1486
1487         for (i = 0; i < RTE_MAX_LCORE; i++)
1488                 rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1489
1490         if (builtin_net_driver)
1491                 vs_vhost_net_remove(vdev);
1492
1493         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1494                      lcore_vdev_entry);
1495         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1496
1498         /* Set the dev_removal_flag on each lcore. */
1499         RTE_LCORE_FOREACH_WORKER(lcore)
1500                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1501
1502         /*
1503          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1504          * we can be sure that they can no longer access the device removed
1505          * from the linked lists and that the devices are no longer in use.
1506          */
1507         RTE_LCORE_FOREACH_WORKER(lcore) {
1508                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1509                         rte_pause();
1510         }
1511
1512         lcore_info[vdev->coreid].device_num--;
1513
1514         RTE_LOG(INFO, VHOST_DATA,
1515                 "(%d) device has been removed from data core\n",
1516                 vdev->vid);
1517
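	/*
	 * If async (DMA-accelerated) enqueue was enabled on the Rx queue,
	 * wait until all in-flight packets have completed and been freed
	 * before unregistering the async channel.
	 */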
1518         if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1519                 uint16_t n_pkt = 0;
1520                 int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1521                 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1522
1523                 while (vdev->pkts_inflight) {
1524                         n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1525                                                 m_cpl, vdev->pkts_inflight, dma_id, 0);
1526                         free_pkts(m_cpl, n_pkt);
1527                         __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1528                 }
1529
1530                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1531                 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1532         }
1533
1534         rte_free(vdev);
1535 }
1536
1537 /*
1538  * A new device is added to a data core. First the device is added to the main linked list
1539  * and then allocated to a specific data core.
1540  */
1541 static int
1542 new_device(int vid)
1543 {
1544         int lcore, core_add = 0;
1545         uint16_t i;
1546         uint32_t device_num_min = num_devices;
1547         struct vhost_dev *vdev;
1548         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1549         if (vdev == NULL) {
1550                 RTE_LOG(INFO, VHOST_DATA,
1551                         "(%d) couldn't allocate memory for vhost dev\n",
1552                         vid);
1553                 return -1;
1554         }
1555         vdev->vid = vid;
1556
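	/* Allocate a per-lcore TX buffer table entry for this device. */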
1557         for (i = 0; i < RTE_MAX_LCORE; i++) {
1558                 vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1559                         = rte_zmalloc("vhost bufftable",
1560                                 sizeof(struct vhost_bufftable),
1561                                 RTE_CACHE_LINE_SIZE);
1562
		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
			  "(%d) couldn't allocate memory for vhost TX\n", vid);
			/* Free what has been allocated so far on failure. */
			while (i--)
				rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
			rte_free(vdev);
			return -1;
		}
1568         }
1569
1570         if (builtin_net_driver)
1571                 vs_vhost_net_setup(vdev);
1572
1573         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1574         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1575
	/* reset ready flag */
1577         vdev->ready = DEVICE_MAC_LEARNING;
1578         vdev->remove = 0;
1579
1580         /* Find a suitable lcore to add the device. */
1581         RTE_LCORE_FOREACH_WORKER(lcore) {
1582                 if (lcore_info[lcore].device_num < device_num_min) {
1583                         device_num_min = lcore_info[lcore].device_num;
1584                         core_add = lcore;
1585                 }
1586         }
1587         vdev->coreid = core_add;
1588
1589         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1590                           lcore_vdev_entry);
1591         lcore_info[vdev->coreid].device_num++;
1592
1593         /* Disable notifications. */
1594         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1595         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1596
1597         RTE_LOG(INFO, VHOST_DATA,
1598                 "(%d) device has been added to data core %d\n",
1599                 vid, vdev->coreid);
1600
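	/*
	 * If a DMA device was bound to this vhost device, register an
	 * async channel on the Rx queue so enqueue copies can be
	 * offloaded to that DMA device.
	 */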
1601         if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1602                 int ret;
1603
1604                 ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1605                 if (ret == 0)
1606                         dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = true;
1607                 return ret;
1608         }
1609
1610         return 0;
1611 }
1612
1613 static int
1614 vring_state_changed(int vid, uint16_t queue_id, int enable)
1615 {
1616         struct vhost_dev *vdev = NULL;
1617
1618         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1619                 if (vdev->vid == vid)
1620                         break;
1621         }
1622         if (!vdev)
1623                 return -1;
1624
1625         if (queue_id != VIRTIO_RXQ)
1626                 return 0;
1627
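	/*
	 * When an async-enabled Rx queue is disabled, drain and free any
	 * packets whose DMA copies are still in flight.
	 */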
1628         if (dma_bind[vid].dmas[queue_id].async_enabled) {
1629                 if (!enable) {
1630                         uint16_t n_pkt = 0;
1631                         int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1632                         struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1633
1634                         while (vdev->pkts_inflight) {
1635                                 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1636                                                         m_cpl, vdev->pkts_inflight, dma_id, 0);
1637                                 free_pkts(m_cpl, n_pkt);
1638                                 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1639                         }
1640                 }
1641         }
1642
1643         return 0;
1644 }
1645
/*
 * These callbacks allow devices to be added to the data core when the
 * configuration has been fully completed.
 */
1650 static const struct rte_vhost_device_ops virtio_net_device_ops =
1651 {
1652         .new_device =  new_device,
1653         .destroy_device = destroy_device,
1654         .vring_state_changed = vring_state_changed,
1655 };
1656
/*
 * This thread wakes up periodically to print statistics if the user
 * has enabled them.
 */
1661 static void *
1662 print_stats(__rte_unused void *arg)
1663 {
1664         struct vhost_dev *vdev;
1665         uint64_t tx_dropped, rx_dropped;
1666         uint64_t tx, tx_total, rx, rx_total;
1667         const char clr[] = { 27, '[', '2', 'J', '\0' };
1668         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1669
	while (1) {
1671                 sleep(enable_stats);
1672
1673                 /* Clear screen and move to top left */
1674                 printf("%s%s\n", clr, top_left);
1675                 printf("Device statistics =================================\n");
1676
1677                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1678                         tx_total   = vdev->stats.tx_total;
1679                         tx         = vdev->stats.tx;
1680                         tx_dropped = tx_total - tx;
1681
1682                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1683                                 __ATOMIC_SEQ_CST);
1684                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1685                                 __ATOMIC_SEQ_CST);
1686                         rx_dropped = rx_total - rx;
1687
1688                         printf("Statistics for device %d\n"
1689                                 "-----------------------\n"
1690                                 "TX total:              %" PRIu64 "\n"
1691                                 "TX dropped:            %" PRIu64 "\n"
1692                                 "TX successful:         %" PRIu64 "\n"
1693                                 "RX total:              %" PRIu64 "\n"
1694                                 "RX dropped:            %" PRIu64 "\n"
1695                                 "RX successful:         %" PRIu64 "\n",
1696                                 vdev->vid,
1697                                 tx_total, tx_dropped, tx,
1698                                 rx_total, rx_dropped, rx);
1699                 }
1700
1701                 printf("===================================================\n");
1702
1703                 fflush(stdout);
1704         }
1705
1706         return NULL;
1707 }
1708
1709 static void
1710 unregister_drivers(int socket_num)
1711 {
1712         int i, ret;
1713
1714         for (i = 0; i < socket_num; i++) {
1715                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1716                 if (ret != 0)
1717                         RTE_LOG(ERR, VHOST_CONFIG,
				"Failed to unregister vhost driver for %s.\n",
1719                                 socket_files + i * PATH_MAX);
1720         }
1721 }
1722
/* When we receive a SIGINT signal, unregister the vhost driver. */
1724 static void
1725 sigint_handler(__rte_unused int signum)
1726 {
1727         /* Unregister vhost driver. */
1728         unregister_drivers(nb_sockets);
1729
1730         exit(0);
1731 }
1732
/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. Here are some guidelines:
 *
 * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
 *
 * - For each switch core (a CPU core that does the packet switching),
 *   we also need to reserve some mbufs for receiving the packets from
 *   the virtio Tx queue. How many are enough depends on the usage;
 *   it is normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure that, for each switch core, we have
 *   allocated enough mbufs to fill up the mbuf cache.
 */
1755 static void
1756 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1757         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1758 {
1759         uint32_t nr_mbufs;
1760         uint32_t nr_mbufs_per_core;
1761         uint32_t mtu = 1500;
1762
1763         if (mergeable)
1764                 mtu = 9000;
1765         if (enable_tso)
1766                 mtu = 64 * 1024;
1767
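	/*
	 * Per-core reservation for packets dequeued from the virtio Tx
	 * queues. Illustrative example with assumed numbers (a 2176-byte
	 * mbuf data size, 128-byte headroom, 9000-byte MTU for mergeable
	 * buffers and MAX_PKT_BURST taken to be 32):
	 * (9000 + 2176) * 32 / (2176 - 128) is about 174 extra mbufs per
	 * switch core, before adding @nr_rx_desc and the cache size below.
	 */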
1768         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1769                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1770         nr_mbufs_per_core += nr_rx_desc;
1771         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1772
1773         nr_mbufs  = nr_queues * nr_rx_desc;
1774         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1775         nr_mbufs *= nr_port;
1776
1777         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1778                                             nr_mbuf_cache, 0, mbuf_size,
1779                                             rte_socket_id());
1780         if (mbuf_pool == NULL)
1781                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1782 }
1783
1784 static void
1785 reset_dma(void)
1786 {
1787         int i;
1788
1789         for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1790                 int j;
1791
1792                 for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1793                         dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1794                         dma_bind[i].dmas[j].async_enabled = false;
1795                 }
1796         }
1797
1798         for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1799                 dmas_id[i] = INVALID_DMA_ID;
1800 }
1801
1802 /*
1803  * Main function, does initialisation and calls the per-lcore functions.
1804  */
1805 int
1806 main(int argc, char *argv[])
1807 {
1808         unsigned lcore_id, core_id = 0;
1809         unsigned nb_ports, valid_num_ports;
1810         int ret, i;
1811         uint16_t portid;
1812         static pthread_t tid;
1813         uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1814
1815         signal(SIGINT, sigint_handler);
1816
1817         /* init EAL */
1818         ret = rte_eal_init(argc, argv);
1819         if (ret < 0)
1820                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1821         argc -= ret;
1822         argv += ret;
1823
1824         /* initialize dma structures */
1825         reset_dma();
1826
1827         /* parse app arguments */
1828         ret = us_vhost_parse_args(argc, argv);
1829         if (ret < 0)
1830                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1831
1832         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1833                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1834
1835                 if (rte_lcore_is_enabled(lcore_id))
1836                         lcore_ids[core_id++] = lcore_id;
1837         }
1838
1839         if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1841
1842         /* Get the number of physical ports. */
1843         nb_ports = rte_eth_dev_count_avail();
1844
	/*
	 * Update the global var NUM_PORTS and global array PORTS,
	 * and get the value of VALID_NUM_PORTS according to the
	 * number of ports in the system.
	 */
1849         valid_num_ports = check_ports_num(nb_ports);
1850
	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
1854                 return -1;
1855         }
1856
1857         /*
1858          * FIXME: here we are trying to allocate mbufs big enough for
1859          * @MAX_QUEUES, but the truth is we're never going to use that
1860          * many queues here. We probably should only do allocation for
1861          * those queues we are going to use.
1862          */
1863         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1864                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1865
1866         if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loopback so the NIC's L2 switch handles VM2VM forwarding. */
1868                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1869                 RTE_LOG(DEBUG, VHOST_CONFIG,
1870                         "Enable loop back for L2 switch in vmdq.\n");
1871         }
1872
1873         /* initialize all ports */
1874         RTE_ETH_FOREACH_DEV(portid) {
1875                 /* skip ports that are not enabled */
1876                 if ((enabled_port_mask & (1 << portid)) == 0) {
1877                         RTE_LOG(INFO, VHOST_PORT,
1878                                 "Skipping disabled port %d\n", portid);
1879                         continue;
1880                 }
1881                 if (port_init(portid) != 0)
1882                         rte_exit(EXIT_FAILURE,
1883                                 "Cannot initialize network ports\n");
1884         }
1885
1886         /* Enable stats if the user option is set. */
1887         if (enable_stats) {
1888                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1889                                         print_stats, NULL);
1890                 if (ret < 0)
1891                         rte_exit(EXIT_FAILURE,
1892                                 "Cannot create print-stats thread\n");
1893         }
1894
1895         /* Launch all data cores. */
1896         RTE_LCORE_FOREACH_WORKER(lcore_id)
1897                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1898
1899         if (client_mode)
1900                 flags |= RTE_VHOST_USER_CLIENT;
1901
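	/* Configure each DMA device used for the vhost async data path (vchan 0). */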
1902         for (i = 0; i < dma_count; i++) {
1903                 if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1904                         RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
1905                         rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
1906                 }
1907         }
1908
1909         /* Register vhost user driver to handle vhost messages. */
1910         for (i = 0; i < nb_sockets; i++) {
1911                 char *file = socket_files + i * PATH_MAX;
1912
1913                 if (dma_count)
1914                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1915
1916                 ret = rte_vhost_driver_register(file, flags);
1917                 if (ret != 0) {
1918                         unregister_drivers(i);
1919                         rte_exit(EXIT_FAILURE,
1920                                 "vhost driver register failure.\n");
1921                 }
1922
1923                 if (builtin_net_driver)
1924                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1925
1926                 if (mergeable == 0) {
1927                         rte_vhost_driver_disable_features(file,
1928                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1929                 }
1930
1931                 if (enable_tx_csum == 0) {
1932                         rte_vhost_driver_disable_features(file,
1933                                 1ULL << VIRTIO_NET_F_CSUM);
1934                 }
1935
1936                 if (enable_tso == 0) {
1937                         rte_vhost_driver_disable_features(file,
1938                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
1939                         rte_vhost_driver_disable_features(file,
1940                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
1941                         rte_vhost_driver_disable_features(file,
1942                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1943                         rte_vhost_driver_disable_features(file,
1944                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1945                 }
1946
1947                 if (promiscuous) {
1948                         rte_vhost_driver_enable_features(file,
1949                                 1ULL << VIRTIO_NET_F_CTRL_RX);
1950                 }
1951
1952                 ret = rte_vhost_driver_callback_register(file,
1953                         &virtio_net_device_ops);
1954                 if (ret != 0) {
1955                         rte_exit(EXIT_FAILURE,
1956                                 "failed to register vhost driver callbacks.\n");
1957                 }
1958
1959                 if (rte_vhost_driver_start(file) < 0) {
1960                         rte_exit(EXIT_FAILURE,
1961                                 "failed to start vhost driver.\n");
1962                 }
1963         }
1964
1965         RTE_LCORE_FOREACH_WORKER(lcore_id)
1966                 rte_eal_wait_lcore(lcore_id);
1967
1968         /* clean up the EAL */
1969         rte_eal_cleanup();
1970
1971         return 0;
1972 }