examples/vhost: support async dequeue data path
examples/vhost/main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
29
30 #include "main.h"
31
32 #ifndef MAX_QUEUES
33 #define MAX_QUEUES 128
34 #endif
35
36 #define NUM_MBUFS_DEFAULT 0x24000
37
38 /* the maximum number of external ports supported */
39 #define MAX_SUP_PORTS 1
40
41 #define MBUF_CACHE_SIZE 128
42 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
43
44 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
45
46 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
47 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
48
49 #define JUMBO_FRAME_MAX_SIZE    0x2600
50 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
51
52 /* State of virtio device. */
53 #define DEVICE_MAC_LEARNING 0
54 #define DEVICE_RX                       1
55 #define DEVICE_SAFE_REMOVE      2
56
57 /* Configurable number of RX/TX ring descriptors */
58 #define RTE_TEST_RX_DESC_DEFAULT 1024
59 #define RTE_TEST_TX_DESC_DEFAULT 512
60
61 #define INVALID_PORT_ID 0xFF
62 #define INVALID_DMA_ID -1
63
64 #define DMA_RING_SIZE 4096
65
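/*
 * Per-socket async flags: open_dma() ORs these into dma_bind[socketid].async_flag
 * to record whether the enqueue (guest Rx) and/or dequeue (guest Tx) path of
 * that socket is bound to a DMA device.
 */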
66 #define ASYNC_ENQUEUE_VHOST 1
67 #define ASYNC_DEQUEUE_VHOST 2
68
69 /* number of mbufs in all pools - if specified on command-line. */
70 static int total_num_mbufs = NUM_MBUFS_DEFAULT;
71
72 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
73 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
74 static int dma_count;
75
76 /* mask of enabled ports */
77 static uint32_t enabled_port_mask = 0;
78
79 /* Promiscuous mode */
80 static uint32_t promiscuous;
81
82 /* Number of devices/queues to support */
83 static uint32_t num_queues = 0;
84 static uint32_t num_devices;
85
86 static struct rte_mempool *mbuf_pool;
87 static int mergeable;
88
89 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
90 typedef enum {
91         VM2VM_DISABLED = 0,
92         VM2VM_SOFTWARE = 1,
93         VM2VM_HARDWARE = 2,
94         VM2VM_LAST
95 } vm2vm_type;
96 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
97
98 /* Enable stats. */
99 static uint32_t enable_stats = 0;
100 /* Enable retries on RX. */
101 static uint32_t enable_retry = 1;
102
103 /* Disable TX checksum offload */
104 static uint32_t enable_tx_csum;
105
106 /* Disable TSO offload */
107 static uint32_t enable_tso;
108
109 static int client_mode;
110
111 static int builtin_net_driver;
112
113 /* Specify the timeout (in microseconds) between retries on Rx. */
114 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
115 /* Specify the number of retries on RX. */
116 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
117
118 /* Socket file paths. Can be set by user */
119 static char *socket_files;
120 static int nb_sockets;
121
122 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
123
124 /* empty VMDq configuration structure. Filled in programmatically */
125 static struct rte_eth_conf vmdq_conf_default = {
126         .rxmode = {
127                 .mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
128                 .split_hdr_size = 0,
129                 /*
130                  * VLAN stripping is necessary for 1G NICs such as the I350;
131                  * it fixes a bug where IPv4 forwarding in the guest cannot
132                  * forward packets from one virtio device to another.
133                  */
134                 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
135         },
136
137         .txmode = {
138                 .mq_mode = RTE_ETH_MQ_TX_NONE,
139                 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
140                              RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
141                              RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
142                              RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
143                              RTE_ETH_TX_OFFLOAD_TCP_TSO),
144         },
145         .rx_adv_conf = {
146                 /*
147                  * should be overridden separately in code with
148                  * appropriate values
149                  */
150                 .vmdq_rx_conf = {
151                         .nb_queue_pools = RTE_ETH_8_POOLS,
152                         .enable_default_pool = 0,
153                         .default_pool = 0,
154                         .nb_pool_maps = 0,
155                         .pool_map = {{0, 0},},
156                 },
157         },
158 };
159
160
161 static unsigned lcore_ids[RTE_MAX_LCORE];
162 static uint16_t ports[RTE_MAX_ETHPORTS];
163 static unsigned num_ports = 0; /**< The number of ports specified in command line */
164 static uint16_t num_pf_queues, num_vmdq_queues;
165 static uint16_t vmdq_pool_base, vmdq_queue_base;
166 static uint16_t queues_per_pool;
167
168 const uint16_t vlan_tags[] = {
169         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
170         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
171         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
172         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
173         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
174         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
175         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
176         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
177 };
178
179 /* ethernet addresses of ports */
180 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
181
182 static struct vhost_dev_tailq_list vhost_dev_list =
183         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
184
185 static struct lcore_info lcore_info[RTE_MAX_LCORE];
186
187 /* Used for queueing bursts of TX packets. */
188 struct mbuf_table {
189         unsigned len;
190         unsigned txq_id;
191         struct rte_mbuf *m_table[MAX_PKT_BURST];
192 };
193
194 struct vhost_bufftable {
195         uint32_t len;
196         uint64_t pre_tsc;
197         struct rte_mbuf *m_table[MAX_PKT_BURST];
198 };
199
200 /* TX queue for each data core. */
201 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
202
203 /*
204  * Vhost TX buffer for each data core.
205  * Every data core maintains a TX buffer for every vhost device,
206  * which is used to batch packet enqueues for higher performance.
207  */
208 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
209
210 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
211                                  / US_PER_S * BURST_TX_DRAIN_US)
212
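/*
 * Maps a vhost device ID (vid) to the index of its socket file, so the
 * per-socket DMA bindings in dma_bind[] can be looked up from a vid.
 */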
213 static int vid2socketid[RTE_MAX_VHOST_DEVICE];
214
215 static inline uint32_t
216 get_async_flag_by_socketid(int socketid)
217 {
218         return dma_bind[socketid].async_flag;
219 }
220
221 static inline void
222 init_vid2socketid_array(int vid, int socketid)
223 {
224         vid2socketid[vid] = socketid;
225 }
226
227 static inline bool
228 is_dma_configured(int16_t dev_id)
229 {
230         int i;
231
232         for (i = 0; i < dma_count; i++)
233                 if (dmas_id[i] == dev_id)
234                         return true;
235         return false;
236 }
237
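/*
 * Parse the --dmas argument and set up the DMA devices it names.
 * Sketch of the format implied by the parsing below (device names are
 * resolved with rte_dma_get_dev_id_by_name()):
 *   --dmas [txd0@dma_name0,rxd0@dma_name1,txd1@dma_name2]
 * where txdN binds the enqueue path and rxdN the dequeue path of socket N.
 */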
238 static inline int
239 open_dma(const char *value)
240 {
241         struct dma_for_vhost *dma_info = dma_bind;
242         char *input = strndup(value, strlen(value) + 1);
243         char *addrs = input;
244         char *ptrs[2];
245         char *start, *end, *substr;
246         int64_t socketid, vring_id;
247
248         struct rte_dma_info info;
249         struct rte_dma_conf dev_config = { .nb_vchans = 1 };
250         struct rte_dma_vchan_conf qconf = {
251                 .direction = RTE_DMA_DIR_MEM_TO_MEM,
252                 .nb_desc = DMA_RING_SIZE
253         };
254
255         int dev_id;
256         int ret = 0;
257         uint16_t i = 0;
258         char *dma_arg[RTE_MAX_VHOST_DEVICE];
259         int args_nr;
260
261         while (isblank(*addrs))
262                 addrs++;
263         if (*addrs == '\0') {
264                 ret = -1;
265                 goto out;
266         }
267
268         /* Process the DMA devices listed within the brackets. */
269         addrs++;
270         substr = strtok(addrs, ";]");
271         if (!substr) {
272                 ret = -1;
273                 goto out;
274         }
275
276         args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
277         if (args_nr <= 0) {
278                 ret = -1;
279                 goto out;
280         }
281
282         while (i < args_nr) {
283                 char *arg_temp = dma_arg[i];
284                 char *txd, *rxd;
285                 uint8_t sub_nr;
286                 int async_flag;
287
288                 sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
289                 if (sub_nr != 2) {
290                         ret = -1;
291                         goto out;
292                 }
293
294                 txd = strstr(ptrs[0], "txd");
295                 rxd = strstr(ptrs[0], "rxd");
296                 if (txd) {
297                         start = txd;
298                         vring_id = VIRTIO_RXQ;
299                         async_flag = ASYNC_ENQUEUE_VHOST;
300                 } else if (rxd) {
301                         start = rxd;
302                         vring_id = VIRTIO_TXQ;
303                         async_flag = ASYNC_DEQUEUE_VHOST;
304                 } else {
305                         ret = -1;
306                         goto out;
307                 }
308
309                 start += 3;
310                 socketid = strtol(start, &end, 0);
311                 if (end == start) {
312                         ret = -1;
313                         goto out;
314                 }
315
316                 dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
317                 if (dev_id < 0) {
318                         RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
319                         ret = -1;
320                         goto out;
321                 }
322
323                 /* DMA device is already configured, so skip */
324                 if (is_dma_configured(dev_id))
325                         goto done;
326
327                 if (rte_dma_info_get(dev_id, &info) != 0) {
328                         RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
329                         ret = -1;
330                         goto out;
331                 }
332
333                 if (info.max_vchans < 1) {
334                         RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
335                         ret = -1;
336                         goto out;
337                 }
338
339                 if (rte_dma_configure(dev_id, &dev_config) != 0) {
340                         RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
341                         ret = -1;
342                         goto out;
343                 }
344
345                 /* Check the max desc supported by DMA device */
346                 rte_dma_info_get(dev_id, &info);
347                 if (info.nb_vchans != 1) {
348                         RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
349                                         dev_id);
350                         ret = -1;
351                         goto out;
352                 }
353
354                 qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
355
356                 if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
357                         RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
358                         ret = -1;
359                         goto out;
360                 }
361
362                 if (rte_dma_start(dev_id) != 0) {
363                         RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
364                         ret = -1;
365                         goto out;
366                 }
367
368                 dmas_id[dma_count++] = dev_id;
369
370 done:
371                 (dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
372                 (dma_info + socketid)->async_flag |= async_flag;
373                 i++;
374         }
375 out:
376         free(input);
377         return ret;
378 }
379
380 /*
381  * Builds up the correct configuration for VMDQ VLAN pool map
382  * according to the pool & queue limits.
383  */
384 static inline int
385 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
386 {
387         struct rte_eth_vmdq_rx_conf conf;
388         struct rte_eth_vmdq_rx_conf *def_conf =
389                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
390         unsigned i;
391
392         memset(&conf, 0, sizeof(conf));
393         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
394         conf.nb_pool_maps = num_devices;
395         conf.enable_loop_back = def_conf->enable_loop_back;
396         conf.rx_mode = def_conf->rx_mode;
397
398         for (i = 0; i < conf.nb_pool_maps; i++) {
399                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
400                 conf.pool_map[i].pools = (1UL << i);
401         }
402
403         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
404         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
405                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
406         return 0;
407 }
408
409 /*
410  * Initialises a given port using global settings, with the Rx buffers
411  * coming from the global mbuf_pool.
412  */
413 static inline int
414 port_init(uint16_t port)
415 {
416         struct rte_eth_dev_info dev_info;
417         struct rte_eth_conf port_conf;
418         struct rte_eth_rxconf *rxconf;
419         struct rte_eth_txconf *txconf;
420         int16_t rx_rings, tx_rings;
421         uint16_t rx_ring_size, tx_ring_size;
422         int retval;
423         uint16_t q;
424
425         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
426         retval = rte_eth_dev_info_get(port, &dev_info);
427         if (retval != 0) {
428                 RTE_LOG(ERR, VHOST_PORT,
429                         "Error during getting device (port %u) info: %s\n",
430                         port, strerror(-retval));
431
432                 return retval;
433         }
434
435         rxconf = &dev_info.default_rxconf;
436         txconf = &dev_info.default_txconf;
437         rxconf->rx_drop_en = 1;
438
439         /* Configure the number of supported virtio devices based on VMDQ limits */
440         num_devices = dev_info.max_vmdq_pools;
441
442         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
443         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
444
445         tx_rings = (uint16_t)rte_lcore_count();
446
447         if (mergeable) {
448                 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
449                         vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
450                 else
451                         vmdq_conf_default.rxmode.mtu = MAX_MTU;
452         }
453
454         /* Get port configuration. */
455         retval = get_eth_conf(&port_conf, num_devices);
456         if (retval < 0)
457                 return retval;
458         /* NIC queues are divided into PF queues and VMDQ queues. */
459         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
460         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
461         num_vmdq_queues = num_devices * queues_per_pool;
462         num_queues = num_pf_queues + num_vmdq_queues;
463         vmdq_queue_base = dev_info.vmdq_queue_base;
464         vmdq_pool_base  = dev_info.vmdq_pool_base;
465         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
466                 num_pf_queues, num_devices, queues_per_pool);
467
468         if (!rte_eth_dev_is_valid_port(port))
469                 return -1;
470
471         rx_rings = (uint16_t)dev_info.max_rx_queues;
472         if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
473                 port_conf.txmode.offloads |=
474                         RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
475         /* Configure ethernet device. */
476         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
477         if (retval != 0) {
478                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
479                         port, strerror(-retval));
480                 return retval;
481         }
482
483         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
484                 &tx_ring_size);
485         if (retval != 0) {
486                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
487                         "for port %u: %s.\n", port, strerror(-retval));
488                 return retval;
489         }
490         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
491                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
492                         "for Rx queues on port %u.\n", port);
493                 return -1;
494         }
495
496         /* Setup the queues. */
497         rxconf->offloads = port_conf.rxmode.offloads;
498         for (q = 0; q < rx_rings; q ++) {
499                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
500                                                 rte_eth_dev_socket_id(port),
501                                                 rxconf,
502                                                 mbuf_pool);
503                 if (retval < 0) {
504                         RTE_LOG(ERR, VHOST_PORT,
505                                 "Failed to setup rx queue %u of port %u: %s.\n",
506                                 q, port, strerror(-retval));
507                         return retval;
508                 }
509         }
510         txconf->offloads = port_conf.txmode.offloads;
511         for (q = 0; q < tx_rings; q ++) {
512                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
513                                                 rte_eth_dev_socket_id(port),
514                                                 txconf);
515                 if (retval < 0) {
516                         RTE_LOG(ERR, VHOST_PORT,
517                                 "Failed to setup tx queue %u of port %u: %s.\n",
518                                 q, port, strerror(-retval));
519                         return retval;
520                 }
521         }
522
523         /* Start the device. */
524         retval  = rte_eth_dev_start(port);
525         if (retval < 0) {
526                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
527                         port, strerror(-retval));
528                 return retval;
529         }
530
531         if (promiscuous) {
532                 retval = rte_eth_promiscuous_enable(port);
533                 if (retval != 0) {
534                         RTE_LOG(ERR, VHOST_PORT,
535                                 "Failed to enable promiscuous mode on port %u: %s\n",
536                                 port, rte_strerror(-retval));
537                         return retval;
538                 }
539         }
540
541         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
542         if (retval < 0) {
543                 RTE_LOG(ERR, VHOST_PORT,
544                         "Failed to get MAC address on port %u: %s\n",
545                         port, rte_strerror(-retval));
546                 return retval;
547         }
548
549         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
550         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
551                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
552                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
553
554         return 0;
555 }
556
557 /*
558  * Set socket file path.
559  */
560 static int
561 us_vhost_parse_socket_path(const char *q_arg)
562 {
563         char *old;
564
565         /* reject socket paths that do not fit in PATH_MAX */
566         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
567                 return -1;
568
569         old = socket_files;
570         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
571         if (socket_files == NULL) {
572                 free(old);
573                 return -1;
574         }
575
576         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
577         nb_sockets++;
578
579         return 0;
580 }
581
582 /*
583  * Parse the portmask provided at run time.
584  */
585 static int
586 parse_portmask(const char *portmask)
587 {
588         char *end = NULL;
589         unsigned long pm;
590
591         errno = 0;
592
593         /* parse hexadecimal string */
594         pm = strtoul(portmask, &end, 16);
595         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
596                 return 0;
597
598         return pm;
599
600 }
601
602 /*
603  * Parse numeric options at run time.
604  */
605 static int
606 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
607 {
608         char *end = NULL;
609         unsigned long num;
610
611         errno = 0;
612
613         /* parse unsigned int string */
614         num = strtoul(q_arg, &end, 10);
615         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
616                 return -1;
617
618         if (num > max_valid_value)
619                 return -1;
620
621         return num;
622
623 }
624
625 /*
626  * Display usage
627  */
628 static void
629 us_vhost_usage(const char *prgname)
630 {
631         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
632         "               --vm2vm [0|1|2]\n"
633         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
634         "               --socket-file <path>\n"
635         "               --nb-devices ND\n"
636         "               -p PORTMASK: Set mask for ports to be used by application\n"
637         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
638         "               --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
639         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Only takes effect if Rx retries are enabled\n"
640         "               --rx-retry-num [0-N]: the number of retries on Rx. Only takes effect if Rx retries are enabled\n"
641         "               --mergeable [0|1]: disable(default)/enable Rx mergeable buffers\n"
642         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
643         "               --socket-file: The path of the vhost-user socket file.\n"
644         "               --tx-csum [0|1] disable/enable Tx checksum offload.\n"
645         "               --tso [0|1] disable/enable TCP segmentation offload.\n"
646         "               --client register a vhost-user socket in client mode.\n"
647         "               --dmas register DMA channels for specific vhost devices.\n"
648         "               --total-num-mbufs [0-N] set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n",
649                prgname);
650 }
651
652 enum {
653 #define OPT_VM2VM               "vm2vm"
654         OPT_VM2VM_NUM = 256,
655 #define OPT_RX_RETRY            "rx-retry"
656         OPT_RX_RETRY_NUM,
657 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
658         OPT_RX_RETRY_DELAY_NUM,
659 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
660         OPT_RX_RETRY_NUMB_NUM,
661 #define OPT_MERGEABLE           "mergeable"
662         OPT_MERGEABLE_NUM,
663 #define OPT_STATS               "stats"
664         OPT_STATS_NUM,
665 #define OPT_SOCKET_FILE         "socket-file"
666         OPT_SOCKET_FILE_NUM,
667 #define OPT_TX_CSUM             "tx-csum"
668         OPT_TX_CSUM_NUM,
669 #define OPT_TSO                 "tso"
670         OPT_TSO_NUM,
671 #define OPT_CLIENT              "client"
672         OPT_CLIENT_NUM,
673 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
674         OPT_BUILTIN_NET_DRIVER_NUM,
675 #define OPT_DMAS                "dmas"
676         OPT_DMAS_NUM,
677 #define OPT_NUM_MBUFS           "total-num-mbufs"
678         OPT_NUM_MBUFS_NUM,
679 };
680
681 /*
682  * Parse the arguments given in the command line of the application.
683  */
684 static int
685 us_vhost_parse_args(int argc, char **argv)
686 {
687         int opt, ret;
688         int option_index;
689         unsigned i;
690         const char *prgname = argv[0];
691         static struct option long_option[] = {
692                 {OPT_VM2VM, required_argument,
693                                 NULL, OPT_VM2VM_NUM},
694                 {OPT_RX_RETRY, required_argument,
695                                 NULL, OPT_RX_RETRY_NUM},
696                 {OPT_RX_RETRY_DELAY, required_argument,
697                                 NULL, OPT_RX_RETRY_DELAY_NUM},
698                 {OPT_RX_RETRY_NUMB, required_argument,
699                                 NULL, OPT_RX_RETRY_NUMB_NUM},
700                 {OPT_MERGEABLE, required_argument,
701                                 NULL, OPT_MERGEABLE_NUM},
702                 {OPT_STATS, required_argument,
703                                 NULL, OPT_STATS_NUM},
704                 {OPT_SOCKET_FILE, required_argument,
705                                 NULL, OPT_SOCKET_FILE_NUM},
706                 {OPT_TX_CSUM, required_argument,
707                                 NULL, OPT_TX_CSUM_NUM},
708                 {OPT_TSO, required_argument,
709                                 NULL, OPT_TSO_NUM},
710                 {OPT_CLIENT, no_argument,
711                                 NULL, OPT_CLIENT_NUM},
712                 {OPT_BUILTIN_NET_DRIVER, no_argument,
713                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
714                 {OPT_DMAS, required_argument,
715                                 NULL, OPT_DMAS_NUM},
716                 {OPT_NUM_MBUFS, required_argument,
717                                 NULL, OPT_NUM_MBUFS_NUM},
718                 {NULL, 0, 0, 0},
719         };
720
721         /* Parse command line */
722         while ((opt = getopt_long(argc, argv, "p:P",
723                         long_option, &option_index)) != EOF) {
724                 switch (opt) {
725                 /* Portmask */
726                 case 'p':
727                         enabled_port_mask = parse_portmask(optarg);
728                         if (enabled_port_mask == 0) {
729                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
730                                 us_vhost_usage(prgname);
731                                 return -1;
732                         }
733                         break;
734
735                 case 'P':
736                         promiscuous = 1;
737                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
738                                 RTE_ETH_VMDQ_ACCEPT_BROADCAST |
739                                 RTE_ETH_VMDQ_ACCEPT_MULTICAST;
740                         break;
741
742                 case OPT_VM2VM_NUM:
743                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
744                         if (ret == -1) {
745                                 RTE_LOG(INFO, VHOST_CONFIG,
746                                         "Invalid argument for "
747                                         "vm2vm [0|1|2]\n");
748                                 us_vhost_usage(prgname);
749                                 return -1;
750                         }
751                         vm2vm_mode = (vm2vm_type)ret;
752                         break;
753
754                 case OPT_RX_RETRY_NUM:
755                         ret = parse_num_opt(optarg, 1);
756                         if (ret == -1) {
757                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
758                                 us_vhost_usage(prgname);
759                                 return -1;
760                         }
761                         enable_retry = ret;
762                         break;
763
764                 case OPT_TX_CSUM_NUM:
765                         ret = parse_num_opt(optarg, 1);
766                         if (ret == -1) {
767                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
768                                 us_vhost_usage(prgname);
769                                 return -1;
770                         }
771                         enable_tx_csum = ret;
772                         break;
773
774                 case OPT_TSO_NUM:
775                         ret = parse_num_opt(optarg, 1);
776                         if (ret == -1) {
777                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
778                                 us_vhost_usage(prgname);
779                                 return -1;
780                         }
781                         enable_tso = ret;
782                         break;
783
784                 case OPT_RX_RETRY_DELAY_NUM:
785                         ret = parse_num_opt(optarg, INT32_MAX);
786                         if (ret == -1) {
787                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
788                                 us_vhost_usage(prgname);
789                                 return -1;
790                         }
791                         burst_rx_delay_time = ret;
792                         break;
793
794                 case OPT_RX_RETRY_NUMB_NUM:
795                         ret = parse_num_opt(optarg, INT32_MAX);
796                         if (ret == -1) {
797                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
798                                 us_vhost_usage(prgname);
799                                 return -1;
800                         }
801                         burst_rx_retry_num = ret;
802                         break;
803
804                 case OPT_MERGEABLE_NUM:
805                         ret = parse_num_opt(optarg, 1);
806                         if (ret == -1) {
807                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
808                                 us_vhost_usage(prgname);
809                                 return -1;
810                         }
811                         mergeable = !!ret;
812                         break;
813
814                 case OPT_STATS_NUM:
815                         ret = parse_num_opt(optarg, INT32_MAX);
816                         if (ret == -1) {
817                                 RTE_LOG(INFO, VHOST_CONFIG,
818                                         "Invalid argument for stats [0..N]\n");
819                                 us_vhost_usage(prgname);
820                                 return -1;
821                         }
822                         enable_stats = ret;
823                         break;
824
825                 /* Set socket file path. */
826                 case OPT_SOCKET_FILE_NUM:
827                         if (us_vhost_parse_socket_path(optarg) == -1) {
828                                 RTE_LOG(INFO, VHOST_CONFIG,
829                                 "Invalid argument for socket name (Max %d characters)\n",
830                                 PATH_MAX);
831                                 us_vhost_usage(prgname);
832                                 return -1;
833                         }
834                         break;
835
836                 case OPT_DMAS_NUM:
837                         if (open_dma(optarg) == -1) {
838                                 RTE_LOG(INFO, VHOST_CONFIG,
839                                         "Wrong DMA args\n");
840                                 us_vhost_usage(prgname);
841                                 return -1;
842                         }
843                         break;
844
845                 case OPT_NUM_MBUFS_NUM:
846                         ret = parse_num_opt(optarg, INT32_MAX);
847                         if (ret == -1) {
848                                 RTE_LOG(INFO, VHOST_CONFIG,
849                                         "Invalid argument for total-num-mbufs [0..N]\n");
850                                 us_vhost_usage(prgname);
851                                 return -1;
852                         }
853
854                         if (total_num_mbufs < ret)
855                                 total_num_mbufs = ret;
856                         break;
857
858                 case OPT_CLIENT_NUM:
859                         client_mode = 1;
860                         break;
861
862                 case OPT_BUILTIN_NET_DRIVER_NUM:
863                         builtin_net_driver = 1;
864                         break;
865
866                 /* Invalid option - print options. */
867                 default:
868                         us_vhost_usage(prgname);
869                         return -1;
870                 }
871         }
872
873         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
874                 if (enabled_port_mask & (1 << i))
875                         ports[num_ports++] = i;
876         }
877
878         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
879                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
880                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
881                 return -1;
882         }
883
884         return 0;
885 }
886
887 /*
888  * Update the global variable num_ports and the ports[] array according to
889  * the number of ports in the system, and return the number of valid ports.
890  */
891 static unsigned check_ports_num(unsigned nb_ports)
892 {
893         unsigned valid_num_ports = num_ports;
894         unsigned portid;
895
896         if (num_ports > nb_ports) {
897                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
898                         num_ports, nb_ports);
899                 num_ports = nb_ports;
900         }
901
902         for (portid = 0; portid < num_ports; portid ++) {
903                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
904                         RTE_LOG(INFO, VHOST_PORT,
905                                 "\nSpecified port ID(%u) is not valid\n",
906                                 ports[portid]);
907                         ports[portid] = INVALID_PORT_ID;
908                         valid_num_ports--;
909                 }
910         }
911         return valid_num_ports;
912 }
913
914 static __rte_always_inline struct vhost_dev *
915 find_vhost_dev(struct rte_ether_addr *mac)
916 {
917         struct vhost_dev *vdev;
918
919         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
920                 if (vdev->ready == DEVICE_RX &&
921                     rte_is_same_ether_addr(mac, &vdev->mac_address))
922                         return vdev;
923         }
924
925         return NULL;
926 }
927
928 /*
929  * This function learns the MAC address of the device and registers it,
930  * along with a VLAN tag, with the VMDQ.
931  */
932 static int
933 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
934 {
935         struct rte_ether_hdr *pkt_hdr;
936         int i, ret;
937
938         /* Learn MAC address of guest device from packet */
939         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
940
941         if (find_vhost_dev(&pkt_hdr->src_addr)) {
942                 RTE_LOG(ERR, VHOST_DATA,
943                         "(%d) device is using a registered MAC!\n",
944                         vdev->vid);
945                 return -1;
946         }
947
948         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
949                 vdev->mac_address.addr_bytes[i] =
950                         pkt_hdr->src_addr.addr_bytes[i];
951
952         /* vlan_tag currently uses the device_id. */
953         vdev->vlan_tag = vlan_tags[vdev->vid];
954
955         /* Print out VMDQ registration info. */
956         RTE_LOG(INFO, VHOST_DATA,
957                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
958                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
959                 vdev->vlan_tag);
960
961         /* Register the MAC address. */
962         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
963                                 (uint32_t)vdev->vid + vmdq_pool_base);
964         if (ret)
965                 RTE_LOG(ERR, VHOST_DATA,
966                         "(%d) failed to add device MAC address to VMDQ\n",
967                         vdev->vid);
968
969         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
970
971         /* Set device as ready for RX. */
972         vdev->ready = DEVICE_RX;
973
974         return 0;
975 }
976
977 /*
978  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
979  * queue before disabling RX on the device.
980  */
981 static inline void
982 unlink_vmdq(struct vhost_dev *vdev)
983 {
984         unsigned i = 0;
985         unsigned rx_count;
986         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
987
988         if (vdev->ready == DEVICE_RX) {
989                 /* Clear MAC and VLAN settings */
990                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
991                 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
992                         vdev->mac_address.addr_bytes[i] = 0;
993
994                 vdev->vlan_tag = 0;
995
996                 /* Clear out the receive buffers */
997                 rx_count = rte_eth_rx_burst(ports[0],
998                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
999
1000                 while (rx_count) {
1001                         for (i = 0; i < rx_count; i++)
1002                                 rte_pktmbuf_free(pkts_burst[i]);
1003
1004                         rx_count = rte_eth_rx_burst(ports[0],
1005                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1006                 }
1007
1008                 vdev->ready = DEVICE_MAC_LEARNING;
1009         }
1010 }
1011
1012 static inline void
1013 free_pkts(struct rte_mbuf **pkts, uint16_t n)
1014 {
1015         while (n--)
1016                 rte_pktmbuf_free(pkts[n]);
1017 }
1018
1019 static __rte_always_inline void
1020 complete_async_pkts(struct vhost_dev *vdev)
1021 {
1022         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1023         uint16_t complete_count;
1024         int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;
1025
1026         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1027                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
1028         if (complete_count)
1029                 free_pkts(p_cpl, complete_count);
1030
1031 }
1032
1033 static __rte_always_inline void
1034 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
1035             struct rte_mbuf *m)
1036 {
1037         uint16_t ret;
1038
1039         if (builtin_net_driver) {
1040                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1041         } else {
1042                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1043         }
1044
1045         if (enable_stats) {
1046                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
1047                                 __ATOMIC_SEQ_CST);
1048                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
1049                                 __ATOMIC_SEQ_CST);
1050                 src_vdev->stats.tx_total++;
1051                 src_vdev->stats.tx += ret;
1052         }
1053 }
1054
1055 static __rte_always_inline void
1056 drain_vhost(struct vhost_dev *vdev)
1057 {
1058         uint16_t ret;
1059         uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1060         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1061         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1062
1063         ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
1064
1065         if (enable_stats) {
1066                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1067                                 __ATOMIC_SEQ_CST);
1068                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1069                                 __ATOMIC_SEQ_CST);
1070         }
1071
1072         if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1073                 free_pkts(m, nr_xmit);
1074 }
1075
1076 static __rte_always_inline void
1077 drain_vhost_table(void)
1078 {
1079         uint16_t lcore_id = rte_lcore_id();
1080         struct vhost_bufftable *vhost_txq;
1081         struct vhost_dev *vdev;
1082         uint64_t cur_tsc;
1083
1084         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1085                 if (unlikely(vdev->remove == 1))
1086                         continue;
1087
1088                 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1089
1090                 cur_tsc = rte_rdtsc();
1091                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
1092                                 > MBUF_TABLE_DRAIN_TSC)) {
1093                         RTE_LOG_DP(DEBUG, VHOST_DATA,
1094                                 "Vhost TX queue drained after timeout with burst size %u\n",
1095                                 vhost_txq->len);
1096                         drain_vhost(vdev);
1097                         vhost_txq->len = 0;
1098                         vhost_txq->pre_tsc = cur_tsc;
1099                 }
1100         }
1101 }
1102
1103 /*
1104  * Check if the packet destination MAC address is for a local device. If so then put
1105  * the packet on that device's Rx queue. If not then return.
1106  */
1107 static __rte_always_inline int
1108 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1109 {
1110         struct rte_ether_hdr *pkt_hdr;
1111         struct vhost_dev *dst_vdev;
1112         struct vhost_bufftable *vhost_txq;
1113         uint16_t lcore_id = rte_lcore_id();
1114         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1115
1116         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1117         if (!dst_vdev)
1118                 return -1;
1119
1120         if (vdev->vid == dst_vdev->vid) {
1121                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1122                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1123                         vdev->vid);
1124                 return 0;
1125         }
1126
1127         RTE_LOG_DP(DEBUG, VHOST_DATA,
1128                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
1129
1130         if (unlikely(dst_vdev->remove)) {
1131                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1132                         "(%d) device is marked for removal\n", dst_vdev->vid);
1133                 return 0;
1134         }
1135
1136         vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1137         vhost_txq->m_table[vhost_txq->len++] = m;
1138
1139         if (enable_stats) {
1140                 vdev->stats.tx_total++;
1141                 vdev->stats.tx++;
1142         }
1143
1144         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1145                 drain_vhost(dst_vdev);
1146                 vhost_txq->len = 0;
1147                 vhost_txq->pre_tsc = rte_rdtsc();
1148         }
1149         return 0;
1150 }
1151
1152 /*
1153  * Check if the destination MAC of a packet belongs to a local VM, and if
1154  * so, get its VLAN tag and the length offset.
1155  */
1156 static __rte_always_inline int
1157 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1158         uint32_t *offset, uint16_t *vlan_tag)
1159 {
1160         struct vhost_dev *dst_vdev;
1161         struct rte_ether_hdr *pkt_hdr =
1162                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1163
1164         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1165         if (!dst_vdev)
1166                 return 0;
1167
1168         if (vdev->vid == dst_vdev->vid) {
1169                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1170                         "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1171                         vdev->vid);
1172                 return -1;
1173         }
1174
1175         /*
1176          * HW VLAN stripping reduces the packet length by the size of the
1177          * VLAN tag, so the packet length needs to be restored by adding
1178          * the tag length back.
1179          */
1180         *offset  = RTE_VLAN_HLEN;
1181         *vlan_tag = vlan_tags[vdev->vid];
1182
1183         RTE_LOG_DP(DEBUG, VHOST_DATA,
1184                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1185                 vdev->vid, dst_vdev->vid, *vlan_tag);
1186
1187         return 0;
1188 }
1189
1190 static void virtio_tx_offload(struct rte_mbuf *m)
1191 {
1192         struct rte_net_hdr_lens hdr_lens;
1193         struct rte_ipv4_hdr *ipv4_hdr;
1194         struct rte_tcp_hdr *tcp_hdr;
1195         uint32_t ptype;
1196         void *l3_hdr;
1197
1198         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1199         m->l2_len = hdr_lens.l2_len;
1200         m->l3_len = hdr_lens.l3_len;
1201         m->l4_len = hdr_lens.l4_len;
1202
1203         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1204         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1205                 m->l2_len + m->l3_len);
1206
1207         m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1208         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1209                 m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1210                 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1211                 ipv4_hdr = l3_hdr;
1212                 ipv4_hdr->hdr_checksum = 0;
1213                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1214         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1215                 m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1216                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1217         }
1218 }
1219
1220 static __rte_always_inline void
1221 do_drain_mbuf_table(struct mbuf_table *tx_q)
1222 {
1223         uint16_t count;
1224
1225         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1226                                  tx_q->m_table, tx_q->len);
1227         if (unlikely(count < tx_q->len))
1228                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1229
1230         tx_q->len = 0;
1231 }
1232
1233 /*
1234  * This function routes the TX packet to the correct interface. This
1235  * may be a local device or the physical port.
1236  */
1237 static __rte_always_inline void
1238 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1239 {
1240         struct mbuf_table *tx_q;
1241         unsigned offset = 0;
1242         const uint16_t lcore_id = rte_lcore_id();
1243         struct rte_ether_hdr *nh;
1244
1245
1246         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1247         if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1248                 struct vhost_dev *vdev2;
1249
1250                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1251                         if (vdev2 != vdev)
1252                                 sync_virtio_xmit(vdev2, vdev, m);
1253                 }
1254                 goto queue2nic;
1255         }
1256
1257         /* Check if the destination is a local VM */
1258         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1259                 return;
1260
1261         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1262                 if (unlikely(find_local_dest(vdev, m, &offset,
1263                                              &vlan_tag) != 0)) {
1264                         rte_pktmbuf_free(m);
1265                         return;
1266                 }
1267         }
1268
1269         RTE_LOG_DP(DEBUG, VHOST_DATA,
1270                 "(%d) TX: MAC address is external\n", vdev->vid);
1271
1272 queue2nic:
1273
1274         /* Add packet to the port Tx queue */
1275         tx_q = &lcore_tx_queue[lcore_id];
1276
1277         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1278         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1279                 /* Guest has inserted the vlan tag. */
1280                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1281                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1282                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1283                         (vh->vlan_tci != vlan_tag_be))
1284                         vh->vlan_tci = vlan_tag_be;
1285         } else {
1286                 m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1287
1288                 /*
1289                  * Find the right seg to adjust the data len when offset is
1290                  * bigger than tail room size.
1291                  */
1292                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1293                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1294                                 m->data_len += offset;
1295                         else {
1296                                 struct rte_mbuf *seg = m;
1297
1298                                 while ((seg->next != NULL) &&
1299                                         (offset > rte_pktmbuf_tailroom(seg)))
1300                                         seg = seg->next;
1301
1302                                 seg->data_len += offset;
1303                         }
1304                         m->pkt_len += offset;
1305                 }
1306
1307                 m->vlan_tci = vlan_tag;
1308         }
1309
1310         if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1311                 virtio_tx_offload(m);
1312
1313         tx_q->m_table[tx_q->len++] = m;
1314         if (enable_stats) {
1315                 vdev->stats.tx_total++;
1316                 vdev->stats.tx++;
1317         }
1318
1319         if (unlikely(tx_q->len == MAX_PKT_BURST))
1320                 do_drain_mbuf_table(tx_q);
1321 }
1322
1323
1324 static __rte_always_inline void
1325 drain_mbuf_table(struct mbuf_table *tx_q)
1326 {
1327         static uint64_t prev_tsc;
1328         uint64_t cur_tsc;
1329
1330         if (tx_q->len == 0)
1331                 return;
1332
1333         cur_tsc = rte_rdtsc();
1334         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1335                 prev_tsc = cur_tsc;
1336
1337                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1338                         "TX queue drained after timeout with burst size %u\n",
1339                         tx_q->len);
1340                 do_drain_mbuf_table(tx_q);
1341         }
1342 }
1343
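/*
 * Enqueue paths dispatched through vdev_queue_ops[]: the async variant first
 * reclaims completed in-flight packets, then submits the burst to the
 * DMA-accelerated enqueue API; the sync variant copies packets with the CPU.
 */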
1344 uint16_t
1345 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1346                 struct rte_mbuf **pkts, uint32_t rx_count)
1347 {
1348         uint16_t enqueue_count;
1349         uint16_t enqueue_fail = 0;
1350         uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;
1351
1352         complete_async_pkts(dev);
1353         enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
1354                                         pkts, rx_count, dma_id, 0);
1355
1356         enqueue_fail = rx_count - enqueue_count;
1357         if (enqueue_fail)
1358                 free_pkts(&pkts[enqueue_count], enqueue_fail);
1359
1360         return enqueue_count;
1361 }
1362
1363 uint16_t
1364 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1365                 struct rte_mbuf **pkts, uint32_t rx_count)
1366 {
1367         return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
1368 }
1369
1370 static __rte_always_inline void
1371 drain_eth_rx(struct vhost_dev *vdev)
1372 {
1373         uint16_t rx_count, enqueue_count;
1374         struct rte_mbuf *pkts[MAX_PKT_BURST];
1375
1376         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1377                                     pkts, MAX_PKT_BURST);
1378
1379         if (!rx_count)
1380                 return;
1381
1382         /*
1383          * When "enable_retry" is set, wait and retry when there are not
1384          * enough free slots in the queue to hold @rx_count packets, in
1385          * order to reduce packet loss.
1386          */
1387         if (enable_retry &&
1388             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1389                         VIRTIO_RXQ))) {
1390                 uint32_t retry;
1391
1392                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1393                         rte_delay_us(burst_rx_delay_time);
1394                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1395                                         VIRTIO_RXQ))
1396                                 break;
1397                 }
1398         }
1399
1400         enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1401                                         VIRTIO_RXQ, pkts, rx_count);
1402
1403         if (enable_stats) {
1404                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1405                                 __ATOMIC_SEQ_CST);
1406                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1407                                 __ATOMIC_SEQ_CST);
1408         }
1409
1410         if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1411                 free_pkts(pkts, rx_count);
1412 }
1413
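/*
 * Dequeue paths dispatched through vdev_queue_ops[]: the async variant uses
 * the DMA device bound to the virtio Tx queue, while the sync variant copies
 * packets with the CPU via rte_vhost_dequeue_burst().
 */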
1414 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1415                             struct rte_mempool *mbuf_pool,
1416                             struct rte_mbuf **pkts, uint16_t count)
1417 {
1418         int nr_inflight;
1419         uint16_t dequeue_count;
1420         int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;
1421
1422         dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
1423                         mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);
1424
1425         return dequeue_count;
1426 }
1427
1428 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1429                            struct rte_mempool *mbuf_pool,
1430                            struct rte_mbuf **pkts, uint16_t count)
1431 {
1432         return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
1433 }
1434
1435 static __rte_always_inline void
1436 drain_virtio_tx(struct vhost_dev *vdev)
1437 {
1438         struct rte_mbuf *pkts[MAX_PKT_BURST];
1439         uint16_t count;
1440         uint16_t i;
1441
1442         count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
1443                                 VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
1444
1445         /* setup VMDq for the first packet */
1446         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1447                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1448                         free_pkts(pkts, count);
1449         }
1450
1451         for (i = 0; i < count; ++i)
1452                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1453 }
1454
1455 /*
1456  * Main function of vhost-switch. It basically does:
1457  *
1458  * for each vhost device {
1459  *    - drain_eth_rx()
1460  *
1461  *      Which drains the host eth Rx queue linked to the vhost device
1462  *      and delivers all packets to the guest virtio Rx ring associated
1463  *      with this vhost device.
1464  *
1465  *    - drain_virtio_tx()
1466  *
1467  *      Which drains the guest virtio Tx queue and delivers all packets
1468  *      to the target, which could be another vhost device or the
1469  *      physical eth dev. The routing is done in virtio_tx_route().
1470  * }
1471  */
1472 static int
1473 switch_worker(void *arg __rte_unused)
1474 {
1475         unsigned i;
1476         unsigned lcore_id = rte_lcore_id();
1477         struct vhost_dev *vdev;
1478         struct mbuf_table *tx_q;
1479
1480         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1481
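        /*
         * Each worker lcore has its own mbuf table for buffering Tx packets;
         * its txq_id is the lcore's position in the lcore_ids[] array.
         */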
1482         tx_q = &lcore_tx_queue[lcore_id];
1483         for (i = 0; i < rte_lcore_count(); i++) {
1484                 if (lcore_ids[i] == lcore_id) {
1485                         tx_q->txq_id = i;
1486                         break;
1487                 }
1488         }
1489
1490         while (1) {
1491                 drain_mbuf_table(tx_q);
1492                 drain_vhost_table();
1493                 /*
1494                  * Inform the configuration core, if requested, that we are no
1495                  * longer iterating the device list and that no devices are in use.
1496                  */
1497                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1498                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1499
1500                 /*
1501                  * Process vhost devices
1502                  */
1503                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1504                               lcore_vdev_entry) {
1505                         if (unlikely(vdev->remove)) {
1506                                 unlink_vmdq(vdev);
1507                                 vdev->ready = DEVICE_SAFE_REMOVE;
1508                                 continue;
1509                         }
1510
1511                         if (likely(vdev->ready == DEVICE_RX))
1512                                 drain_eth_rx(vdev);
1513
1514                         if (likely(!vdev->remove))
1515                                 drain_virtio_tx(vdev);
1516                 }
1517         }
1518
1519         return 0;
1520 }
1521
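/*
 * Drain and free all packets still in flight on an async queue. This uses
 * the thread-unsafe vhost APIs, so it must only be called while the queue
 * is not being processed by a data core (e.g. on vring disable or device
 * destruction).
 */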
1522 static void
1523 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
1524 {
1525         uint16_t n_pkt = 0;
1526         int pkts_inflight;
1527
1528         int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1529         pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);
1530
             /* Nothing to clear; also avoids a zero- or negative-sized VLA below. */
             if (pkts_inflight <= 0)
                     return;

1531         struct rte_mbuf *m_cpl[pkts_inflight];
1532
1533         while (pkts_inflight > 0) {
1534                 n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
1535                                                         pkts_inflight, dma_id, 0);
1536                 free_pkts(m_cpl, n_pkt);
1537                 pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
1538                                                                         queue_id);
1539         }
1540 }
1541
1542 /*
1543  * Remove a device from the specific data core linked list and from the
1544  * main linked list. Synchronization occurs through the use of the
1545  * lcore dev_removal_flag. The device is made volatile here to avoid
1546  * re-ordering of dev->remove=1, which can cause an infinite rte_pause loop.
1547  */
1548 static void
1549 destroy_device(int vid)
1550 {
1551         struct vhost_dev *vdev = NULL;
1552         int lcore;
1553         uint16_t i;
1554
1555         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1556                 if (vdev->vid == vid)
1557                         break;
1558         }
1559         if (!vdev)
1560                 return;
1561         /* Set the remove flag. */
1562         vdev->remove = 1;
1563         while (vdev->ready != DEVICE_SAFE_REMOVE) {
1564                 rte_pause();
1565         }
1566
1567         for (i = 0; i < RTE_MAX_LCORE; i++)
1568                 rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1569
1570         if (builtin_net_driver)
1571                 vs_vhost_net_remove(vdev);
1572
1573         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1574                      lcore_vdev_entry);
1575         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1576
1577
1578         /* Set the dev_removal_flag on each lcore. */
1579         RTE_LCORE_FOREACH_WORKER(lcore)
1580                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1581
1582         /*
1583          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1584          * we can be sure that they can no longer access the device removed
1585          * from the linked lists and that the devices are no longer in use.
1586          */
1587         RTE_LCORE_FOREACH_WORKER(lcore) {
1588                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1589                         rte_pause();
1590         }
1591
1592         lcore_info[vdev->coreid].device_num--;
1593
1594         RTE_LOG(INFO, VHOST_DATA,
1595                 "(%d) device has been removed from data core\n",
1596                 vdev->vid);
1597
1598         if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1599                 vhost_clear_queue_thread_unsafe(vdev, VIRTIO_RXQ);
1600                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1601                 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1602         }
1603
1604         if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
1605                 vhost_clear_queue_thread_unsafe(vdev, VIRTIO_TXQ);
1606                 rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
1607                 dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
1608         }
1609
1610         rte_free(vdev);
1611 }
1612
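/*
 * Return the index of the socket file whose path matches the given device's
 * ifname, or -1 if it matches none of the registered sockets.
 */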
1613 static inline int
1614 get_socketid_by_vid(int vid)
1615 {
1616         int i;
1617         char ifname[PATH_MAX];
1618         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
1619
1620         for (i = 0; i < nb_sockets; i++) {
1621                 char *file = socket_files + i * PATH_MAX;
1622                 if (strcmp(file, ifname) == 0)
1623                         return i;
1624         }
1625
1626         return -1;
1627 }
1628
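/*
 * Select the enqueue/dequeue callbacks for this device: the builtin net
 * driver paths, or the sync/async paths depending on whether async copy is
 * enabled for the corresponding queue.
 */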
1629 static int
1630 init_vhost_queue_ops(int vid)
1631 {
1632         if (builtin_net_driver) {
1633                 vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
1634                 vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
1635         } else {
1636                 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
1637                         vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
1638                 else
1639                         vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;
1640
1641                 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
1642                         vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
1643                 else
1644                         vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
1645         }
1646
1647         return 0;
1648 }
1649
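/*
 * Register an async channel for each queue that has a DMA device bound to
 * it, and mark the queue as async-enabled on success.
 */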
1650 static inline int
1651 vhost_async_channel_register(int vid)
1652 {
1653         int rx_ret = 0, tx_ret = 0;
1654
1655         if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1656                 rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1657                 if (rx_ret == 0)
1658                         dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
1659         }
1660
1661         if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
1662                 tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
1663                 if (tx_ret == 0)
1664                         dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
1665         }
1666
1667         return rx_ret | tx_ret;
1668 }
1669
1670
1672 /*
1673  * A new device is added to a data core. First the device is added to the main linked list
1674  * and then allocated to a specific data core.
1675  */
1676 static int
1677 new_device(int vid)
1678 {
1679         int lcore, core_add = 0;
1680         uint16_t i;
1681         uint32_t device_num_min = num_devices;
1682         struct vhost_dev *vdev;
1683         int ret;
1684
1685         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1686         if (vdev == NULL) {
1687                 RTE_LOG(INFO, VHOST_DATA,
1688                         "(%d) couldn't allocate memory for vhost dev\n",
1689                         vid);
1690                 return -1;
1691         }
1692         vdev->vid = vid;
1693
1694         for (i = 0; i < RTE_MAX_LCORE; i++) {
1695                 vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1696                         = rte_zmalloc("vhost bufftable",
1697                                 sizeof(struct vhost_bufftable),
1698                                 RTE_CACHE_LINE_SIZE);
1699
1700                 if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1701                         RTE_LOG(INFO, VHOST_DATA,
1702                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1703                         return -1;
1704                 }
1705         }
1706
1707         int socketid = get_socketid_by_vid(vid);
1708         if (socketid == -1)
1709                 return -1;
1710
1711         init_vid2socketid_array(vid, socketid);
1712
1713         ret = vhost_async_channel_register(vid);
1714
1715         if (init_vhost_queue_ops(vid) != 0)
1716                 return -1;
1717
1718         if (builtin_net_driver)
1719                 vs_vhost_net_setup(vdev);
1720
1721         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1722         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1723
1724         /* Reset ready flag. */
1725         vdev->ready = DEVICE_MAC_LEARNING;
1726         vdev->remove = 0;
1727
1728         /* Find a suitable lcore to add the device. */
1729         RTE_LCORE_FOREACH_WORKER(lcore) {
1730                 if (lcore_info[lcore].device_num < device_num_min) {
1731                         device_num_min = lcore_info[lcore].device_num;
1732                         core_add = lcore;
1733                 }
1734         }
1735         vdev->coreid = core_add;
1736
1737         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1738                           lcore_vdev_entry);
1739         lcore_info[vdev->coreid].device_num++;
1740
1741         /* Disable notifications. */
1742         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1743         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1744
1745         RTE_LOG(INFO, VHOST_DATA,
1746                 "(%d) device has been added to data core %d\n",
1747                 vid, vdev->coreid);
1748
1749         return ret;
1750 }
1751
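/*
 * Vring state change callback: when an async-enabled Rx queue is disabled,
 * clear any packets still in flight on it.
 */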
1752 static int
1753 vring_state_changed(int vid, uint16_t queue_id, int enable)
1754 {
1755         struct vhost_dev *vdev = NULL;
1756
1757         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1758                 if (vdev->vid == vid)
1759                         break;
1760         }
1761         if (!vdev)
1762                 return -1;
1763
1764         if (queue_id != VIRTIO_RXQ)
1765                 return 0;
1766
1767         if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
1768                 if (!enable)
1769                         vhost_clear_queue_thread_unsafe(vdev, queue_id);
1770         }
1771
1772         return 0;
1773 }
1774
1775 /*
1776  * These callbacks allow devices to be added to the data core when
1777  * configuration has been fully completed.
1778  */
1779 static const struct rte_vhost_device_ops virtio_net_device_ops =
1780 {
1781         .new_device =  new_device,
1782         .destroy_device = destroy_device,
1783         .vring_state_changed = vring_state_changed,
1784 };
1785
1786 /*
1787  * This thread wakes up periodically to print statistics if the user has
1788  * enabled them.
1789  */
1790 static void *
1791 print_stats(__rte_unused void *arg)
1792 {
1793         struct vhost_dev *vdev;
1794         uint64_t tx_dropped, rx_dropped;
1795         uint64_t tx, tx_total, rx, rx_total;
1796         const char clr[] = { 27, '[', '2', 'J', '\0' };
1797         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1798
1799         while (1) {
1800                 sleep(enable_stats);
1801
1802                 /* Clear screen and move to top left */
1803                 printf("%s%s\n", clr, top_left);
1804                 printf("Device statistics =================================\n");
1805
1806                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1807                         tx_total   = vdev->stats.tx_total;
1808                         tx         = vdev->stats.tx;
1809                         tx_dropped = tx_total - tx;
1810
1811                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1812                                 __ATOMIC_SEQ_CST);
1813                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1814                                 __ATOMIC_SEQ_CST);
1815                         rx_dropped = rx_total - rx;
1816
1817                         printf("Statistics for device %d\n"
1818                                 "-----------------------\n"
1819                                 "TX total:              %" PRIu64 "\n"
1820                                 "TX dropped:            %" PRIu64 "\n"
1821                                 "TX successful:         %" PRIu64 "\n"
1822                                 "RX total:              %" PRIu64 "\n"
1823                                 "RX dropped:            %" PRIu64 "\n"
1824                                 "RX successful:         %" PRIu64 "\n",
1825                                 vdev->vid,
1826                                 tx_total, tx_dropped, tx,
1827                                 rx_total, rx_dropped, rx);
1828                 }
1829
1830                 printf("===================================================\n");
1831
1832                 fflush(stdout);
1833         }
1834
1835         return NULL;
1836 }
1837
1838 static void
1839 unregister_drivers(int socket_num)
1840 {
1841         int i, ret;
1842
1843         for (i = 0; i < socket_num; i++) {
1844                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1845                 if (ret != 0)
1846                         RTE_LOG(ERR, VHOST_CONFIG,
1847                                 "Fail to unregister vhost driver for %s.\n",
1848                                 socket_files + i * PATH_MAX);
1849         }
1850 }
1851
1852 /* When we receive an INT signal, unregister the vhost drivers. */
1853 static void
1854 sigint_handler(__rte_unused int signum)
1855 {
1856         /* Unregister vhost driver. */
1857         unregister_drivers(nb_sockets);
1858
1859         exit(0);
1860 }
1861
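/*
 * Initialize the DMA binding tables: mark every vhost queue as having no
 * DMA device bound and every entry of dmas_id[] as invalid.
 */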
1862 static void
1863 reset_dma(void)
1864 {
1865         int i;
1866
1867         for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1868                 int j;
1869
1870                 for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1871                         dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1872                         dma_bind[i].dmas[j].async_enabled = false;
1873                 }
1874         }
1875
1876         for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1877                 dmas_id[i] = INVALID_DMA_ID;
1878 }
1879
1880 /*
1881  * Main function, does initialisation and calls the per-lcore functions.
1882  */
1883 int
1884 main(int argc, char *argv[])
1885 {
1886         unsigned lcore_id, core_id = 0;
1887         unsigned nb_ports, valid_num_ports;
1888         int ret, i;
1889         uint16_t portid;
1890         static pthread_t tid;
1891         uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1892
1893         signal(SIGINT, sigint_handler);
1894
1895         /* init EAL */
1896         ret = rte_eal_init(argc, argv);
1897         if (ret < 0)
1898                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1899         argc -= ret;
1900         argv += ret;
1901
1902         /* initialize dma structures */
1903         reset_dma();
1904
1905         /* parse app arguments */
1906         ret = us_vhost_parse_args(argc, argv);
1907         if (ret < 0)
1908                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1909
1910         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1911                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1912
1913                 if (rte_lcore_is_enabled(lcore_id))
1914                         lcore_ids[core_id++] = lcore_id;
1915         }
1916
1917         if (rte_lcore_count() > RTE_MAX_LCORE)
1918                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1919
1920         /* Get the number of physical ports. */
1921         nb_ports = rte_eth_dev_count_avail();
1922
1923         /*
1924          * Update the global variable num_ports and the global ports array,
1925          * and get valid_num_ports from the number of ports in the system.
1926          */
1927         valid_num_ports = check_ports_num(nb_ports);
1928
1929         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1930                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1931                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1932                 return -1;
1933         }
1934
1935         /*
1936          * FIXME: here we are trying to allocate mbufs big enough for
1937          * @MAX_QUEUES, but the truth is we're never going to use that
1938          * many queues here. We probably should only do allocation for
1939          * those queues we are going to use.
1940          */
1941         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
1942                                             MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
1943                                             rte_socket_id());
1944         if (mbuf_pool == NULL)
1945                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1946
1947         if (vm2vm_mode == VM2VM_HARDWARE) {
1948                 /* Enable VT loop back to let L2 switch to do it. */
1949                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1950                 RTE_LOG(DEBUG, VHOST_CONFIG,
1951                         "Enable loop back for L2 switch in vmdq.\n");
1952         }
1953
1954         /* initialize all ports */
1955         RTE_ETH_FOREACH_DEV(portid) {
1956                 /* skip ports that are not enabled */
1957                 if ((enabled_port_mask & (1 << portid)) == 0) {
1958                         RTE_LOG(INFO, VHOST_PORT,
1959                                 "Skipping disabled port %d\n", portid);
1960                         continue;
1961                 }
1962                 if (port_init(portid) != 0)
1963                         rte_exit(EXIT_FAILURE,
1964                                 "Cannot initialize network ports\n");
1965         }
1966
1967         /* Enable stats if the user option is set. */
1968         if (enable_stats) {
1969                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1970                                         print_stats, NULL);
1971                 if (ret < 0)
1972                         rte_exit(EXIT_FAILURE,
1973                                 "Cannot create print-stats thread\n");
1974         }
1975
1976         /* Launch all data cores. */
1977         RTE_LCORE_FOREACH_WORKER(lcore_id)
1978                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1979
1980         if (client_mode)
1981                 flags |= RTE_VHOST_USER_CLIENT;
1982
1983         for (i = 0; i < dma_count; i++) {
1984                 if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1985                         RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
1986                         rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
1987                 }
1988         }
1989
1990         /* Register vhost user driver to handle vhost messages. */
1991         for (i = 0; i < nb_sockets; i++) {
1992                 char *file = socket_files + i * PATH_MAX;
1993
1994                 if (dma_count && get_async_flag_by_socketid(i) != 0)
1995                         flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1996
1997                 ret = rte_vhost_driver_register(file, flags);
1998                 if (ret != 0) {
1999                         unregister_drivers(i);
2000                         rte_exit(EXIT_FAILURE,
2001                                 "vhost driver register failure.\n");
2002                 }
2003
2004                 if (builtin_net_driver)
2005                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
2006
2007                 if (mergeable == 0) {
2008                         rte_vhost_driver_disable_features(file,
2009                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
2010                 }
2011
2012                 if (enable_tx_csum == 0) {
2013                         rte_vhost_driver_disable_features(file,
2014                                 1ULL << VIRTIO_NET_F_CSUM);
2015                 }
2016
2017                 if (enable_tso == 0) {
2018                         rte_vhost_driver_disable_features(file,
2019                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
2020                         rte_vhost_driver_disable_features(file,
2021                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
2022                         rte_vhost_driver_disable_features(file,
2023                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
2024                         rte_vhost_driver_disable_features(file,
2025                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
2026                 }
2027
2028                 if (promiscuous) {
2029                         rte_vhost_driver_enable_features(file,
2030                                 1ULL << VIRTIO_NET_F_CTRL_RX);
2031                 }
2032
2033                 ret = rte_vhost_driver_callback_register(file,
2034                         &virtio_net_device_ops);
2035                 if (ret != 0) {
2036                         rte_exit(EXIT_FAILURE,
2037                                 "failed to register vhost driver callbacks.\n");
2038                 }
2039
2040                 if (rte_vhost_driver_start(file) < 0) {
2041                         rte_exit(EXIT_FAILURE,
2042                                 "failed to start vhost driver.\n");
2043                 }
2044         }
2045
2046         RTE_LCORE_FOREACH_WORKER(lcore_id)
2047                 rte_eal_wait_lcore(lcore_id);
2048
2049         /* clean up the EAL */
2050         rte_eal_cleanup();
2051
2052         return 0;
2053 }