[dpdk.git] / examples / vhost / main.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
19 #include <rte_log.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_net.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
29
30 #include "main.h"
31
32 #ifndef MAX_QUEUES
33 #define MAX_QUEUES 128
34 #endif
35
36 #define NUM_MBUFS_DEFAULT 0x24000
37
38 /* the maximum number of external ports supported */
39 #define MAX_SUP_PORTS 1
40
41 #define MBUF_CACHE_SIZE 128
42 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
43
44 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
45
46 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
47 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
48
49 #define JUMBO_FRAME_MAX_SIZE    0x2600
50 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
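/*
 * Editorial note on the arithmetic above: JUMBO_FRAME_MAX_SIZE is
 * 0x2600 = 9728 bytes, so MAX_MTU = 9728 - (14-byte Ethernet header +
 * 4-byte CRC) = 9710 bytes.
 */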
51
52 /* State of virtio device. */
53 #define DEVICE_MAC_LEARNING 0
54 #define DEVICE_RX                       1
55 #define DEVICE_SAFE_REMOVE      2
56
57 /* Configurable number of RX/TX ring descriptors */
58 #define RTE_TEST_RX_DESC_DEFAULT 1024
59 #define RTE_TEST_TX_DESC_DEFAULT 512
60
61 #define INVALID_PORT_ID 0xFF
62 #define INVALID_DMA_ID -1
63
64 #define DMA_RING_SIZE 4096
65
66 #define ASYNC_ENQUEUE_VHOST 1
67 #define ASYNC_DEQUEUE_VHOST 2
68
69 /* number of mbufs in all pools - if specified on command-line. */
70 static int total_num_mbufs = NUM_MBUFS_DEFAULT;
71
72 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
73 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
74 static int dma_count;
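/*
 * Editorial note: dma_bind[] is indexed by vhost socket id (see open_dma()
 * below), while dmas_id[] records the ids of DMA devices that have already
 * been configured; dma_count bounds the valid entries of dmas_id[].
 */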
75
76 /* mask of enabled ports */
77 static uint32_t enabled_port_mask = 0;
78
79 /* Promiscuous mode */
80 static uint32_t promiscuous;
81
82 /* number of devices/queues to support */
83 static uint32_t num_queues = 0;
84 static uint32_t num_devices;
85
86 static struct rte_mempool *mbuf_pool;
87 static int mergeable;
88
89 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
90 typedef enum {
91         VM2VM_DISABLED = 0,
92         VM2VM_SOFTWARE = 1,
93         VM2VM_HARDWARE = 2,
94         VM2VM_LAST
95 } vm2vm_type;
96 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
97
98 /* Enable stats. */
99 static uint32_t enable_stats = 0;
100 /* Enable retries on RX. */
101 static uint32_t enable_retry = 1;
102
103 /* TX checksum offload (disabled by default) */
104 static uint32_t enable_tx_csum;
105
106 /* TSO offload (disabled by default) */
107 static uint32_t enable_tso;
108
109 static int client_mode;
110
111 static int builtin_net_driver;
112
113 /* Specify the timeout (in microseconds) between retries on RX. */
114 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
115 /* Specify the number of retries on RX. */
116 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
117
118 /* Socket file paths. Can be set by user */
119 static char *socket_files;
120 static int nb_sockets;
121
122 static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
123
124 /* empty VMDq configuration structure. Filled in programmatically */
125 static struct rte_eth_conf vmdq_conf_default = {
126         .rxmode = {
127                 .mq_mode        = RTE_ETH_MQ_RX_VMDQ_ONLY,
128                 .split_hdr_size = 0,
129                 /*
130                  * VLAN strip is necessary for 1G NICs such as the I350;
131                  * without it, IPv4 forwarding in the guest cannot forward
132                  * packets from one virtio device to another.
133                  */
134                 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
135         },
136
137         .txmode = {
138                 .mq_mode = RTE_ETH_MQ_TX_NONE,
139                 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
140                              RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
141                              RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
142                              RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
143                              RTE_ETH_TX_OFFLOAD_TCP_TSO),
144         },
145         .rx_adv_conf = {
146                 /*
147                  * should be overridden separately in code with
148                  * appropriate values
149                  */
150                 .vmdq_rx_conf = {
151                         .nb_queue_pools = RTE_ETH_8_POOLS,
152                         .enable_default_pool = 0,
153                         .default_pool = 0,
154                         .nb_pool_maps = 0,
155                         .pool_map = {{0, 0},},
156                 },
157         },
158 };
159
160
161 static unsigned lcore_ids[RTE_MAX_LCORE];
162 static uint16_t ports[RTE_MAX_ETHPORTS];
163 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
164 static uint16_t num_pf_queues, num_vmdq_queues;
165 static uint16_t vmdq_pool_base, vmdq_queue_base;
166 static uint16_t queues_per_pool;
167
168 const uint16_t vlan_tags[] = {
169         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
170         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
171         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
172         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
173         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
174         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
175         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
176         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
177 };
178
179 /* ethernet addresses of ports */
180 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
181
182 static struct vhost_dev_tailq_list vhost_dev_list =
183         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
184
185 static struct lcore_info lcore_info[RTE_MAX_LCORE];
186
187 /* Used for queueing bursts of TX packets. */
188 struct mbuf_table {
189         unsigned len;
190         unsigned txq_id;
191         struct rte_mbuf *m_table[MAX_PKT_BURST];
192 };
193
194 struct vhost_bufftable {
195         uint32_t len;
196         uint64_t pre_tsc;
197         struct rte_mbuf *m_table[MAX_PKT_BURST];
198 };
199
200 /* TX queue for each data core. */
201 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
202
203 /*
204  * Vhost TX buffer for each data core.
205  * Every data core maintains a TX buffer for every vhost device,
206  * which is used to enqueue packets in batches for higher performance.
207  */
208 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
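/*
 * Editorial indexing note: the Tx buffer of vhost device `vid` on lcore
 * `lcore_id` is vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vid], as
 * used by drain_vhost() and virtio_tx_local() below.
 */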
209
210 #define MBUF_TABLE_DRAIN_TSC    ((rte_get_tsc_hz() + US_PER_S - 1) \
211                                  / US_PER_S * BURST_TX_DRAIN_US)
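/*
 * Worked example (editorial sketch, assuming a 2 GHz TSC): ceil(2e9 /
 * US_PER_S) = 2000 cycles per microsecond, so MBUF_TABLE_DRAIN_TSC =
 * 2000 * BURST_TX_DRAIN_US = 200000 cycles, i.e. roughly 100 us worth
 * of TSC ticks.
 */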
212
213 static int vid2socketid[RTE_MAX_VHOST_DEVICE];
214
215 static inline uint32_t
216 get_async_flag_by_socketid(int socketid)
217 {
218         return dma_bind[socketid].async_flag;
219 }
220
221 static inline void
222 init_vid2socketid_array(int vid, int socketid)
223 {
224         vid2socketid[vid] = socketid;
225 }
226
227 static inline bool
228 is_dma_configured(int16_t dev_id)
229 {
230         int i;
231
232         for (i = 0; i < dma_count; i++)
233                 if (dmas_id[i] == dev_id)
234                         return true;
235         return false;
236 }
237
238 static inline int
239 open_dma(const char *value)
240 {
241         struct dma_for_vhost *dma_info = dma_bind;
242         char *input = strndup(value, strlen(value) + 1);
243         char *addrs = input;
244         char *ptrs[2];
245         char *start, *end, *substr;
246         int64_t socketid, vring_id;
247
248         struct rte_dma_info info;
249         struct rte_dma_conf dev_config = { .nb_vchans = 1 };
250         struct rte_dma_vchan_conf qconf = {
251                 .direction = RTE_DMA_DIR_MEM_TO_MEM,
252                 .nb_desc = DMA_RING_SIZE
253         };
254
255         int dev_id;
256         int ret = 0;
257         uint16_t i = 0;
258         char *dma_arg[RTE_MAX_VHOST_DEVICE];
259         int args_nr;
260
261         while (isblank(*addrs))
262                 addrs++;
263         if (*addrs == '\0') {
264                 ret = -1;
265                 goto out;
266         }
267
268         /* process the DMA devices listed inside the brackets. */
269         addrs++;
270         substr = strtok(addrs, ";]");
271         if (!substr) {
272                 ret = -1;
273                 goto out;
274         }
275
276         args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
277         if (args_nr <= 0) {
278                 ret = -1;
279                 goto out;
280         }
281
282         while (i < args_nr) {
283                 char *arg_temp = dma_arg[i];
284                 char *txd, *rxd;
285                 uint8_t sub_nr;
286                 int async_flag;
287
288                 sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
289                 if (sub_nr != 2) {
290                         ret = -1;
291                         goto out;
292                 }
293
294                 txd = strstr(ptrs[0], "txd");
295                 rxd = strstr(ptrs[0], "rxd");
296                 if (txd) {
297                         start = txd;
298                         vring_id = VIRTIO_RXQ;
299                         async_flag = ASYNC_ENQUEUE_VHOST;
300                 } else if (rxd) {
301                         start = rxd;
302                         vring_id = VIRTIO_TXQ;
303                         async_flag = ASYNC_DEQUEUE_VHOST;
304                 } else {
305                         ret = -1;
306                         goto out;
307                 }
308
309                 start += 3;
310                 socketid = strtol(start, &end, 0);
311                 if (end == start) {
312                         ret = -1;
313                         goto out;
314                 }
315
316                 dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
317                 if (dev_id < 0) {
318                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to find DMA %s.\n", ptrs[1]);
319                         ret = -1;
320                         goto out;
321                 }
322
323                 /* DMA device is already configured, so skip */
324                 if (is_dma_configured(dev_id))
325                         goto done;
326
327                 if (rte_dma_info_get(dev_id, &info) != 0) {
328                         RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
329                         ret = -1;
330                         goto out;
331                 }
332
333                 if (info.max_vchans < 1) {
334                         RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
335                         ret = -1;
336                         goto out;
337                 }
338
339                 if (rte_dma_configure(dev_id, &dev_config) != 0) {
340                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to configure DMA %d.\n", dev_id);
341                         ret = -1;
342                         goto out;
343                 }
344
345                 /* Check the max desc supported by DMA device */
346                 rte_dma_info_get(dev_id, &info);
347                 if (info.nb_vchans != 1) {
348                         RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
349                                         dev_id);
350                         ret = -1;
351                         goto out;
352                 }
353
354                 qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
355
356                 if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
357                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to set up DMA %d.\n", dev_id);
358                         ret = -1;
359                         goto out;
360                 }
361
362                 if (rte_dma_start(dev_id) != 0) {
363                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to start DMA %d.\n", dev_id);
364                         ret = -1;
365                         goto out;
366                 }
367
368                 dmas_id[dma_count++] = dev_id;
369
370 done:
371                 (dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
372                 (dma_info + socketid)->async_flag |= async_flag;
373                 i++;
374         }
375 out:
376         free(input);
377         return ret;
378 }
379
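/*
 * Editorial example of the --dmas argument format parsed by open_dma()
 * above (the DMA device names below are hypothetical and platform
 * specific):
 *
 *   --dmas [txd0@0000:00:04.0,rxd0@0000:00:04.1]
 *
 * "txd0" binds an enqueue (VIRTIO_RXQ) DMA channel to vhost socket 0,
 * while "rxd0" binds a dequeue (VIRTIO_TXQ) DMA channel to the same
 * socket.
 */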
380 /*
381  * Builds up the correct configuration for VMDQ VLAN pool map
382  * according to the pool & queue limits.
383  */
384 static inline int
385 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
386 {
387         struct rte_eth_vmdq_rx_conf conf;
388         struct rte_eth_vmdq_rx_conf *def_conf =
389                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
390         unsigned i;
391
392         memset(&conf, 0, sizeof(conf));
393         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
394         conf.nb_pool_maps = num_devices;
395         conf.enable_loop_back = def_conf->enable_loop_back;
396         conf.rx_mode = def_conf->rx_mode;
397
398         for (i = 0; i < conf.nb_pool_maps; i++) {
399                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
400                 conf.pool_map[i].pools = (1UL << i);
401         }
402
403         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
404         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
405                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
406         return 0;
407 }
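/*
 * Editorial illustration of the result of get_eth_conf(): with N devices,
 * pool_map[i] maps VLAN vlan_tags[i] to pool bitmask (1UL << i), e.g.
 *   pool_map[0] = { .vlan_id = 1000, .pools = 0x1 },
 *   pool_map[1] = { .vlan_id = 1001, .pools = 0x2 }, and so on.
 */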
408
409 /*
410  * Initialises a given port using global settings, with the Rx buffers
411  * coming from the global mbuf_pool.
412  */
413 static inline int
414 port_init(uint16_t port)
415 {
416         struct rte_eth_dev_info dev_info;
417         struct rte_eth_conf port_conf;
418         struct rte_eth_rxconf *rxconf;
419         struct rte_eth_txconf *txconf;
420         int16_t rx_rings, tx_rings;
421         uint16_t rx_ring_size, tx_ring_size;
422         int retval;
423         uint16_t q;
424
425         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
426         retval = rte_eth_dev_info_get(port, &dev_info);
427         if (retval != 0) {
428                 RTE_LOG(ERR, VHOST_PORT,
429                         "Error during getting device (port %u) info: %s\n",
430                         port, strerror(-retval));
431
432                 return retval;
433         }
434         if (dev_info.max_vmdq_pools == 0) {
435                 RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
436                 return -1;
437         }
438
439         rxconf = &dev_info.default_rxconf;
440         txconf = &dev_info.default_txconf;
441         rxconf->rx_drop_en = 1;
442
443         /* configure the number of supported virtio devices based on VMDq limits */
444         num_devices = dev_info.max_vmdq_pools;
445
446         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
447         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
448
449         tx_rings = (uint16_t)rte_lcore_count();
450
451         if (mergeable) {
452                 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
453                         vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
454                 else
455                         vmdq_conf_default.rxmode.mtu = MAX_MTU;
456         }
457
458         /* Get port configuration. */
459         retval = get_eth_conf(&port_conf, num_devices);
460         if (retval < 0)
461                 return retval;
462         /* NIC queues are divided into PF queues and VMDq queues. */
463         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
464         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
465         num_vmdq_queues = num_devices * queues_per_pool;
466         num_queues = num_pf_queues + num_vmdq_queues;
467         vmdq_queue_base = dev_info.vmdq_queue_base;
468         vmdq_pool_base  = dev_info.vmdq_pool_base;
469         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
470                 num_pf_queues, num_devices, queues_per_pool);
471
472         if (!rte_eth_dev_is_valid_port(port))
473                 return -1;
474
475         rx_rings = (uint16_t)dev_info.max_rx_queues;
476         if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
477                 port_conf.txmode.offloads |=
478                         RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
479         /* Configure ethernet device. */
480         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
481         if (retval != 0) {
482                 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
483                         port, strerror(-retval));
484                 return retval;
485         }
486
487         retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
488                 &tx_ring_size);
489         if (retval != 0) {
490                 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
491                         "for port %u: %s.\n", port, strerror(-retval));
492                 return retval;
493         }
494         if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
495                 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
496                         "for Rx queues on port %u.\n", port);
497                 return -1;
498         }
499
500         /* Setup the queues. */
501         rxconf->offloads = port_conf.rxmode.offloads;
502         for (q = 0; q < rx_rings; q ++) {
503                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
504                                                 rte_eth_dev_socket_id(port),
505                                                 rxconf,
506                                                 mbuf_pool);
507                 if (retval < 0) {
508                         RTE_LOG(ERR, VHOST_PORT,
509                                 "Failed to setup rx queue %u of port %u: %s.\n",
510                                 q, port, strerror(-retval));
511                         return retval;
512                 }
513         }
514         txconf->offloads = port_conf.txmode.offloads;
515         for (q = 0; q < tx_rings; q ++) {
516                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
517                                                 rte_eth_dev_socket_id(port),
518                                                 txconf);
519                 if (retval < 0) {
520                         RTE_LOG(ERR, VHOST_PORT,
521                                 "Failed to setup tx queue %u of port %u: %s.\n",
522                                 q, port, strerror(-retval));
523                         return retval;
524                 }
525         }
526
527         /* Start the device. */
528         retval  = rte_eth_dev_start(port);
529         if (retval < 0) {
530                 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
531                         port, strerror(-retval));
532                 return retval;
533         }
534
535         if (promiscuous) {
536                 retval = rte_eth_promiscuous_enable(port);
537                 if (retval != 0) {
538                         RTE_LOG(ERR, VHOST_PORT,
539                                 "Failed to enable promiscuous mode on port %u: %s\n",
540                                 port, rte_strerror(-retval));
541                         return retval;
542                 }
543         }
544
545         retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
546         if (retval < 0) {
547                 RTE_LOG(ERR, VHOST_PORT,
548                         "Failed to get MAC address on port %u: %s\n",
549                         port, rte_strerror(-retval));
550                 return retval;
551         }
552
553         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
554         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
555                 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
556                 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
557
558         return 0;
559 }
560
561 /*
562  * Set socket file path.
563  */
564 static int
565 us_vhost_parse_socket_path(const char *q_arg)
566 {
567         char *old;
568
569         /* reject paths that are too long to fit in PATH_MAX */
570         if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
571                 return -1;
572
573         old = socket_files;
574         socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
575         if (socket_files == NULL) {
576                 free(old);
577                 return -1;
578         }
579
580         strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
581         nb_sockets++;
582
583         return 0;
584 }
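/*
 * Editorial layout note: socket_files is a flat array of fixed-size
 * PATH_MAX slots, so the i-th socket path is read back from
 * socket_files + i * PATH_MAX.
 */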
585
586 /*
587  * Parse the portmask provided at run time.
588  */
589 static int
590 parse_portmask(const char *portmask)
591 {
592         char *end = NULL;
593         unsigned long pm;
594
595         errno = 0;
596
597         /* parse hexadecimal string */
598         pm = strtoul(portmask, &end, 16);
599         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
600                 return 0;
601
602         return pm;
603
604 }
605
606 /*
607  * Parse num options at run time.
608  */
609 static int
610 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
611 {
612         char *end = NULL;
613         unsigned long num;
614
615         errno = 0;
616
617         /* parse unsigned int string */
618         num = strtoul(q_arg, &end, 10);
619         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
620                 return -1;
621
622         if (num > max_valid_value)
623                 return -1;
624
625         return num;
626
627 }
628
629 /*
630  * Display usage
631  */
632 static void
633 us_vhost_usage(const char *prgname)
634 {
635         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
636         "               --vm2vm [0|1|2]\n"
637         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
638         "               --socket-file <path>\n"
639         "               --nb-devices ND\n"
640         "               -p PORTMASK: Set the mask of ports to be used by the application\n"
641         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm communication\n"
642         "               --rx-retry [0|1]: disable/enable(default) retries on Rx. Retry if the destination queue is full\n"
643         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Only effective if Rx retries are enabled\n"
644         "               --rx-retry-num [0-N]: the number of retries on Rx. Only effective if Rx retries are enabled\n"
645         "               --mergeable [0|1]: disable(default)/enable Rx mergeable buffers\n"
646         "               --stats [0-N]: 0: disable stats, N: time in seconds between stats prints\n"
647         "               --socket-file: the path of the vhost-user socket file.\n"
648         "               --tx-csum [0|1]: disable/enable TX checksum offload.\n"
649         "               --tso [0|1]: disable/enable TCP segmentation offload.\n"
650         "               --client: register the vhost-user socket in client mode.\n"
651         "               --dmas: register DMA channels for specific vhost devices.\n"
652         "               --total-num-mbufs [0-N]: set the number of mbufs to allocate in the mbuf pools, the default value is 147456.\n",
653                prgname);
654 }
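/*
 * Hypothetical invocation (editorial example only; the EAL options, the
 * binary name and the port mask depend on the build and the target
 * system):
 *
 *   ./dpdk-vhost -l 1-3 -n 4 -- -p 0x1 \
 *       --socket-file /tmp/sock0 --client --mergeable 1 --stats 1
 */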
655
656 enum {
657 #define OPT_VM2VM               "vm2vm"
658         OPT_VM2VM_NUM = 256,
659 #define OPT_RX_RETRY            "rx-retry"
660         OPT_RX_RETRY_NUM,
661 #define OPT_RX_RETRY_DELAY      "rx-retry-delay"
662         OPT_RX_RETRY_DELAY_NUM,
663 #define OPT_RX_RETRY_NUMB       "rx-retry-num"
664         OPT_RX_RETRY_NUMB_NUM,
665 #define OPT_MERGEABLE           "mergeable"
666         OPT_MERGEABLE_NUM,
667 #define OPT_STATS               "stats"
668         OPT_STATS_NUM,
669 #define OPT_SOCKET_FILE         "socket-file"
670         OPT_SOCKET_FILE_NUM,
671 #define OPT_TX_CSUM             "tx-csum"
672         OPT_TX_CSUM_NUM,
673 #define OPT_TSO                 "tso"
674         OPT_TSO_NUM,
675 #define OPT_CLIENT              "client"
676         OPT_CLIENT_NUM,
677 #define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
678         OPT_BUILTIN_NET_DRIVER_NUM,
679 #define OPT_DMAS                "dmas"
680         OPT_DMAS_NUM,
681 #define OPT_NUM_MBUFS           "total-num-mbufs"
682         OPT_NUM_MBUFS_NUM,
683 };
684
685 /*
686  * Parse the arguments given in the command line of the application.
687  */
688 static int
689 us_vhost_parse_args(int argc, char **argv)
690 {
691         int opt, ret;
692         int option_index;
693         unsigned i;
694         const char *prgname = argv[0];
695         static struct option long_option[] = {
696                 {OPT_VM2VM, required_argument,
697                                 NULL, OPT_VM2VM_NUM},
698                 {OPT_RX_RETRY, required_argument,
699                                 NULL, OPT_RX_RETRY_NUM},
700                 {OPT_RX_RETRY_DELAY, required_argument,
701                                 NULL, OPT_RX_RETRY_DELAY_NUM},
702                 {OPT_RX_RETRY_NUMB, required_argument,
703                                 NULL, OPT_RX_RETRY_NUMB_NUM},
704                 {OPT_MERGEABLE, required_argument,
705                                 NULL, OPT_MERGEABLE_NUM},
706                 {OPT_STATS, required_argument,
707                                 NULL, OPT_STATS_NUM},
708                 {OPT_SOCKET_FILE, required_argument,
709                                 NULL, OPT_SOCKET_FILE_NUM},
710                 {OPT_TX_CSUM, required_argument,
711                                 NULL, OPT_TX_CSUM_NUM},
712                 {OPT_TSO, required_argument,
713                                 NULL, OPT_TSO_NUM},
714                 {OPT_CLIENT, no_argument,
715                                 NULL, OPT_CLIENT_NUM},
716                 {OPT_BUILTIN_NET_DRIVER, no_argument,
717                                 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
718                 {OPT_DMAS, required_argument,
719                                 NULL, OPT_DMAS_NUM},
720                 {OPT_NUM_MBUFS, required_argument,
721                                 NULL, OPT_NUM_MBUFS_NUM},
722                 {NULL, 0, 0, 0},
723         };
724
725         /* Parse command line */
726         while ((opt = getopt_long(argc, argv, "p:P",
727                         long_option, &option_index)) != EOF) {
728                 switch (opt) {
729                 /* Portmask */
730                 case 'p':
731                         enabled_port_mask = parse_portmask(optarg);
732                         if (enabled_port_mask == 0) {
733                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
734                                 us_vhost_usage(prgname);
735                                 return -1;
736                         }
737                         break;
738
739                 case 'P':
740                         promiscuous = 1;
741                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
742                                 RTE_ETH_VMDQ_ACCEPT_BROADCAST |
743                                 RTE_ETH_VMDQ_ACCEPT_MULTICAST;
744                         break;
745
746                 case OPT_VM2VM_NUM:
747                         ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
748                         if (ret == -1) {
749                                 RTE_LOG(INFO, VHOST_CONFIG,
750                                         "Invalid argument for "
751                                         "vm2vm [0|1|2]\n");
752                                 us_vhost_usage(prgname);
753                                 return -1;
754                         }
755                         vm2vm_mode = (vm2vm_type)ret;
756                         break;
757
758                 case OPT_RX_RETRY_NUM:
759                         ret = parse_num_opt(optarg, 1);
760                         if (ret == -1) {
761                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
762                                 us_vhost_usage(prgname);
763                                 return -1;
764                         }
765                         enable_retry = ret;
766                         break;
767
768                 case OPT_TX_CSUM_NUM:
769                         ret = parse_num_opt(optarg, 1);
770                         if (ret == -1) {
771                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
772                                 us_vhost_usage(prgname);
773                                 return -1;
774                         }
775                         enable_tx_csum = ret;
776                         break;
777
778                 case OPT_TSO_NUM:
779                         ret = parse_num_opt(optarg, 1);
780                         if (ret == -1) {
781                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
782                                 us_vhost_usage(prgname);
783                                 return -1;
784                         }
785                         enable_tso = ret;
786                         break;
787
788                 case OPT_RX_RETRY_DELAY_NUM:
789                         ret = parse_num_opt(optarg, INT32_MAX);
790                         if (ret == -1) {
791                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
792                                 us_vhost_usage(prgname);
793                                 return -1;
794                         }
795                         burst_rx_delay_time = ret;
796                         break;
797
798                 case OPT_RX_RETRY_NUMB_NUM:
799                         ret = parse_num_opt(optarg, INT32_MAX);
800                         if (ret == -1) {
801                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
802                                 us_vhost_usage(prgname);
803                                 return -1;
804                         }
805                         burst_rx_retry_num = ret;
806                         break;
807
808                 case OPT_MERGEABLE_NUM:
809                         ret = parse_num_opt(optarg, 1);
810                         if (ret == -1) {
811                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
812                                 us_vhost_usage(prgname);
813                                 return -1;
814                         }
815                         mergeable = !!ret;
816                         break;
817
818                 case OPT_STATS_NUM:
819                         ret = parse_num_opt(optarg, INT32_MAX);
820                         if (ret == -1) {
821                                 RTE_LOG(INFO, VHOST_CONFIG,
822                                         "Invalid argument for stats [0..N]\n");
823                                 us_vhost_usage(prgname);
824                                 return -1;
825                         }
826                         enable_stats = ret;
827                         break;
828
829                 /* Set socket file path. */
830                 case OPT_SOCKET_FILE_NUM:
831                         if (us_vhost_parse_socket_path(optarg) == -1) {
832                                 RTE_LOG(INFO, VHOST_CONFIG,
833                                 "Invalid argument for socket name (Max %d characters)\n",
834                                 PATH_MAX);
835                                 us_vhost_usage(prgname);
836                                 return -1;
837                         }
838                         break;
839
840                 case OPT_DMAS_NUM:
841                         if (open_dma(optarg) == -1) {
842                                 RTE_LOG(INFO, VHOST_CONFIG,
843                                         "Wrong DMA args\n");
844                                 us_vhost_usage(prgname);
845                                 return -1;
846                         }
847                         break;
848
849                 case OPT_NUM_MBUFS_NUM:
850                         ret = parse_num_opt(optarg, INT32_MAX);
851                         if (ret == -1) {
852                                 RTE_LOG(INFO, VHOST_CONFIG,
853                                         "Invalid argument for total-num-mbufs [0..N]\n");
854                                 us_vhost_usage(prgname);
855                                 return -1;
856                         }
857
858                         if (total_num_mbufs < ret)
859                                 total_num_mbufs = ret;
860                         break;
861
862                 case OPT_CLIENT_NUM:
863                         client_mode = 1;
864                         break;
865
866                 case OPT_BUILTIN_NET_DRIVER_NUM:
867                         builtin_net_driver = 1;
868                         break;
869
870                 /* Invalid option - print options. */
871                 default:
872                         us_vhost_usage(prgname);
873                         return -1;
874                 }
875         }
876
877         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
878                 if (enabled_port_mask & (1 << i))
879                         ports[num_ports++] = i;
880         }
881
882         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
883                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
884                         "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
885                 return -1;
886         }
887
888         return 0;
889 }
890
891 /*
892  * Update the global variable num_ports and the array ports according to the
893  * number of ports in the system, and return the number of valid ports.
894  */
895 static unsigned check_ports_num(unsigned nb_ports)
896 {
897         unsigned valid_num_ports = num_ports;
898         unsigned portid;
899
900         if (num_ports > nb_ports) {
901                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
902                         num_ports, nb_ports);
903                 num_ports = nb_ports;
904         }
905
906         for (portid = 0; portid < num_ports; portid ++) {
907                 if (!rte_eth_dev_is_valid_port(ports[portid])) {
908                         RTE_LOG(INFO, VHOST_PORT,
909                                 "\nSpecified port ID(%u) is not valid\n",
910                                 ports[portid]);
911                         ports[portid] = INVALID_PORT_ID;
912                         valid_num_ports--;
913                 }
914         }
915         return valid_num_ports;
916 }
917
918 static __rte_always_inline struct vhost_dev *
919 find_vhost_dev(struct rte_ether_addr *mac)
920 {
921         struct vhost_dev *vdev;
922
923         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
924                 if (vdev->ready == DEVICE_RX &&
925                     rte_is_same_ether_addr(mac, &vdev->mac_address))
926                         return vdev;
927         }
928
929         return NULL;
930 }
931
932 /*
933  * This function learns the MAC address of the guest device and registers it,
934  * along with a VLAN tag, with a VMDq pool.
935  */
936 static int
937 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
938 {
939         struct rte_ether_hdr *pkt_hdr;
940         int i, ret;
941
942         /* Learn MAC address of guest device from packet */
943         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
944
945         if (find_vhost_dev(&pkt_hdr->src_addr)) {
946                 RTE_LOG(ERR, VHOST_DATA,
947                         "(%d) device is using a registered MAC!\n",
948                         vdev->vid);
949                 return -1;
950         }
951
952         for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
953                 vdev->mac_address.addr_bytes[i] =
954                         pkt_hdr->src_addr.addr_bytes[i];
955
956         /* vlan_tag currently uses the device_id. */
957         vdev->vlan_tag = vlan_tags[vdev->vid];
958
959         /* Print out VMDQ registration info. */
960         RTE_LOG(INFO, VHOST_DATA,
961                 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
962                 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
963                 vdev->vlan_tag);
964
965         /* Register the MAC address. */
966         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
967                                 (uint32_t)vdev->vid + vmdq_pool_base);
968         if (ret)
969                 RTE_LOG(ERR, VHOST_DATA,
970                         "(%d) failed to add device MAC address to VMDQ\n",
971                         vdev->vid);
972
973         rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
974
975         /* Set device as ready for RX. */
976         vdev->ready = DEVICE_RX;
977
978         return 0;
979 }
980
981 /*
982  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
983  * queue before disabling RX on the device.
984  */
985 static inline void
986 unlink_vmdq(struct vhost_dev *vdev)
987 {
988         unsigned i = 0;
989         unsigned rx_count;
990         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
991
992         if (vdev->ready == DEVICE_RX) {
993                 /* clear MAC and VLAN settings */
994                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
995                 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
996                         vdev->mac_address.addr_bytes[i] = 0;
997
998                 vdev->vlan_tag = 0;
999
1000                 /* Clear out the receive buffers */
1001                 rx_count = rte_eth_rx_burst(ports[0],
1002                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1003
1004                 while (rx_count) {
1005                         for (i = 0; i < rx_count; i++)
1006                                 rte_pktmbuf_free(pkts_burst[i]);
1007
1008                         rx_count = rte_eth_rx_burst(ports[0],
1009                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1010                 }
1011
1012                 vdev->ready = DEVICE_MAC_LEARNING;
1013         }
1014 }
1015
1016 static inline void
1017 free_pkts(struct rte_mbuf **pkts, uint16_t n)
1018 {
1019         while (n--)
1020                 rte_pktmbuf_free(pkts[n]);
1021 }
1022
1023 static __rte_always_inline void
1024 complete_async_pkts(struct vhost_dev *vdev)
1025 {
1026         struct rte_mbuf *p_cpl[MAX_PKT_BURST];
1027         uint16_t complete_count;
1028         int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;
1029
1030         complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
1031                                         VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
1032         if (complete_count)
1033                 free_pkts(p_cpl, complete_count);
1034
1035 }
1036
1037 static __rte_always_inline void
1038 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
1039             struct rte_mbuf *m)
1040 {
1041         uint16_t ret;
1042
1043         if (builtin_net_driver) {
1044                 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
1045         } else {
1046                 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
1047         }
1048
1049         if (enable_stats) {
1050                 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
1051                                 __ATOMIC_SEQ_CST);
1052                 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
1053                                 __ATOMIC_SEQ_CST);
1054                 src_vdev->stats.tx_total++;
1055                 src_vdev->stats.tx += ret;
1056         }
1057 }
1058
1059 static __rte_always_inline void
1060 drain_vhost(struct vhost_dev *vdev)
1061 {
1062         uint16_t ret;
1063         uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1064         uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1065         struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1066
1067         ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);
1068
1069         if (enable_stats) {
1070                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1071                                 __ATOMIC_SEQ_CST);
1072                 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1073                                 __ATOMIC_SEQ_CST);
1074         }
1075
1076         if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1077                 free_pkts(m, nr_xmit);
1078 }
1079
1080 static __rte_always_inline void
1081 drain_vhost_table(void)
1082 {
1083         uint16_t lcore_id = rte_lcore_id();
1084         struct vhost_bufftable *vhost_txq;
1085         struct vhost_dev *vdev;
1086         uint64_t cur_tsc;
1087
1088         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1089                 if (unlikely(vdev->remove == 1))
1090                         continue;
1091
1092                 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1093
1094                 cur_tsc = rte_rdtsc();
1095                 if (unlikely(cur_tsc - vhost_txq->pre_tsc
1096                                 > MBUF_TABLE_DRAIN_TSC)) {
1097                         RTE_LOG_DP(DEBUG, VHOST_DATA,
1098                                 "Vhost TX queue drained after timeout with burst size %u\n",
1099                                 vhost_txq->len);
1100                         drain_vhost(vdev);
1101                         vhost_txq->len = 0;
1102                         vhost_txq->pre_tsc = cur_tsc;
1103                 }
1104         }
1105 }
1106
1107 /*
1108  * Check if the packet destination MAC address is for a local device. If so,
1109  * put the packet on that device's Rx queue; otherwise return.
1110  */
1111 static __rte_always_inline int
1112 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1113 {
1114         struct rte_ether_hdr *pkt_hdr;
1115         struct vhost_dev *dst_vdev;
1116         struct vhost_bufftable *vhost_txq;
1117         uint16_t lcore_id = rte_lcore_id();
1118         pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1119
1120         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1121         if (!dst_vdev)
1122                 return -1;
1123
1124         if (vdev->vid == dst_vdev->vid) {
1125                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1126                         "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
1127                         vdev->vid);
1128                 return 0;
1129         }
1130
1131         RTE_LOG_DP(DEBUG, VHOST_DATA,
1132                 "(%d) TX: MAC address is local\n", dst_vdev->vid);
1133
1134         if (unlikely(dst_vdev->remove)) {
1135                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1136                         "(%d) device is marked for removal\n", dst_vdev->vid);
1137                 return 0;
1138         }
1139
1140         vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1141         vhost_txq->m_table[vhost_txq->len++] = m;
1142
1143         if (enable_stats) {
1144                 vdev->stats.tx_total++;
1145                 vdev->stats.tx++;
1146         }
1147
1148         if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1149                 drain_vhost(dst_vdev);
1150                 vhost_txq->len = 0;
1151                 vhost_txq->pre_tsc = rte_rdtsc();
1152         }
1153         return 0;
1154 }
1155
1156 /*
1157  * Check if the destination MAC of a packet belongs to a local VM; if it
1158  * does, get its VLAN tag and the length offset.
1159  */
1160 static __rte_always_inline int
1161 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1162         uint32_t *offset, uint16_t *vlan_tag)
1163 {
1164         struct vhost_dev *dst_vdev;
1165         struct rte_ether_hdr *pkt_hdr =
1166                 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1167
1168         dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1169         if (!dst_vdev)
1170                 return 0;
1171
1172         if (vdev->vid == dst_vdev->vid) {
1173                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1174                         "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
1175                         vdev->vid);
1176                 return -1;
1177         }
1178
1179         /*
1180          * HW VLAN strip reduces the packet length by the length of
1181          * the VLAN tag, so the packet length needs to be restored
1182          * by adding it back.
1183          */
1184         *offset  = RTE_VLAN_HLEN;
1185         *vlan_tag = vlan_tags[vdev->vid];
1186
1187         RTE_LOG_DP(DEBUG, VHOST_DATA,
1188                 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1189                 vdev->vid, dst_vdev->vid, *vlan_tag);
1190
1191         return 0;
1192 }
1193
1194 static void virtio_tx_offload(struct rte_mbuf *m)
1195 {
1196         struct rte_net_hdr_lens hdr_lens;
1197         struct rte_ipv4_hdr *ipv4_hdr;
1198         struct rte_tcp_hdr *tcp_hdr;
1199         uint32_t ptype;
1200         void *l3_hdr;
1201
1202         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1203         m->l2_len = hdr_lens.l2_len;
1204         m->l3_len = hdr_lens.l3_len;
1205         m->l4_len = hdr_lens.l4_len;
1206
1207         l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1208         tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1209                 m->l2_len + m->l3_len);
1210
1211         m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1212         if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1213                 m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1214                 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1215                 ipv4_hdr = l3_hdr;
1216                 ipv4_hdr->hdr_checksum = 0;
1217                 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1218         } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1219                 m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1220                 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1221         }
1222 }
1223
1224 static __rte_always_inline void
1225 do_drain_mbuf_table(struct mbuf_table *tx_q)
1226 {
1227         uint16_t count;
1228
1229         count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1230                                  tx_q->m_table, tx_q->len);
1231         if (unlikely(count < tx_q->len))
1232                 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1233
1234         tx_q->len = 0;
1235 }
1236
1237 /*
1238  * This function routes the TX packet to the correct interface. This
1239  * may be a local device or the physical port.
1240  */
1241 static __rte_always_inline void
1242 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1243 {
1244         struct mbuf_table *tx_q;
1245         unsigned offset = 0;
1246         const uint16_t lcore_id = rte_lcore_id();
1247         struct rte_ether_hdr *nh;
1248
1249
1250         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1251         if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1252                 struct vhost_dev *vdev2;
1253
1254                 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1255                         if (vdev2 != vdev)
1256                                 sync_virtio_xmit(vdev2, vdev, m);
1257                 }
1258                 goto queue2nic;
1259         }
1260
1261         /* check if the destination is a local VM */
1262         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1263                 return;
1264
1265         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1266                 if (unlikely(find_local_dest(vdev, m, &offset,
1267                                              &vlan_tag) != 0)) {
1268                         rte_pktmbuf_free(m);
1269                         return;
1270                 }
1271         }
1272
1273         RTE_LOG_DP(DEBUG, VHOST_DATA,
1274                 "(%d) TX: MAC address is external\n", vdev->vid);
1275
1276 queue2nic:
1277
1278         /* Add packet to the port Tx queue */
1279         tx_q = &lcore_tx_queue[lcore_id];
1280
1281         nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1282         if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1283                 /* Guest has inserted the vlan tag. */
1284                 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1285                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1286                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1287                         (vh->vlan_tci != vlan_tag_be))
1288                         vh->vlan_tci = vlan_tag_be;
1289         } else {
1290                 m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1291
1292                 /*
1293                  * Find the right seg to adjust the data len when offset is
1294                  * bigger than tail room size.
1295                  */
1296                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1297                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1298                                 m->data_len += offset;
1299                         else {
1300                                 struct rte_mbuf *seg = m;
1301
1302                                 while ((seg->next != NULL) &&
1303                                         (offset > rte_pktmbuf_tailroom(seg)))
1304                                         seg = seg->next;
1305
1306                                 seg->data_len += offset;
1307                         }
1308                         m->pkt_len += offset;
1309                 }
1310
1311                 m->vlan_tci = vlan_tag;
1312         }
1313
1314         if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1315                 virtio_tx_offload(m);
1316
1317         tx_q->m_table[tx_q->len++] = m;
1318         if (enable_stats) {
1319                 vdev->stats.tx_total++;
1320                 vdev->stats.tx++;
1321         }
1322
1323         if (unlikely(tx_q->len == MAX_PKT_BURST))
1324                 do_drain_mbuf_table(tx_q);
1325 }
1326
1327
1328 static __rte_always_inline void
1329 drain_mbuf_table(struct mbuf_table *tx_q)
1330 {
1331         static uint64_t prev_tsc;
1332         uint64_t cur_tsc;
1333
1334         if (tx_q->len == 0)
1335                 return;
1336
1337         cur_tsc = rte_rdtsc();
1338         if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1339                 prev_tsc = cur_tsc;
1340
1341                 RTE_LOG_DP(DEBUG, VHOST_DATA,
1342                         "TX queue drained after timeout with burst size %u\n",
1343                         tx_q->len);
1344                 do_drain_mbuf_table(tx_q);
1345         }
1346 }
1347
1348 uint16_t
1349 async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1350                 struct rte_mbuf **pkts, uint32_t rx_count)
1351 {
1352         uint16_t enqueue_count;
1353         uint16_t enqueue_fail = 0;
1354         uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;
1355
1356         complete_async_pkts(dev);
1357         enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
1358                                         pkts, rx_count, dma_id, 0);
1359
1360         enqueue_fail = rx_count - enqueue_count;
1361         if (enqueue_fail)
1362                 free_pkts(&pkts[enqueue_count], enqueue_fail);
1363
1364         return enqueue_count;
1365 }
1366
1367 uint16_t
1368 sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1369                 struct rte_mbuf **pkts, uint32_t rx_count)
1370 {
1371         return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
1372 }
1373
1374 static __rte_always_inline void
1375 drain_eth_rx(struct vhost_dev *vdev)
1376 {
1377         uint16_t rx_count, enqueue_count;
1378         struct rte_mbuf *pkts[MAX_PKT_BURST];
1379
1380         rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1381                                     pkts, MAX_PKT_BURST);
1382
1383         if (!rx_count)
1384                 return;
1385
1386         /*
1387          * When "enable_retry" is set, wait and retry when there are not
1388          * enough free slots in the queue to hold @rx_count packets, in
1389          * order to reduce packet loss.
1390          */
1391         if (enable_retry &&
1392             unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1393                         VIRTIO_RXQ))) {
1394                 uint32_t retry;
1395
1396                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1397                         rte_delay_us(burst_rx_delay_time);
1398                         if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1399                                         VIRTIO_RXQ))
1400                                 break;
1401                 }
1402         }
1403
1404         enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
1405                                         VIRTIO_RXQ, pkts, rx_count);
1406
1407         if (enable_stats) {
1408                 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1409                                 __ATOMIC_SEQ_CST);
1410                 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1411                                 __ATOMIC_SEQ_CST);
1412         }
1413
1414         if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
1415                 free_pkts(pkts, rx_count);
1416 }
1417
1418 uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1419                             struct rte_mempool *mbuf_pool,
1420                             struct rte_mbuf **pkts, uint16_t count)
1421 {
1422         int nr_inflight;
1423         uint16_t dequeue_count;
1424         int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;
1425
1426         dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
1427                         mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);
1428
1429         return dequeue_count;
1430 }
1431
1432 uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
1433                            struct rte_mempool *mbuf_pool,
1434                            struct rte_mbuf **pkts, uint16_t count)
1435 {
1436         return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
1437 }
1438
1439 static __rte_always_inline void
1440 drain_virtio_tx(struct vhost_dev *vdev)
1441 {
1442         struct rte_mbuf *pkts[MAX_PKT_BURST];
1443         uint16_t count;
1444         uint16_t i;
1445
1446         count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
1447                                 VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);
1448
1449         /* setup VMDq for the first packet */
1450         if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1451                 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1452                         free_pkts(pkts, count);
1453         }
1454
1455         for (i = 0; i < count; ++i)
1456                 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1457 }
1458
1459 /*
1460  * Main function of vhost-switch. It basically does:
1461  *
1462  * for each vhost device {
1463  *    - drain_eth_rx()
1464  *
1465  *      Which drains the host eth Rx queue linked to the vhost device
1466  *      and delivers all received packets to the guest virtio Rx ring
1467  *      associated with this vhost device.
1468  *
1469  *    - drain_virtio_tx()
1470  *
1471  *      Which drains the guest virtio Tx queue and delivers all packets
1472  *      to the target, which could be another vhost device or the
1473  *      physical eth dev. The routing is done in "virtio_tx_route".
1474  * }
1475  */
1476 static int
1477 switch_worker(void *arg __rte_unused)
1478 {
1479         unsigned i;
1480         unsigned lcore_id = rte_lcore_id();
1481         struct vhost_dev *vdev;
1482         struct mbuf_table *tx_q;
1483
1484         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1485
1486         tx_q = &lcore_tx_queue[lcore_id];
1487         for (i = 0; i < rte_lcore_count(); i++) {
1488                 if (lcore_ids[i] == lcore_id) {
1489                         tx_q->txq_id = i;
1490                         break;
1491                 }
1492         }
1493
1494         while (1) {
1495                 drain_mbuf_table(tx_q);
1496                 drain_vhost_table();
1497                 /*
1498                  * If requested, inform the configuration core that we have
1499                  * finished iterating the device list and that no devices
1500                  * are in use.
1501                  */
1501                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1502                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1503
1504                 /*
1505                  * Process vhost devices
1506                  */
1507                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1508                               lcore_vdev_entry) {
1509                         if (unlikely(vdev->remove)) {
1510                                 unlink_vmdq(vdev);
1511                                 vdev->ready = DEVICE_SAFE_REMOVE;
1512                                 continue;
1513                         }
1514
1515                         if (likely(vdev->ready == DEVICE_RX))
1516                                 drain_eth_rx(vdev);
1517
1518                         if (likely(!vdev->remove))
1519                                 drain_virtio_tx(vdev);
1520                 }
1521         }
1522
1523         return 0;
1524 }
1525
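/*
 * Drain all packets still in flight on the async channel of the given queue
 * and free the completed mbufs. Not thread safe: the caller must make sure
 * no other thread is operating on this vring at the same time.
 */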
1526 static void
1527 vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
1528 {
1529         uint16_t n_pkt = 0;
1530         int pkts_inflight;
1531
1532         int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
1533         pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);
1534
1535         struct rte_mbuf *m_cpl[pkts_inflight];
1536
1537         while (pkts_inflight) {
1538                 n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
1539                                                         pkts_inflight, dma_id, 0);
1540                 free_pkts(m_cpl, n_pkt);
1541                 pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
1542                                                                         queue_id);
1543         }
1544 }
1545
1546 /*
1547  * Remove a device from the specific data core linked list and from the
1548  * main linked list. Synchronization occurs through the use of the
1549  * lcore dev_removal_flag; dev->remove is set first so the data core stops
1550  * processing the device and the rte_pause() wait loop can exit.
1551  */
1552 static void
1553 destroy_device(int vid)
1554 {
1555         struct vhost_dev *vdev = NULL;
1556         int lcore;
1557         uint16_t i;
1558
1559         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1560                 if (vdev->vid == vid)
1561                         break;
1562         }
1563         if (!vdev)
1564                 return;
1565         /* Set the remove flag. */
1566         vdev->remove = 1;
1567         while (vdev->ready != DEVICE_SAFE_REMOVE) {
1568                 rte_pause();
1569         }
1570
1571         for (i = 0; i < RTE_MAX_LCORE; i++)
1572                 rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1573
1574         if (builtin_net_driver)
1575                 vs_vhost_net_remove(vdev);
1576
1577         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1578                      lcore_vdev_entry);
1579         TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1580
1581
1582         /* Set the dev_removal_flag on each lcore. */
1583         RTE_LCORE_FOREACH_WORKER(lcore)
1584                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1585
1586         /*
1587          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1588          * we can be sure that they can no longer access the device removed
1589          * from the linked lists and that the devices are no longer in use.
1590          */
1591         RTE_LCORE_FOREACH_WORKER(lcore) {
1592                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1593                         rte_pause();
1594         }
1595
1596         lcore_info[vdev->coreid].device_num--;
1597
1598         RTE_LOG(INFO, VHOST_DATA,
1599                 "(%d) device has been removed from data core\n",
1600                 vdev->vid);
1601
1602         if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1603                 vhost_clear_queue_thread_unsafe(vdev, VIRTIO_RXQ);
1604                 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1605                 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1606         }
1607
1608         if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
1609                 vhost_clear_queue_thread_unsafe(vdev, VIRTIO_TXQ);
1610                 rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
1611                 dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
1612         }
1613
1614         rte_free(vdev);
1615 }
1616
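/*
 * Map a vhost device ID to the index of its socket file by matching the
 * device ifname against the registered socket paths. Returns -1 if no
 * match is found.
 */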
1617 static inline int
1618 get_socketid_by_vid(int vid)
1619 {
1620         int i;
1621         char ifname[PATH_MAX];
1622         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
1623
1624         for (i = 0; i < nb_sockets; i++) {
1625                 char *file = socket_files + i * PATH_MAX;
1626                 if (strcmp(file, ifname) == 0)
1627                         return i;
1628         }
1629
1630         return -1;
1631 }
1632
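/*
 * Select the per-device enqueue/dequeue burst functions: the builtin net
 * driver paths when enabled, otherwise the async or sync vhost paths,
 * depending on whether the async channel is enabled for the Rx/Tx queue.
 */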
1633 static int
1634 init_vhost_queue_ops(int vid)
1635 {
1636         if (builtin_net_driver) {
1637                 vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
1638                 vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
1639         } else {
1640                 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
1641                         vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
1642                 else
1643                         vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;
1644
1645                 if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
1646                         vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
1647                 else
1648                         vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
1649         }
1650
1651         return 0;
1652 }
1653
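/*
 * Register async channels for the Rx and Tx queues that have a DMA device
 * bound to them, and mark those queues as async enabled on success.
 * Returns non-zero if any registration fails.
 */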
1654 static inline int
1655 vhost_async_channel_register(int vid)
1656 {
1657         int rx_ret = 0, tx_ret = 0;
1658
1659         if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1660                 rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1661                 if (rx_ret == 0)
1662                         dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
1663         }
1664
1665         if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
1666                 tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
1667                 if (tx_ret == 0)
1668                         dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
1669         }
1670
1671         return rx_ret | tx_ret;
1672 }
1673
1674
1675
1676 /*
1677  * A new device is added to a data core. First the device is added to the main linked list
1678  * and then allocated to a specific data core.
1679  */
1680 static int
1681 new_device(int vid)
1682 {
1683         int lcore, core_add = 0;
1684         uint16_t i;
1685         uint32_t device_num_min = num_devices;
1686         struct vhost_dev *vdev;
1687         int ret;
1688
1689         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1690         if (vdev == NULL) {
1691                 RTE_LOG(INFO, VHOST_DATA,
1692                         "(%d) couldn't allocate memory for vhost dev\n",
1693                         vid);
1694                 return -1;
1695         }
1696         vdev->vid = vid;
1697
1698         for (i = 0; i < RTE_MAX_LCORE; i++) {
1699                 vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1700                         = rte_zmalloc("vhost bufftable",
1701                                 sizeof(struct vhost_bufftable),
1702                                 RTE_CACHE_LINE_SIZE);
1703
1704                 if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1705                         RTE_LOG(INFO, VHOST_DATA,
1706                           "(%d) couldn't allocate memory for vhost TX\n", vid);
1707                         return -1;
1708                 }
1709         }
1710
1711         int socketid = get_socketid_by_vid(vid);
1712         if (socketid == -1)
1713                 return -1;
1714
1715         init_vid2socketid_array(vid, socketid);
1716
1717         ret = vhost_async_channel_register(vid);
1718
1719         if (init_vhost_queue_ops(vid) != 0)
1720                 return -1;
1721
1722         if (builtin_net_driver)
1723                 vs_vhost_net_setup(vdev);
1724
1725         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1726         vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1727
1728         /* Reset the ready flag. */
1729         vdev->ready = DEVICE_MAC_LEARNING;
1730         vdev->remove = 0;
1731
1732         /* Find a suitable lcore to add the device. */
1733         RTE_LCORE_FOREACH_WORKER(lcore) {
1734                 if (lcore_info[lcore].device_num < device_num_min) {
1735                         device_num_min = lcore_info[lcore].device_num;
1736                         core_add = lcore;
1737                 }
1738         }
1739         vdev->coreid = core_add;
1740
1741         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1742                           lcore_vdev_entry);
1743         lcore_info[vdev->coreid].device_num++;
1744
1745         /* Disable notifications. */
1746         rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1747         rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1748
1749         RTE_LOG(INFO, VHOST_DATA,
1750                 "(%d) device has been added to data core %d\n",
1751                 vid, vdev->coreid);
1752
1753         return ret;
1754 }
1755
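/*
 * Called when the guest enables or disables a vring. Only the Rx queue is
 * handled here: when an async-enabled Rx queue is disabled, its in-flight
 * DMA copies are drained first so no descriptors are left pending.
 */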
1756 static int
1757 vring_state_changed(int vid, uint16_t queue_id, int enable)
1758 {
1759         struct vhost_dev *vdev = NULL;
1760
1761         TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1762                 if (vdev->vid == vid)
1763                         break;
1764         }
1765         if (!vdev)
1766                 return -1;
1767
1768         if (queue_id != VIRTIO_RXQ)
1769                 return 0;
1770
1771         if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
1772                 if (!enable)
1773                         vhost_clear_queue_thread_unsafe(vdev, queue_id);
1774         }
1775
1776         return 0;
1777 }
1778
1779 /*
1780  * These callbacks allow devices to be added to a data core when configuration
1781  * has fully completed.
1782  */
1783 static const struct rte_vhost_device_ops virtio_net_device_ops =
1784 {
1785         .new_device =  new_device,
1786         .destroy_device = destroy_device,
1787         .vring_state_changed = vring_state_changed,
1788 };
1789
1790 /*
1791  * This thread wakes up periodically and prints statistics if the user has
1792  * enabled them.
1793  */
1794 static void *
1795 print_stats(__rte_unused void *arg)
1796 {
1797         struct vhost_dev *vdev;
1798         uint64_t tx_dropped, rx_dropped;
1799         uint64_t tx, tx_total, rx, rx_total;
1800         const char clr[] = { 27, '[', '2', 'J', '\0' };
1801         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1802
1803         while (1) {
1804                 sleep(enable_stats);
1805
1806                 /* Clear screen and move to top left */
1807                 printf("%s%s\n", clr, top_left);
1808                 printf("Device statistics =================================\n");
1809
1810                 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1811                         tx_total   = vdev->stats.tx_total;
1812                         tx         = vdev->stats.tx;
1813                         tx_dropped = tx_total - tx;
1814
1815                         rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1816                                 __ATOMIC_SEQ_CST);
1817                         rx         = __atomic_load_n(&vdev->stats.rx_atomic,
1818                                 __ATOMIC_SEQ_CST);
1819                         rx_dropped = rx_total - rx;
1820
1821                         printf("Statistics for device %d\n"
1822                                 "-----------------------\n"
1823                                 "TX total:              %" PRIu64 "\n"
1824                                 "TX dropped:            %" PRIu64 "\n"
1825                                 "TX successful:         %" PRIu64 "\n"
1826                                 "RX total:              %" PRIu64 "\n"
1827                                 "RX dropped:            %" PRIu64 "\n"
1828                                 "RX successful:         %" PRIu64 "\n",
1829                                 vdev->vid,
1830                                 tx_total, tx_dropped, tx,
1831                                 rx_total, rx_dropped, rx);
1832                 }
1833
1834                 printf("===================================================\n");
1835
1836                 fflush(stdout);
1837         }
1838
1839         return NULL;
1840 }
1841
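/* Unregister the vhost driver for each registered socket file. */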
1842 static void
1843 unregister_drivers(int socket_num)
1844 {
1845         int i, ret;
1846
1847         for (i = 0; i < socket_num; i++) {
1848                 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1849                 if (ret != 0)
1850                         RTE_LOG(ERR, VHOST_CONFIG,
1851                                 "Failed to unregister vhost driver for %s.\n",
1852                                 socket_files + i * PATH_MAX);
1853         }
1854 }
1855
1856 /* When we receive an INT signal, unregister the vhost driver. */
1857 static void
1858 sigint_handler(__rte_unused int signum)
1859 {
1860         /* Unregister vhost driver. */
1861         unregister_drivers(nb_sockets);
1862
1863         exit(0);
1864 }
1865
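/*
 * Initialize every per-queue DMA binding of every vhost device to
 * INVALID_DMA_ID with async copy disabled, and clear the dmas_id table.
 */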
1866 static void
1867 reset_dma(void)
1868 {
1869         int i;
1870
1871         for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1872                 int j;
1873
1874                 for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1875                         dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1876                         dma_bind[i].dmas[j].async_enabled = false;
1877                 }
1878         }
1879
1880         for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1881                 dmas_id[i] = INVALID_DMA_ID;
1882 }
1883
1884 /*
1885  * Main function, does initialisation and calls the per-lcore functions.
1886  */
1887 int
1888 main(int argc, char *argv[])
1889 {
1890         unsigned lcore_id, core_id = 0;
1891         unsigned nb_ports, valid_num_ports;
1892         int ret, i;
1893         uint16_t portid;
1894         static pthread_t tid;
1895         uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1896
1897         signal(SIGINT, sigint_handler);
1898
1899         /* init EAL */
1900         ret = rte_eal_init(argc, argv);
1901         if (ret < 0)
1902                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1903         argc -= ret;
1904         argv += ret;
1905
1906         /* initialize dma structures */
1907         reset_dma();
1908
1909         /* parse app arguments */
1910         ret = us_vhost_parse_args(argc, argv);
1911         if (ret < 0)
1912                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1913
1914         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1915                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1916
1917                 if (rte_lcore_is_enabled(lcore_id))
1918                         lcore_ids[core_id++] = lcore_id;
1919         }
1920
1921         if (rte_lcore_count() > RTE_MAX_LCORE)
1922                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1923
1924         /* Get the number of physical ports. */
1925         nb_ports = rte_eth_dev_count_avail();
1926
1927         /*
1928          * Update the global variable num_ports and the global ports array,
1929          * and derive valid_num_ports from the number of ports on the system.
1930          */
1931         valid_num_ports = check_ports_num(nb_ports);
1932
1933         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1934                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1935                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1936                 return -1;
1937         }
1938
1939         /*
1940          * FIXME: here we are trying to allocate mbufs big enough for
1941          * @MAX_QUEUES, but the truth is we're never going to use that
1942          * many queues here. We probably should only do allocation for
1943          * those queues we are going to use.
1944          */
1945         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
1946                                             MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
1947                                             rte_socket_id());
1948         if (mbuf_pool == NULL)
1949                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1950
1951         if (vm2vm_mode == VM2VM_HARDWARE) {
1952                 /* Enable VT loop back to let L2 switch to do it. */
1953                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1954                 RTE_LOG(DEBUG, VHOST_CONFIG,
1955                         "Enable loop back for L2 switch in vmdq.\n");
1956         }
1957
1958         /* initialize all ports */
1959         RTE_ETH_FOREACH_DEV(portid) {
1960                 /* skip ports that are not enabled */
1961                 if ((enabled_port_mask & (1 << portid)) == 0) {
1962                         RTE_LOG(INFO, VHOST_PORT,
1963                                 "Skipping disabled port %d\n", portid);
1964                         continue;
1965                 }
1966                 if (port_init(portid) != 0)
1967                         rte_exit(EXIT_FAILURE,
1968                                 "Cannot initialize network ports\n");
1969         }
1970
1971         /* Enable stats if the user option is set. */
1972         if (enable_stats) {
1973                 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1974                                         print_stats, NULL);
1975                 if (ret < 0)
1976                         rte_exit(EXIT_FAILURE,
1977                                 "Cannot create print-stats thread\n");
1978         }
1979
1980         /* Launch all data cores. */
1981         RTE_LCORE_FOREACH_WORKER(lcore_id)
1982                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1983
1984         if (client_mode)
1985                 flags |= RTE_VHOST_USER_CLIENT;
1986
1987         for (i = 0; i < dma_count; i++) {
1988                 if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1989                         RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
1990                         rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
1991                 }
1992         }
1993
1994         /* Register vhost user driver to handle vhost messages. */
1995         for (i = 0; i < nb_sockets; i++) {
1996                 char *file = socket_files + i * PATH_MAX;
1997
1998                 if (dma_count && get_async_flag_by_socketid(i) != 0)
1999                         flags |= RTE_VHOST_USER_ASYNC_COPY;
2000
2001                 ret = rte_vhost_driver_register(file, flags);
2002                 if (ret != 0) {
2003                         unregister_drivers(i);
2004                         rte_exit(EXIT_FAILURE,
2005                                 "vhost driver register failure.\n");
2006                 }
2007
2008                 if (builtin_net_driver)
2009                         rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
2010
2011                 if (mergeable == 0) {
2012                         rte_vhost_driver_disable_features(file,
2013                                 1ULL << VIRTIO_NET_F_MRG_RXBUF);
2014                 }
2015
2016                 if (enable_tx_csum == 0) {
2017                         rte_vhost_driver_disable_features(file,
2018                                 1ULL << VIRTIO_NET_F_CSUM);
2019                 }
2020
2021                 if (enable_tso == 0) {
2022                         rte_vhost_driver_disable_features(file,
2023                                 1ULL << VIRTIO_NET_F_HOST_TSO4);
2024                         rte_vhost_driver_disable_features(file,
2025                                 1ULL << VIRTIO_NET_F_HOST_TSO6);
2026                         rte_vhost_driver_disable_features(file,
2027                                 1ULL << VIRTIO_NET_F_GUEST_TSO4);
2028                         rte_vhost_driver_disable_features(file,
2029                                 1ULL << VIRTIO_NET_F_GUEST_TSO6);
2030                 }
2031
2032                 if (promiscuous) {
2033                         rte_vhost_driver_enable_features(file,
2034                                 1ULL << VIRTIO_NET_F_CTRL_RX);
2035                 }
2036
2037                 ret = rte_vhost_driver_callback_register(file,
2038                         &virtio_net_device_ops);
2039                 if (ret != 0) {
2040                         rte_exit(EXIT_FAILURE,
2041                                 "failed to register vhost driver callbacks.\n");
2042                 }
2043
2044                 if (rte_vhost_driver_start(file) < 0) {
2045                         rte_exit(EXIT_FAILURE,
2046                                 "failed to start vhost driver.\n");
2047                 }
2048         }
2049
2050         RTE_LCORE_FOREACH_WORKER(lcore_id)
2051                 rte_eal_wait_lcore(lcore_id);
2052
2053         /* clean up the EAL */
2054         rte_eal_cleanup();
2055
2056         return 0;
2057 }