examples/vhost: use MAC compare helper
[dpdk.git] examples/vhost/main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55
56 #include "main.h"
57
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64
65 /*
66  * Calculate the number of buffers needed per port
67  */
68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +	\
69 				(num_switching_cores * MAX_PKT_BURST) +		\
70 				(num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
71 				((num_switching_cores + 1) * MBUF_CACHE_SIZE))
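/*
 * Illustrative sizing: with the defaults below (MAX_QUEUES = 128,
 * RTE_TEST_RX_DESC_DEFAULT = 1024) the first term alone reserves
 * 128 * 1024 = 131072 mbufs; the remaining terms add headroom for in-flight
 * bursts, TX descriptors and the per-core mempool caches. The macro can
 * reference MBUF_CACHE_SIZE and num_switching_cores before their definitions
 * because it is only expanded at its point of use (mbuf pool creation in
 * main()).
 */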
72
73 #define MBUF_CACHE_SIZE 128
74 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
75
76 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
77 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
78
79 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
80 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
81
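/* 0x2600 = 9728 bytes: room for a 9000-byte jumbo payload plus L2 headers. */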
82 #define JUMBO_FRAME_MAX_SIZE    0x2600
83
84 /* State of virtio device. */
85 #define DEVICE_MAC_LEARNING 0
86 #define DEVICE_RX                       1
87 #define DEVICE_SAFE_REMOVE      2
88
89 /* Configurable number of RX/TX ring descriptors */
90 #define RTE_TEST_RX_DESC_DEFAULT 1024
91 #define RTE_TEST_TX_DESC_DEFAULT 512
92
93 #define INVALID_PORT_ID 0xFF
94
95 /* Max number of devices. Limited by vmdq. */
96 #define MAX_DEVICES 64
97
98 /* Size of buffers used for snprintfs. */
99 #define MAX_PRINT_BUFF 6072
100
101 /* Maximum character device basename size. */
102 #define MAX_BASENAME_SZ 10
103
104 /* Maximum long option length for option parsing. */
105 #define MAX_LONG_OPT_SZ 64
106
107 /* mask of enabled ports */
108 static uint32_t enabled_port_mask = 0;
109
110 /* Promiscuous mode */
111 static uint32_t promiscuous;
112
113 /*Number of switching cores enabled*/
114 static uint32_t num_switching_cores = 0;
115
116 /* number of devices/queues to support*/
117 static uint32_t num_queues = 0;
118 static uint32_t num_devices;
119
120 static struct rte_mempool *mbuf_pool;
121 static int mergeable;
122
123 /* Do vlan strip on host, enabled on default */
124 static uint32_t vlan_strip = 1;
125
126 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
127 typedef enum {
128         VM2VM_DISABLED = 0,
129         VM2VM_SOFTWARE = 1,
130         VM2VM_HARDWARE = 2,
131         VM2VM_LAST
132 } vm2vm_type;
133 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
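/*
 * VM2VM_SOFTWARE forwards guest-to-guest traffic in software via the MAC
 * lookup in virtio_tx_local(); VM2VM_HARDWARE instead relies on the NIC's
 * VMDQ loopback (enabled in main()) so the NIC's L2 switch does the
 * forwarding, with find_local_dest() supplying the destination VLAN tag.
 */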
134
135 /* Enable stats. */
136 static uint32_t enable_stats = 0;
137 /* Enable retries on RX. */
138 static uint32_t enable_retry = 1;
139
140 /* Disable TX checksum offload */
141 static uint32_t enable_tx_csum;
142
143 /* Disable TSO offload */
144 static uint32_t enable_tso;
145
146 /* Specify timeout (in useconds) between retries on RX. */
147 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
148 /* Specify the number of retries on RX. */
149 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
150
151 /* Character device basename. Can be set by user. */
152 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
153
154 /* Empty vmdq configuration structure. Filled in programmatically. */
155 static struct rte_eth_conf vmdq_conf_default = {
156         .rxmode = {
157                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
158                 .split_hdr_size = 0,
159                 .header_split   = 0, /**< Header Split disabled */
160                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
161                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
162                 /*
163                  * VLAN strip is necessary for 1G NICs such as the I350;
164                  * it fixes a bug where IPv4 forwarding in the guest
165                  * could not forward packets from one virtio dev to another.
166                  */
167                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
168                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
169                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
170         },
171
172         .txmode = {
173                 .mq_mode = ETH_MQ_TX_NONE,
174         },
175         .rx_adv_conf = {
176                 /*
177                  * should be overridden separately in code with
178                  * appropriate values
179                  */
180                 .vmdq_rx_conf = {
181                         .nb_queue_pools = ETH_8_POOLS,
182                         .enable_default_pool = 0,
183                         .default_pool = 0,
184                         .nb_pool_maps = 0,
185                         .pool_map = {{0, 0},},
186                 },
187         },
188 };
189
190 static unsigned lcore_ids[RTE_MAX_LCORE];
191 static uint8_t ports[RTE_MAX_ETHPORTS];
192 static unsigned num_ports = 0; /**< The number of ports specified in command line */
193 static uint16_t num_pf_queues, num_vmdq_queues;
194 static uint16_t vmdq_pool_base, vmdq_queue_base;
195 static uint16_t queues_per_pool;
196
197 const uint16_t vlan_tags[] = {
198         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
199         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
200         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
201         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
202         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
203         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
204         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
205         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
206 };
207
208 /* ethernet addresses of ports */
209 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
210
211 static struct vhost_dev_tailq_list vhost_dev_list =
212         TAILQ_HEAD_INITIALIZER(vhost_dev_list);
213
214 static struct lcore_info lcore_info[RTE_MAX_LCORE];
215
216 /* Used for queueing bursts of TX packets. */
217 struct mbuf_table {
218         unsigned len;
219         unsigned txq_id;
220         struct rte_mbuf *m_table[MAX_PKT_BURST];
221 };
222
223 /* TX queue for each data core. */
224 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
225
226 #define VLAN_HLEN       4
227
228 /* Per-device statistics struct */
229 struct device_statistics {
230         uint64_t tx_total;
231         rte_atomic64_t rx_total_atomic;
232         uint64_t tx;
233         rte_atomic64_t rx_atomic;
234 } __rte_cache_aligned;
235 struct device_statistics dev_statistics[MAX_DEVICES];
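/*
 * The RX counters are atomic because a device's RX statistics may be updated
 * from another device's TX core (see virtio_tx_local()), while the TX
 * counters are only ever touched by the core that owns the device.
 */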
236
237 /*
238  * Builds up the correct configuration for VMDQ VLAN pool map
239  * according to the pool & queue limits.
240  */
241 static inline int
242 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
243 {
244         struct rte_eth_vmdq_rx_conf conf;
245         struct rte_eth_vmdq_rx_conf *def_conf =
246                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
247         unsigned i;
248
249         memset(&conf, 0, sizeof(conf));
250         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
251         conf.nb_pool_maps = num_devices;
252         conf.enable_loop_back = def_conf->enable_loop_back;
253         conf.rx_mode = def_conf->rx_mode;
254
255         for (i = 0; i < conf.nb_pool_maps; i++) {
256                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
257                 conf.pool_map[i].pools = (1UL << i);
258         }
259
260         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
261         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
262                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
263         return 0;
264 }
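/*
 * For example, with num_devices = 2 the map built above is VLAN 1000 ->
 * pool 0 and VLAN 1001 -> pool 1, so each guest's traffic is steered to
 * its own VMDQ pool by VLAN tag.
 */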
265
266 /*
267  * Validate the device number against the max pool number obtained from
268  * dev_info. If the device number is invalid, print an error message and
269  * return -1. Each device must have its own pool.
270  */
271 static inline int
272 validate_num_devices(uint32_t max_nb_devices)
273 {
274         if (num_devices > max_nb_devices) {
275                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
276                 return -1;
277         }
278         return 0;
279 }
280
281 /*
282  * Initialises a given port using global settings and with the rx buffers
283  * coming from the mbuf_pool passed as parameter
284  */
285 static inline int
286 port_init(uint8_t port)
287 {
288         struct rte_eth_dev_info dev_info;
289         struct rte_eth_conf port_conf;
290         struct rte_eth_rxconf *rxconf;
291         struct rte_eth_txconf *txconf;
292         int16_t rx_rings, tx_rings;
293         uint16_t rx_ring_size, tx_ring_size;
294         int retval;
295         uint16_t q;
296
297         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
298         rte_eth_dev_info_get (port, &dev_info);
299
300         if (dev_info.max_rx_queues > MAX_QUEUES) {
301                 rte_exit(EXIT_FAILURE,
302                         "please define MAX_QUEUES no less than %u in %s\n",
303                         dev_info.max_rx_queues, __FILE__);
304         }
305
306         rxconf = &dev_info.default_rxconf;
307         txconf = &dev_info.default_txconf;
308         rxconf->rx_drop_en = 1;
309
310         /* Enable vlan offload */
311         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
312
313         /* Configure the number of supported virtio devices based on VMDQ limits */
314         num_devices = dev_info.max_vmdq_pools;
315
316         rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
317         tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
318         tx_rings = (uint16_t)rte_lcore_count();
319
320         retval = validate_num_devices(MAX_DEVICES);
321         if (retval < 0)
322                 return retval;
323
324         /* Get port configuration. */
325         retval = get_eth_conf(&port_conf, num_devices);
326         if (retval < 0)
327                 return retval;
328         /* NIC queues are divided into pf queues and vmdq queues.  */
329         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
330         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
331         num_vmdq_queues = num_devices * queues_per_pool;
332         num_queues = num_pf_queues + num_vmdq_queues;
333         vmdq_queue_base = dev_info.vmdq_queue_base;
334         vmdq_pool_base  = dev_info.vmdq_pool_base;
335         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
336                 num_pf_queues, num_devices, queues_per_pool);
337
338         if (port >= rte_eth_dev_count()) return -1;
339
340         if (enable_tx_csum == 0)
341                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
342
343         if (enable_tso == 0) {
344                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
345                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
346         }
347
348         rx_rings = (uint16_t)dev_info.max_rx_queues;
349         /* Configure ethernet device. */
350         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
351         if (retval != 0)
352                 return retval;
353
354         /* Setup the queues. */
355         for (q = 0; q < rx_rings; q ++) {
356                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
357                                                 rte_eth_dev_socket_id(port),
358                                                 rxconf,
359                                                 mbuf_pool);
360                 if (retval < 0)
361                         return retval;
362         }
363         for (q = 0; q < tx_rings; q ++) {
364                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
365                                                 rte_eth_dev_socket_id(port),
366                                                 txconf);
367                 if (retval < 0)
368                         return retval;
369         }
370
371         /* Start the device. */
372         retval  = rte_eth_dev_start(port);
373         if (retval < 0) {
374                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
375                 return retval;
376         }
377
378         if (promiscuous)
379                 rte_eth_promiscuous_enable(port);
380
381         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
382         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
383         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
384                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
385                         (unsigned)port,
386                         vmdq_ports_eth_addr[port].addr_bytes[0],
387                         vmdq_ports_eth_addr[port].addr_bytes[1],
388                         vmdq_ports_eth_addr[port].addr_bytes[2],
389                         vmdq_ports_eth_addr[port].addr_bytes[3],
390                         vmdq_ports_eth_addr[port].addr_bytes[4],
391                         vmdq_ports_eth_addr[port].addr_bytes[5]);
392
393         return 0;
394 }
395
396 /*
397  * Set character device basename.
398  */
399 static int
400 us_vhost_parse_basename(const char *q_arg)
401 {
402         /* the basename must fit in the buffer, including the '\0' terminator */
403
404         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
405                 return -1;
406         else
407                 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
408
409         return 0;
410 }
411
412 /*
413  * Parse the portmask provided at run time.
414  */
415 static int
416 parse_portmask(const char *portmask)
417 {
418         char *end = NULL;
419         unsigned long pm;
420
421         errno = 0;
422
423         /* parse hexadecimal string */
424         pm = strtoul(portmask, &end, 16);
425         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
426                 return -1;
427
428         if (pm == 0)
429                 return -1;
430
431         return pm;
432
433 }
434
435 /*
436  * Parse num options at run time.
437  */
438 static int
439 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
440 {
441         char *end = NULL;
442         unsigned long num;
443
444         errno = 0;
445
446         /* parse unsigned int string */
447         num = strtoul(q_arg, &end, 10);
448         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
449                 return -1;
450
451         if (num > max_valid_value)
452                 return -1;
453
454         return num;
455
456 }
457
458 /*
459  * Display usage
460  */
461 static void
462 us_vhost_usage(const char *prgname)
463 {
464         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
465         "               --vm2vm [0|1|2]\n"
466         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
467         "               --dev-basename <name>\n"
468         "               --nb-devices ND\n"
469         "               -p PORTMASK: Set mask for ports to be used by application\n"
470         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
471         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
472         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
473         "               --rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
474         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
475         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
476         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
477         "               --dev-basename: The basename to be used for the character device.\n"
478         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
479         "               --tso [0|1] disable/enable TCP segment offload.\n",
480                prgname);
481 }
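/*
 * A typical invocation might look like this (illustrative only; the binary
 * name and EAL arguments depend on the build and target machine):
 *
 *   ./vhost-switch -c 0xf -n 4 -- -p 0x1 --vm2vm 1 --stats 2
 *
 * i.e. use port 0, software VM2VM switching, and print stats every 2 seconds.
 */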
482
483 /*
484  * Parse the arguments given in the command line of the application.
485  */
486 static int
487 us_vhost_parse_args(int argc, char **argv)
488 {
489         int opt, ret;
490         int option_index;
491         unsigned i;
492         const char *prgname = argv[0];
493         static struct option long_option[] = {
494                 {"vm2vm", required_argument, NULL, 0},
495                 {"rx-retry", required_argument, NULL, 0},
496                 {"rx-retry-delay", required_argument, NULL, 0},
497                 {"rx-retry-num", required_argument, NULL, 0},
498                 {"mergeable", required_argument, NULL, 0},
499                 {"vlan-strip", required_argument, NULL, 0},
500                 {"stats", required_argument, NULL, 0},
501                 {"dev-basename", required_argument, NULL, 0},
502                 {"tx-csum", required_argument, NULL, 0},
503                 {"tso", required_argument, NULL, 0},
504                 {NULL, 0, 0, 0},
505         };
506
507         /* Parse command line */
508         while ((opt = getopt_long(argc, argv, "p:P",
509                         long_option, &option_index)) != EOF) {
510                 switch (opt) {
511                 /* Portmask */
512                 case 'p':
513                         enabled_port_mask = parse_portmask(optarg);
514                         if (enabled_port_mask == (uint32_t)-1) {
515                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
516                                 us_vhost_usage(prgname);
517                                 return -1;
518                         }
519                         break;
520
521                 case 'P':
522                         promiscuous = 1;
523                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
524                                 ETH_VMDQ_ACCEPT_BROADCAST |
525                                 ETH_VMDQ_ACCEPT_MULTICAST;
526                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
527
528                         break;
529
530                 case 0:
531                         /* Enable/disable vm2vm comms. */
532                         if (!strncmp(long_option[option_index].name, "vm2vm",
533                                 MAX_LONG_OPT_SZ)) {
534                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
535                                 if (ret == -1) {
536                                         RTE_LOG(INFO, VHOST_CONFIG,
537                                                 "Invalid argument for "
538                                                 "vm2vm [0|1|2]\n");
539                                         us_vhost_usage(prgname);
540                                         return -1;
541                                 } else {
542                                         vm2vm_mode = (vm2vm_type)ret;
543                                 }
544                         }
545
546                         /* Enable/disable retries on RX. */
547                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
548                                 ret = parse_num_opt(optarg, 1);
549                                 if (ret == -1) {
550                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
551                                         us_vhost_usage(prgname);
552                                         return -1;
553                                 } else {
554                                         enable_retry = ret;
555                                 }
556                         }
557
558                         /* Enable/disable TX checksum offload. */
559                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
560                                 ret = parse_num_opt(optarg, 1);
561                                 if (ret == -1) {
562                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
563                                         us_vhost_usage(prgname);
564                                         return -1;
565                                 } else
566                                         enable_tx_csum = ret;
567                         }
568
569                         /* Enable/disable TSO offload. */
570                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
571                                 ret = parse_num_opt(optarg, 1);
572                                 if (ret == -1) {
573                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
574                                         us_vhost_usage(prgname);
575                                         return -1;
576                                 } else
577                                         enable_tso = ret;
578                         }
579
580                         /* Specify the retries delay time (in useconds) on RX. */
581                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
582                                 ret = parse_num_opt(optarg, INT32_MAX);
583                                 if (ret == -1) {
584                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
585                                         us_vhost_usage(prgname);
586                                         return -1;
587                                 } else {
588                                         burst_rx_delay_time = ret;
589                                 }
590                         }
591
592                         /* Specify the retries number on RX. */
593                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
594                                 ret = parse_num_opt(optarg, INT32_MAX);
595                                 if (ret == -1) {
596                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
597                                         us_vhost_usage(prgname);
598                                         return -1;
599                                 } else {
600                                         burst_rx_retry_num = ret;
601                                 }
602                         }
603
604                         /* Enable/disable RX mergeable buffers. */
605                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
606                                 ret = parse_num_opt(optarg, 1);
607                                 if (ret == -1) {
608                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
609                                         us_vhost_usage(prgname);
610                                         return -1;
611                                 } else {
612                                         mergeable = !!ret;
613                                         if (ret) {
614                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
615                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
616                                                         = JUMBO_FRAME_MAX_SIZE;
617                                         }
618                                 }
619                         }
620
621                         /* Enable/disable RX VLAN strip on host. */
622                         if (!strncmp(long_option[option_index].name,
623                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
624                                 ret = parse_num_opt(optarg, 1);
625                                 if (ret == -1) {
626                                         RTE_LOG(INFO, VHOST_CONFIG,
627                                                 "Invalid argument for VLAN strip [0|1]\n");
628                                         us_vhost_usage(prgname);
629                                         return -1;
630                                 } else {
631                                         vlan_strip = !!ret;
632                                         vmdq_conf_default.rxmode.hw_vlan_strip =
633                                                 vlan_strip;
634                                 }
635                         }
636
637                         /* Enable/disable stats. */
638                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
639                                 ret = parse_num_opt(optarg, INT32_MAX);
640                                 if (ret == -1) {
641                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
642                                         us_vhost_usage(prgname);
643                                         return -1;
644                                 } else {
645                                         enable_stats = ret;
646                                 }
647                         }
648
649                         /* Set character device basename. */
650                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
651                                 if (us_vhost_parse_basename(optarg) == -1) {
652                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
653                                         us_vhost_usage(prgname);
654                                         return -1;
655                                 }
656                         }
657
658                         break;
659
660                         /* Invalid option - print options. */
661                 default:
662                         us_vhost_usage(prgname);
663                         return -1;
664                 }
665         }
666
667         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
668                 if (enabled_port_mask & (1 << i))
669                         ports[num_ports++] = (uint8_t)i;
670         }
671
672         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
673                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
674                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
675                 return -1;
676         }
677
678         return 0;
679 }
680
681 /*
682  * Update the global var NUM_PORTS and array PORTS according to the number of
683  * system ports, and return the number of valid ports.
684  */
685 static unsigned check_ports_num(unsigned nb_ports)
686 {
687         unsigned valid_num_ports = num_ports;
688         unsigned portid;
689
690         if (num_ports > nb_ports) {
691                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
692                         num_ports, nb_ports);
693                 num_ports = nb_ports;
694         }
695
696         for (portid = 0; portid < num_ports; portid ++) {
697                 if (ports[portid] >= nb_ports) {
698                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
699                                 ports[portid], (nb_ports - 1));
700                         ports[portid] = INVALID_PORT_ID;
701                         valid_num_ports--;
702                 }
703         }
704         return valid_num_ports;
705 }
706
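/*
 * Look up a vhost device by MAC address, using the is_same_ether_addr()
 * helper for the 6-byte comparison. Only devices that have completed MAC
 * learning (ready == DEVICE_RX) are considered.
 */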
707 static inline struct vhost_dev *__attribute__((always_inline))
708 find_vhost_dev(struct ether_addr *mac)
709 {
710         struct vhost_dev *vdev;
711
712         TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
713                 if (vdev->ready == DEVICE_RX &&
714                     is_same_ether_addr(mac, &vdev->mac_address))
715                         return vdev;
716         }
717
718         return NULL;
719 }
720
721 /*
722  * This function learns the MAC address of the device and registers it,
723  * along with a VLAN tag, with a VMDQ pool.
724  */
725 static int
726 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
727 {
728         struct ether_hdr *pkt_hdr;
729         struct virtio_net *dev = vdev->dev;
730         int i, ret;
731
732         /* Learn MAC address of guest device from packet */
733         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
734
735         if (find_vhost_dev(&pkt_hdr->s_addr)) {
736                 RTE_LOG(ERR, VHOST_DATA,
737                         "Device (%" PRIu64 ") is using a registered MAC!\n",
738                         dev->device_fh);
739                 return -1;
740         }
741
742         for (i = 0; i < ETHER_ADDR_LEN; i++)
743                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
744
745         /* vlan_tag currently uses the device_id. */
746         vdev->vlan_tag = vlan_tags[dev->device_fh];
747
748         /* Print out VMDQ registration info. */
749         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
750                 dev->device_fh,
751                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
752                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
753                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
754                 vdev->vlan_tag);
755
756         /* Register the MAC address. */
757         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
758                                 (uint32_t)dev->device_fh + vmdq_pool_base);
759         if (ret)
760                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
761                                         dev->device_fh);
762
763         /* Enable stripping of the vlan tag as we handle routing. */
764         if (vlan_strip)
765                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
766                         (uint16_t)vdev->vmdq_rx_q, 1);
767
768         /* Set device as ready for RX. */
769         vdev->ready = DEVICE_RX;
770
771         return 0;
772 }
773
774 /*
775  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
776  * queue before disabling RX on the device.
777  */
778 static inline void
779 unlink_vmdq(struct vhost_dev *vdev)
780 {
781         unsigned i = 0;
782         unsigned rx_count;
783         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
784
785         if (vdev->ready == DEVICE_RX) {
786                 /*clear MAC and VLAN settings*/
787                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
788                 for (i = 0; i < ETHER_ADDR_LEN; i++)
789                         vdev->mac_address.addr_bytes[i] = 0;
790
791                 vdev->vlan_tag = 0;
792
793                 /*Clear out the receive buffers*/
794                 rx_count = rte_eth_rx_burst(ports[0],
795                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
796
797                 while (rx_count) {
798                         for (i = 0; i < rx_count; i++)
799                                 rte_pktmbuf_free(pkts_burst[i]);
800
801                         rx_count = rte_eth_rx_burst(ports[0],
802                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
803                 }
804
805                 vdev->ready = DEVICE_MAC_LEARNING;
806         }
807 }
808
809 /*
810  * Check if the packet destination MAC address is for a local device. If so,
811  * put the packet on that device's RX queue. If not, return.
812  */
813 static inline int __attribute__((always_inline))
814 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
815 {
816         struct ether_hdr *pkt_hdr;
817         uint64_t ret = 0;
818         struct vhost_dev *dst_vdev;
819         uint64_t fh;
820
821         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
822
823         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
824         if (!dst_vdev)
825                 return -1;
826
827         fh = dst_vdev->dev->device_fh;
828         if (fh == vdev->dev->device_fh) {
829                 RTE_LOG(DEBUG, VHOST_DATA,
830                         "(%" PRIu64 ") TX: src and dst MACs are the same. "
831                         "Dropping packet.\n", fh);
832                 return 0;
833         }
834
835         RTE_LOG(DEBUG, VHOST_DATA,
836                 "(%" PRIu64 ") TX: MAC address is local\n", fh);
837
838         if (unlikely(dst_vdev->remove)) {
839                 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
840                         "Device is marked for removal\n", fh);
841                 return 0;
842         }
843
844         /* send the packet to the local virtio device */
845         ret = rte_vhost_enqueue_burst(dst_vdev->dev, VIRTIO_RXQ, &m, 1);
846         if (enable_stats) {
847                 rte_atomic64_inc(&dev_statistics[fh].rx_total_atomic);
848                 rte_atomic64_add(&dev_statistics[fh].rx_atomic, ret);
849                 dev_statistics[vdev->dev->device_fh].tx_total++;
850                 dev_statistics[vdev->dev->device_fh].tx += ret;
851         }
852
853         return 0;
854 }
855
856 /*
857  * Check if the destination MAC of a packet is one local VM,
858  * and get its vlan tag, and offset if it is.
859  */
860 static inline int __attribute__((always_inline))
861 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
862         uint32_t *offset, uint16_t *vlan_tag)
863 {
864         struct vhost_dev *dst_vdev;
865         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
866
867         dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
868         if (!dst_vdev)
869                 return 0;
870
871         if (dst_vdev->dev->device_fh == dev->device_fh) {
872                 RTE_LOG(DEBUG, VHOST_DATA,
873                         "(%" PRIu64 ") TX: src and dst MACs are the same. "
874                         "Dropping packet.\n", dst_vdev->dev->device_fh);
875                 return -1;
876         }
877
878         /*
879          * HW VLAN strip shortens the packet by the length of the
880          * VLAN tag, so we need to restore the packet length by
881          * adding it back.
882          */
883         *offset  = VLAN_HLEN;
884         *vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];
885
886         RTE_LOG(DEBUG, VHOST_DATA,
887                 "(%" PRIu64 ") TX: pkt to local VM device id: (%" PRIu64 ") "
888                 "vlan tag: %u.\n",
889                 dev->device_fh, dst_vdev->dev->device_fh, *vlan_tag);
890
891         return 0;
892 }
893
894 static uint16_t
895 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
896 {
897         if (ol_flags & PKT_TX_IPV4)
898                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
899         else /* assume ethertype == ETHER_TYPE_IPv6 */
900                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
901 }
902
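/*
 * Prepare a TSO packet for the NIC: zero the IPv4 header checksum (the
 * hardware recomputes it) and seed the TCP checksum field with the
 * pseudo-header checksum, which is what the checksum/TSO offload expects.
 */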
903 static void virtio_tx_offload(struct rte_mbuf *m)
904 {
905         void *l3_hdr;
906         struct ipv4_hdr *ipv4_hdr = NULL;
907         struct tcp_hdr *tcp_hdr = NULL;
908         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
909
910         l3_hdr = (char *)eth_hdr + m->l2_len;
911
912         if (m->ol_flags & PKT_TX_IPV4) {
913                 ipv4_hdr = l3_hdr;
914                 ipv4_hdr->hdr_checksum = 0;
915                 m->ol_flags |= PKT_TX_IP_CKSUM;
916         }
917
918         tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
919         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
920 }
921
922 /*
923  * This function routes the TX packet to the correct interface. This may be a local device
924  * or the physical port.
925  */
926 static inline void __attribute__((always_inline))
927 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
928 {
929         struct mbuf_table *tx_q;
930         struct rte_mbuf **m_table;
931         unsigned len, ret, offset = 0;
932         const uint16_t lcore_id = rte_lcore_id();
933         struct virtio_net *dev = vdev->dev;
934         struct ether_hdr *nh;
935
936         /*check if destination is local VM*/
937         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
938                 rte_pktmbuf_free(m);
939                 return;
940         }
941
942         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
943                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
944                         rte_pktmbuf_free(m);
945                         return;
946                 }
947         }
948
949         RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
950                 "MAC address is external\n", dev->device_fh);
951
952         /*Add packet to the port tx queue*/
953         tx_q = &lcore_tx_queue[lcore_id];
954         len = tx_q->len;
955
956         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
957         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
958                 /* Guest has inserted the vlan tag. */
959                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
960                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
961                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
962                         (vh->vlan_tci != vlan_tag_be))
963                         vh->vlan_tci = vlan_tag_be;
964         } else {
965                 m->ol_flags |= PKT_TX_VLAN_PKT;
966
967                 /*
968                  * Find the right segment in which to adjust the data length,
969                  * when the offset exceeds the tailroom of the first mbuf.
970                  */
971                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
972                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
973                                 m->data_len += offset;
974                         else {
975                                 struct rte_mbuf *seg = m;
976
977                                 while ((seg->next != NULL) &&
978                                         (offset > rte_pktmbuf_tailroom(seg)))
979                                         seg = seg->next;
980
981                                 seg->data_len += offset;
982                         }
983                         m->pkt_len += offset;
984                 }
985
986                 m->vlan_tci = vlan_tag;
987         }
988
989         if (m->ol_flags & PKT_TX_TCP_SEG)
990                 virtio_tx_offload(m);
991
992         tx_q->m_table[len] = m;
993         len++;
994         if (enable_stats) {
995                 dev_statistics[dev->device_fh].tx_total++;
996                 dev_statistics[dev->device_fh].tx++;
997         }
998
999         if (unlikely(len == MAX_PKT_BURST)) {
1000                 m_table = (struct rte_mbuf **)tx_q->m_table;
1001                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1002                 /* Free any buffers not handled by TX and update the port stats. */
1003                 if (unlikely(ret < len)) {
1004                         do {
1005                                 rte_pktmbuf_free(m_table[ret]);
1006                         } while (++ret < len);
1007                 }
1008
1009                 len = 0;
1010         }
1011
1012         tx_q->len = len;
1013         return;
1014 }
1015 /*
1016  * This function is called by each data core. It handles all RX/TX registered with the
1017  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1018  * with all devices in the main linked list.
1019  */
1020 static int
1021 switch_worker(__attribute__((unused)) void *arg)
1022 {
1023         struct virtio_net *dev = NULL;
1024         struct vhost_dev *vdev = NULL;
1025         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1026         struct mbuf_table *tx_q;
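        /*
         * Convert the drain interval from microseconds to TSC cycles,
         * rounding up: e.g. with a 2 GHz TSC, 100 us is about 200000 cycles.
         */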
1027         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1028         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1029         unsigned ret, i;
1030         const uint16_t lcore_id = rte_lcore_id();
1031         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1032         uint16_t rx_count = 0;
1033         uint16_t tx_count;
1034         uint32_t retry = 0;
1035
1036         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1037         prev_tsc = 0;
1038
1039         tx_q = &lcore_tx_queue[lcore_id];
1040         for (i = 0; i < num_cores; i ++) {
1041                 if (lcore_ids[i] == lcore_id) {
1042                         tx_q->txq_id = i;
1043                         break;
1044                 }
1045         }
1046
1047         while(1) {
1048                 cur_tsc = rte_rdtsc();
1049                 /*
1050                  * TX burst queue drain
1051                  */
1052                 diff_tsc = cur_tsc - prev_tsc;
1053                 if (unlikely(diff_tsc > drain_tsc)) {
1054
1055                         if (tx_q->len) {
1056                                 RTE_LOG(DEBUG, VHOST_DATA,
1057                                         "TX queue drained after timeout with burst size %u\n",
1058                                         tx_q->len);
1059
1060                                 /*Tx any packets in the queue*/
1061                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1062                                                                            (struct rte_mbuf **)tx_q->m_table,
1063                                                                            (uint16_t)tx_q->len);
1064                                 if (unlikely(ret < tx_q->len)) {
1065                                         do {
1066                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1067                                         } while (++ret < tx_q->len);
1068                                 }
1069
1070                                 tx_q->len = 0;
1071                         }
1072
1073                         prev_tsc = cur_tsc;
1074
1075                 }
1076
1077                 /*
1078                  * Inform the configuration core that we have exited the
1079                  * linked list and that no devices are in use if requested.
1080                  */
1081                 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1082                         lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1083
1084                 /*
1085                  * Process devices
1086                  */
1087                 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, next) {
1088                         uint64_t fh;
1089
1090                         dev = vdev->dev;
1091                         fh  = dev->device_fh;
1092
1093                         if (unlikely(vdev->remove)) {
1094                                 unlink_vmdq(vdev);
1095                                 vdev->ready = DEVICE_SAFE_REMOVE;
1096                                 continue;
1097                         }
1098
1099                         if (likely(vdev->ready == DEVICE_RX)) {
1100                                 /*Handle guest RX*/
1101                                 rx_count = rte_eth_rx_burst(ports[0],
1102                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1103
1104                                 if (rx_count) {
1105                                         /*
1106                                         * If retry is enabled and the queue is full, wait and retry
1107                                         * to avoid packet loss. Here MAX_PKT_BURST must be less than the virtio queue size.
1108                                         */
1109                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1110                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1111                                                         rte_delay_us(burst_rx_delay_time);
1112                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1113                                                                 break;
1114                                                 }
1115                                         }
1116                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1117                                         if (enable_stats) {
1118                                                 rte_atomic64_add(
1119                                                         &dev_statistics[fh].rx_total_atomic,
1120                                                         rx_count);
1121                                                 rte_atomic64_add(
1122                                                         &dev_statistics[fh].rx_atomic,
1123                                                         ret_count);
1124                                         }
1125                                         while (likely(rx_count)) {
1126                                                 rx_count--;
1127                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1128                                         }
1129
1130                                 }
1131                         }
1132
1133                         if (likely(!vdev->remove)) {
1134                                 /* Handle guest TX*/
1135                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1136                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1137                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1138                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1139                                                 while (tx_count)
1140                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1141                                         }
1142                                 }
1143                                 for (i = 0; i < tx_count; ++i) {
1144                                         virtio_tx_route(vdev, pkts_burst[i],
1145                                                 vlan_tags[(uint16_t)dev->device_fh]);
1146                                 }
1147                         }
1148                 }
1149         }
1150
1151         return 0;
1152 }
1153
1154 /*
1155  * Remove a device from the specific data core linked list and from the main
1156  * linked list. Synchronization occurs through the use of the lcore
1157  * dev_removal_flag. The device is made volatile here to avoid re-ordering
1158  * of dev->remove=1, which could cause an infinite loop in the rte_pause loop.
1159  */
1160 static void
1161 destroy_device (volatile struct virtio_net *dev)
1162 {
1163         struct vhost_dev *vdev;
1164         int lcore;
1165
1166         dev->flags &= ~VIRTIO_DEV_RUNNING;
1167
1168         vdev = (struct vhost_dev *)dev->priv;
1169         /*set the remove flag. */
1170         vdev->remove = 1;
1171         while(vdev->ready != DEVICE_SAFE_REMOVE) {
1172                 rte_pause();
1173         }
1174
1175         TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, next);
1176         TAILQ_REMOVE(&vhost_dev_list, vdev, next);
1177
1178         /* Set the dev_removal_flag on each lcore. */
1179         RTE_LCORE_FOREACH_SLAVE(lcore)
1180                 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1181
1182         /*
1183          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1184          * we can be sure that they can no longer access the device removed
1185          * from the linked lists and that the devices are no longer in use.
1186          */
1187         RTE_LCORE_FOREACH_SLAVE(lcore) {
1188                 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1189                         rte_pause();
1190         }
1191
1192         lcore_info[vdev->coreid].device_num--;
1193
1194         RTE_LOG(INFO, VHOST_DATA,
1195                 "(%" PRIu64 ") Device has been removed from data core\n",
1196                 dev->device_fh);
1197
1198         rte_free(vdev);
1199 }
1200
1201 /*
1202  * A new device is added to a data core. First the device is added to the
1203  * main linked list, and then it is allocated to a specific data core.
1204  */
1205 static int
1206 new_device (struct virtio_net *dev)
1207 {
1208         int lcore, core_add = 0;
1209         uint32_t device_num_min = num_devices;
1210         struct vhost_dev *vdev;
1211
1212         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1213         if (vdev == NULL) {
1214                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
1215                         dev->device_fh);
1216                 return -1;
1217         }
1218         vdev->dev = dev;
1219         dev->priv = vdev;
1220
1221         TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, next);
1222         vdev->vmdq_rx_q
1223                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
1224
1225         /*reset ready flag*/
1226         vdev->ready = DEVICE_MAC_LEARNING;
1227         vdev->remove = 0;
1228
1229         /* Find a suitable lcore to add the device. */
1230         RTE_LCORE_FOREACH_SLAVE(lcore) {
1231                 if (lcore_info[lcore].device_num < device_num_min) {
1232                         device_num_min = lcore_info[lcore].device_num;
1233                         core_add = lcore;
1234                 }
1235         }
1236         vdev->coreid = core_add;
1237
1238         TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, next);
1239         lcore_info[vdev->coreid].device_num++;
1240
1241         /* Initialize device stats */
1242         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
1243
1244         /* Disable notifications. */
1245         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
1246         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
1247         dev->flags |= VIRTIO_DEV_RUNNING;
1248
1249         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
1250
1251         return 0;
1252 }
1253
1254 /*
1255  * These callbacks allow devices to be added to the data core when
1256  * configuration has fully completed.
1257  */
1258 static const struct virtio_net_device_ops virtio_net_device_ops =
1259 {
1260         .new_device =  new_device,
1261         .destroy_device = destroy_device,
1262 };
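/*
 * These callbacks are invoked from the vhost session started by
 * rte_vhost_driver_session_start() in main(), not from the data cores.
 */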
1263
1264 /*
1265  * This thread wakes up after a period to print stats, if the user has
1266  * enabled them.
1267  */
1268 static void
1269 print_stats(void)
1270 {
1271         struct vhost_dev *vdev;
1272         uint64_t tx_dropped, rx_dropped;
1273         uint64_t tx, tx_total, rx, rx_total;
1274         uint32_t device_fh;
1275         const char clr[] = { 27, '[', '2', 'J', '\0' };
1276         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1277
1278         while(1) {
1279                 sleep(enable_stats);
1280
1281                 /* Clear screen and move to top left */
1282                 printf("%s%s", clr, top_left);
1283
1284                 printf("\nDevice statistics ====================================");
1285
1286                 TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
1287                         device_fh = vdev->dev->device_fh;
1288                         tx_total = dev_statistics[device_fh].tx_total;
1289                         tx = dev_statistics[device_fh].tx;
1290                         tx_dropped = tx_total - tx;
1291                         rx_total = rte_atomic64_read(
1292                                 &dev_statistics[device_fh].rx_total_atomic);
1293                         rx = rte_atomic64_read(
1294                                 &dev_statistics[device_fh].rx_atomic);
1295                         rx_dropped = rx_total - rx;
1296
1297                         printf("\nStatistics for device %"PRIu32" ------------------------------"
1298                                         "\nTX total:            %"PRIu64""
1299                                         "\nTX dropped:          %"PRIu64""
1300                                         "\nTX successful:               %"PRIu64""
1301                                         "\nRX total:            %"PRIu64""
1302                                         "\nRX dropped:          %"PRIu64""
1303                                         "\nRX successful:               %"PRIu64"",
1304                                         device_fh,
1305                                         tx_total,
1306                                         tx_dropped,
1307                                         tx,
1308                                         rx_total,
1309                                         rx_dropped,
1310                                         rx);
1311                 }
1312                 printf("\n======================================================\n");
1313         }
1314 }
1315
1316 /* When we receive an INT signal, unregister the vhost driver */
1317 static void
1318 sigint_handler(__rte_unused int signum)
1319 {
1320         /* Unregister vhost driver. */
1321         int ret = rte_vhost_driver_unregister((char *)&dev_basename);
1322         if (ret != 0)
1323                 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
1324         exit(0);
1325 }
1326
1327 /*
1328  * Main function, does initialisation and calls the per-lcore functions. The CUSE
1329  * device is also registered here to handle the IOCTLs.
1330  */
1331 int
1332 main(int argc, char *argv[])
1333 {
1334         unsigned lcore_id, core_id = 0;
1335         unsigned nb_ports, valid_num_ports;
1336         int ret;
1337         uint8_t portid;
1338         static pthread_t tid;
1339         char thread_name[RTE_MAX_THREAD_NAME_LEN];
1340
1341         signal(SIGINT, sigint_handler);
1342
1343         /* init EAL */
1344         ret = rte_eal_init(argc, argv);
1345         if (ret < 0)
1346                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1347         argc -= ret;
1348         argv += ret;
1349
1350         /* parse app arguments */
1351         ret = us_vhost_parse_args(argc, argv);
1352         if (ret < 0)
1353                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1354
1355         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1356                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1357                 if (rte_lcore_is_enabled(lcore_id))
1358                         lcore_ids[core_id++] = lcore_id;
1359         }
1360
1361         if (rte_lcore_count() > RTE_MAX_LCORE)
1362                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1363
1364         /* Set the number of switching cores available (all lcores but the master) */
1365         num_switching_cores = rte_lcore_count()-1;
1366
1367         /* Get the number of physical ports. */
1368         nb_ports = rte_eth_dev_count();
1369         if (nb_ports > RTE_MAX_ETHPORTS)
1370                 nb_ports = RTE_MAX_ETHPORTS;
1371
1372         /*
1373          * Update the global var NUM_PORTS and global array PORTS, and get
1374          * the value of VALID_NUM_PORTS according to the number of system ports.
1375          */
1376         valid_num_ports = check_ports_num(nb_ports);
1377
1378         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1379                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1380                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1381                 return -1;
1382         }
1383
1384         /* Create the mbuf pool. */
1385         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
1386                 NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
1387                 0, MBUF_DATA_SIZE, rte_socket_id());
1388         if (mbuf_pool == NULL)
1389                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1390
1391         if (vm2vm_mode == VM2VM_HARDWARE) {
1392                 /* Enable VT loop back to let L2 switch to do it. */
1393                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1394                 RTE_LOG(DEBUG, VHOST_CONFIG,
1395                         "Enable loop back for L2 switch in vmdq.\n");
1396         }
1397
1398         /* initialize all ports */
1399         for (portid = 0; portid < nb_ports; portid++) {
1400                 /* skip ports that are not enabled */
1401                 if ((enabled_port_mask & (1 << portid)) == 0) {
1402                         RTE_LOG(INFO, VHOST_PORT,
1403                                 "Skipping disabled port %d\n", portid);
1404                         continue;
1405                 }
1406                 if (port_init(portid) != 0)
1407                         rte_exit(EXIT_FAILURE,
1408                                 "Cannot initialize network ports\n");
1409         }
1410
1411         /* Initialize device stats */
1412         memset(&dev_statistics, 0, sizeof(dev_statistics));
1413
1414         /* Enable stats if the user option is set. */
1415         if (enable_stats) {
1416                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1417                 if (ret != 0)
1418                         rte_exit(EXIT_FAILURE,
1419                                 "Cannot create print-stats thread\n");
1420
1421                 /* Set thread_name for aid in debugging.  */
1422                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1423                 ret = rte_thread_setname(tid, thread_name);
1424                 if (ret != 0)
1425                         RTE_LOG(ERR, VHOST_CONFIG,
1426                                 "Cannot set print-stats name\n");
1427         }
1428
1429         /* Launch all data cores. */
1430         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1431                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1432
1433         if (mergeable == 0)
1434                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
1435
1436         /* Register vhost(cuse or user) driver to handle vhost messages. */
1437         ret = rte_vhost_driver_register((char *)&dev_basename);
1438         if (ret != 0)
1439                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
1440
1441         rte_vhost_driver_callback_register(&virtio_net_device_ops);
1442
1443         /* Start CUSE session. */
1444         rte_vhost_driver_session_start();
1445         return 0;
1446
1447 }