examples/vhost: fix mbuf allocation failure
[dpdk.git] / examples / vhost / main.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_virtio_net.h>
#include <rte_ip.h>
#include <rte_tcp.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
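/*
 * Default mbuf data room: RTE_PKTMBUF_HEADROOM plus 2048 bytes
 * (RTE_MBUF_DEFAULT_BUF_SIZE).
 */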
#define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE

#define MAX_PKT_BURST 32        /* Max burst size for RX/TX */
#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by VMDQ. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Do VLAN strip on host, enabled by default */
static uint32_t vlan_strip = 1;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * This is needed for 1G NICs such as the I350;
                 * it fixes a bug where a guest doing IPv4 forwarding
                 * could not forward packets from one virtio device
                 * to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * should be overridden separately in code with
                 * appropriate values
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
        TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define VLAN_HLEN 4

/* Per-device statistics struct */
struct device_statistics {
        uint64_t tx_total;
        rte_atomic64_t rx_total_atomic;
        uint64_t tx;
        rte_atomic64_t rx_atomic;
} __rte_cache_aligned;
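/*
 * Cache-line alignment gives each device's stats its own cache line,
 * avoiding false sharing when different data cores update them.
 */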
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        struct rte_eth_vmdq_rx_conf *def_conf =
                &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back = def_conf->enable_loop_back;
        conf.rx_mode = def_conf->rx_mode;

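        /*
         * Map one VLAN tag per pool: packets tagged vlan_tags[i] are
         * steered to pool i, which backs virtio device i.
         */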
        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}

/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, give an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        struct rte_eth_rxconf *rxconf;
        struct rte_eth_txconf *txconf;
        int16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /*
         * The max pool number from dev_info will be used to validate the
         * pool number specified in the command line.
         */
        rte_eth_dev_info_get(port, &dev_info);

        if (dev_info.max_rx_queues > MAX_QUEUES) {
                rte_exit(EXIT_FAILURE,
                        "please define MAX_QUEUES no less than %u in %s\n",
                        dev_info.max_rx_queues, __FILE__);
        }

        rxconf = &dev_info.default_rxconf;
        txconf = &dev_info.default_txconf;
        rxconf->rx_drop_en = 1;

        /* Enable vlan offload */
        txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

        /* Configure the number of supported virtio devices based on VMDQ limits */
        num_devices = dev_info.max_vmdq_pools;

        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
        tx_rings = (uint16_t)rte_lcore_count();

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;
        /* NIC queues are divided into pf queues and vmdq queues. */
        num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
        queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
        num_vmdq_queues = num_devices * queues_per_pool;
        num_queues = num_pf_queues + num_vmdq_queues;
        vmdq_queue_base = dev_info.vmdq_queue_base;
        vmdq_pool_base  = dev_info.vmdq_pool_base;
        printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
                num_pf_queues, num_devices, queues_per_pool);

        if (port >= rte_eth_dev_count())
                return -1;

        if (enable_tx_csum == 0)
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);

        if (enable_tso == 0) {
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
                rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
        }

        rx_rings = (uint16_t)dev_info.max_rx_queues;
        /* Configure ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0)
                return retval;

        /* Setup the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                rxconf,
                                                mbuf_pool);
                if (retval < 0)
                        return retval;
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                                rte_eth_dev_socket_id(port),
                                                txconf);
                if (retval < 0)
                        return retval;
        }

        /* Start the device. */
        retval = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
                return retval;
        }

        if (promiscuous)
                rte_eth_promiscuous_enable(port);

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        (unsigned)port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
        /* Reject names that do not fit in the buffer (with NUL terminator). */
        if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
                return -1;

        snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

        return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (pm == 0)
                return -1;

        return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --dev-basename <name>\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective when RX retries are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on RX. Only effective when RX retries are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --dev-basename: The basename to be used for the character device.\n"
        "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
        "               --tso [0|1] disable/enable TCP segment offload.\n",
               prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"vlan-strip", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"dev-basename", required_argument, NULL, 0},
                {"tx-csum", required_argument, NULL, 0},
                {"tso", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:P",
                        long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        ret = parse_portmask(optarg);
                        if (ret == -1) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        enabled_port_mask = ret;
                        break;

                case 'P':
                        promiscuous = 1;
                        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
                                ETH_VMDQ_ACCEPT_BROADCAST |
                                ETH_VMDQ_ACCEPT_MULTICAST;
                        rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);

                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Enable/disable TX checksum offload. */
                        if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tx_csum = ret;
                        }

                        /* Enable/disable TSO offload. */
                        if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        enable_tso = ret;
                        }

                        /* Specify the retry delay time (in microseconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        mergeable = !!ret;
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                        }
                                }
                        }

                        /* Enable/disable RX VLAN strip on host. */
                        if (!strncmp(long_option[option_index].name,
                                "vlan-strip", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for VLAN strip [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vlan_strip = !!ret;
                                        vmdq_conf_default.rxmode.hw_vlan_strip =
                                                vlan_strip;
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set character device basename. */
                        if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_basename(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        break;

                /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = (uint8_t)i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        return 0;
}

/*
 * Update the global vars num_ports and ports[] according to the number of
 * system ports and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

static inline struct vhost_dev *__attribute__((always_inline))
find_vhost_dev(struct ether_addr *mac)
{
        struct vhost_dev *vdev;

        TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
                if (vdev->ready == DEVICE_RX &&
                    is_same_ether_addr(mac, &vdev->mac_address))
                        return vdev;
        }

        return NULL;
}

/*
 * This function learns the MAC address of the device and registers it,
 * along with a VLAN tag, with a VMDQ pool.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct virtio_net *dev = vdev->dev;
        int i, ret;

        /* Learn MAC address of guest device from packet */
        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        if (find_vhost_dev(&pkt_hdr->s_addr)) {
                RTE_LOG(ERR, VHOST_DATA,
                        "Device (%" PRIu64 ") is using a registered MAC!\n",
                        dev->device_fh);
                return -1;
        }

        for (i = 0; i < ETHER_ADDR_LEN; i++)
                vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

        /* vlan_tag currently uses the device_id. */
        vdev->vlan_tag = vlan_tags[dev->device_fh];

        /* Print out VMDQ registration info. */
        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
                dev->device_fh,
                vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
                vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
                vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
                vdev->vlan_tag);

        /* Register the MAC address. */
        ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
                                (uint32_t)dev->device_fh + vmdq_pool_base);
        if (ret)
                RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
                                        dev->device_fh);

        /* Enable stripping of the vlan tag as we handle routing. */
        if (vlan_strip)
                rte_eth_dev_set_vlan_strip_on_queue(ports[0],
                        (uint16_t)vdev->vmdq_rx_q, 1);

        /* Set device as ready for RX. */
        vdev->ready = DEVICE_RX;

        return 0;
}

/*
 * Removes MAC address and VLAN tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
        unsigned i = 0;
        unsigned rx_count;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

        if (vdev->ready == DEVICE_RX) {
                /* Clear MAC and VLAN settings */
                rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
                for (i = 0; i < 6; i++)
                        vdev->mac_address.addr_bytes[i] = 0;

                vdev->vlan_tag = 0;

                /* Clear out the receive buffers */
                rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                while (rx_count) {
                        for (i = 0; i < rx_count; i++)
                                rte_pktmbuf_free(pkts_burst[i]);

                        rx_count = rte_eth_rx_burst(ports[0],
                                        (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
                }

                vdev->ready = DEVICE_MAC_LEARNING;
        }
}

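/*
 * Enqueue a single mbuf on the destination device's RX virtqueue and,
 * if stats are enabled, account it against both source and destination.
 */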
static inline void __attribute__((always_inline))
virtio_xmit(struct virtio_net *dst_dev, struct virtio_net *src_dev,
            struct rte_mbuf *m)
{
        uint16_t ret;

        ret = rte_vhost_enqueue_burst(dst_dev, VIRTIO_RXQ, &m, 1);
        if (enable_stats) {
                rte_atomic64_inc(&dev_statistics[dst_dev->device_fh].rx_total_atomic);
                rte_atomic64_add(&dev_statistics[dst_dev->device_fh].rx_atomic, ret);
                dev_statistics[src_dev->device_fh].tx_total++;
                dev_statistics[src_dev->device_fh].tx += ret;
        }
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static inline int __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
        struct ether_hdr *pkt_hdr;
        struct vhost_dev *dst_vdev;
        uint64_t fh;

        pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
        if (!dst_vdev)
                return -1;

        fh = dst_vdev->dev->device_fh;
        if (fh == vdev->dev->device_fh) {
                RTE_LOG(DEBUG, VHOST_DATA,
                        "(%" PRIu64 ") TX: src and dst MAC is same. "
                        "Dropping packet.\n", fh);
                return 0;
        }

        RTE_LOG(DEBUG, VHOST_DATA,
                "(%" PRIu64 ") TX: MAC address is local\n", fh);

        if (unlikely(dst_vdev->remove)) {
                RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
                        "Device is marked for removal\n", fh);
                return 0;
        }

        virtio_xmit(dst_vdev->dev, vdev->dev, m);
        return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * get its VLAN tag and length offset.
 */
static inline int __attribute__((always_inline))
find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
        uint32_t *offset, uint16_t *vlan_tag)
{
        struct vhost_dev *dst_vdev;
        struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
        if (!dst_vdev)
                return 0;

        if (dst_vdev->dev->device_fh == dev->device_fh) {
                RTE_LOG(DEBUG, VHOST_DATA,
                        "(%" PRIu64 ") TX: src and dst MAC is same. "
                        " Dropping packet.\n", dst_vdev->dev->device_fh);
                return -1;
        }

        /*
         * HW VLAN strip will reduce the packet length by the length of the
         * VLAN tag, so we need to restore the packet length by adding it back.
         */
        *offset  = VLAN_HLEN;
        *vlan_tag = vlan_tags[(uint16_t)dst_vdev->dev->device_fh];

        RTE_LOG(DEBUG, VHOST_DATA,
                "(%" PRIu64 ") TX: pkt to local VM device id: (%" PRIu64 ") "
                "vlan tag: %u.\n",
                dev->device_fh, dst_vdev->dev->device_fh, *vlan_tag);

        return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
        if (ol_flags & PKT_TX_IPV4)
                return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
        else /* assume ethertype == ETHER_TYPE_IPv6 */
                return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
        void *l3_hdr;
        struct ipv4_hdr *ipv4_hdr = NULL;
        struct tcp_hdr *tcp_hdr = NULL;
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

        l3_hdr = (char *)eth_hdr + m->l2_len;

        if (m->ol_flags & PKT_TX_IPV4) {
                ipv4_hdr = l3_hdr;
                ipv4_hdr->hdr_checksum = 0;
                m->ol_flags |= PKT_TX_IP_CKSUM;
        }

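        /*
         * For TSO/TX checksum offload, the TCP checksum field must be
         * seeded with the pseudo-header checksum; the NIC computes the
         * rest over the payload.
         */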
        tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
        tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

/*
 * This function routes the TX packet to the correct interface. This may be
 * a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
        struct mbuf_table *tx_q;
        struct rte_mbuf **m_table;
        unsigned len, ret, offset = 0;
        const uint16_t lcore_id = rte_lcore_id();
        struct virtio_net *dev = vdev->dev;
        struct ether_hdr *nh;

        nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
        if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
                struct vhost_dev *vdev2;

                TAILQ_FOREACH(vdev2, &vhost_dev_list, next) {
                        virtio_xmit(vdev2->dev, vdev->dev, m);
                }
                goto queue2nic;
        }

        /* Check if destination is a local VM */
        if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
                rte_pktmbuf_free(m);
                return;
        }

        if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
                        rte_pktmbuf_free(m);
                        return;
                }
        }

        RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
                "MAC address is external\n", dev->device_fh);

queue2nic:

        /* Add packet to the port TX queue */
        tx_q = &lcore_tx_queue[lcore_id];
        len = tx_q->len;

        nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
        if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
                /* Guest has inserted the vlan tag. */
                struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
                uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
                if ((vm2vm_mode == VM2VM_HARDWARE) &&
                        (vh->vlan_tci != vlan_tag_be))
                        vh->vlan_tci = vlan_tag_be;
        } else {
                m->ol_flags |= PKT_TX_VLAN_PKT;

                /*
                 * Find the right seg to adjust the data len when offset is
                 * bigger than tail room size.
                 */
                if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
                        if (likely(offset <= rte_pktmbuf_tailroom(m)))
                                m->data_len += offset;
                        else {
                                struct rte_mbuf *seg = m;

                                while ((seg->next != NULL) &&
                                        (offset > rte_pktmbuf_tailroom(seg)))
                                        seg = seg->next;

                                seg->data_len += offset;
                        }
                        m->pkt_len += offset;
                }

                m->vlan_tci = vlan_tag;
        }

        if (m->ol_flags & PKT_TX_TCP_SEG)
                virtio_tx_offload(m);

        tx_q->m_table[len] = m;
        len++;
        if (enable_stats) {
                dev_statistics[dev->device_fh].tx_total++;
                dev_statistics[dev->device_fh].tx++;
        }

        if (unlikely(len == MAX_PKT_BURST)) {
                m_table = (struct rte_mbuf **)tx_q->m_table;
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
                /* Free any buffers not handled by TX and update the port stats. */
                if (unlikely(ret < len)) {
                        do {
                                rte_pktmbuf_free(m_table[ret]);
                        } while (++ret < len);
                }

                len = 0;
        }

        tx_q->len = len;
}
/*
 * This function is called by each data core. It handles all RX/TX registered
 * with the core. For TX the specific lcore linked list is used. For RX, MAC
 * addresses are compared with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
        struct virtio_net *dev = NULL;
        struct vhost_dev *vdev = NULL;
        struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
        struct mbuf_table *tx_q;
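        /* Convert the ~100us TX drain period into TSC cycles, rounding the
         * per-microsecond cycle count up. */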
        const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
        uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
        unsigned ret, i;
        const uint16_t lcore_id = rte_lcore_id();
        const uint16_t num_cores = (uint16_t)rte_lcore_count();
        uint16_t rx_count = 0;
        uint16_t tx_count;
        uint32_t retry = 0;

        RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
        prev_tsc = 0;

        tx_q = &lcore_tx_queue[lcore_id];
        for (i = 0; i < num_cores; i++) {
                if (lcore_ids[i] == lcore_id) {
                        tx_q->txq_id = i;
                        break;
                }
        }

        while (1) {
                cur_tsc = rte_rdtsc();
                /*
                 * TX burst queue drain
                 */
                diff_tsc = cur_tsc - prev_tsc;
                if (unlikely(diff_tsc > drain_tsc)) {

                        if (tx_q->len) {
                                RTE_LOG(DEBUG, VHOST_DATA,
                                        "TX queue drained after timeout with burst size %u\n",
                                        tx_q->len);

                                /* TX any packets in the queue */
                                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                                                       (struct rte_mbuf **)tx_q->m_table,
                                                       (uint16_t)tx_q->len);
                                if (unlikely(ret < tx_q->len)) {
                                        do {
                                                rte_pktmbuf_free(tx_q->m_table[ret]);
                                        } while (++ret < tx_q->len);
                                }

                                tx_q->len = 0;
                        }

                        prev_tsc = cur_tsc;

                }

                /*
                 * Inform the configuration core that we have exited the
                 * linked list and that no devices are in use if requested.
                 */
                if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
                        lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

                /*
                 * Process devices
                 */
                TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list, next) {
                        uint64_t fh;

                        dev = vdev->dev;
                        fh  = dev->device_fh;

                        if (unlikely(vdev->remove)) {
                                unlink_vmdq(vdev);
                                vdev->ready = DEVICE_SAFE_REMOVE;
                                continue;
                        }

                        if (likely(vdev->ready == DEVICE_RX)) {
                                /* Handle guest RX */
                                rx_count = rte_eth_rx_burst(ports[0],
                                        vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                                if (rx_count) {
                                        /*
                                         * If retry is enabled and the queue is full then
                                         * we wait and retry to avoid packet loss. Note
                                         * that MAX_PKT_BURST must be less than the
                                         * virtio queue size here.
                                         */
                                        if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
                                                for (retry = 0; retry < burst_rx_retry_num; retry++) {
                                                        rte_delay_us(burst_rx_delay_time);
                                                        if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
                                                                break;
                                                }
                                        }
                                        ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
                                        if (enable_stats) {
                                                rte_atomic64_add(
                                                        &dev_statistics[fh].rx_total_atomic,
                                                        rx_count);
                                                rte_atomic64_add(
                                                        &dev_statistics[fh].rx_atomic,
                                                        ret_count);
                                        }
                                        while (likely(rx_count)) {
                                                rx_count--;
                                                rte_pktmbuf_free(pkts_burst[rx_count]);
                                        }

                                }
                        }

                        if (likely(!vdev->remove)) {
                                /* Handle guest TX */
                                tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
                                /* If this is the first received packet we need to learn the MAC and setup VMDQ */
                                if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
                                        if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
                                                while (tx_count)
                                                        rte_pktmbuf_free(pkts_burst[--tx_count]);
                                        }
                                }
                                for (i = 0; i < tx_count; ++i) {
                                        virtio_tx_route(vdev, pkts_burst[i],
                                                vlan_tags[(uint16_t)dev->device_fh]);
                                }
                        }
                }
        }

        return 0;
}

/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
        struct vhost_dev *vdev;
        int lcore;

        dev->flags &= ~VIRTIO_DEV_RUNNING;

        vdev = (struct vhost_dev *)dev->priv;
        /* Set the remove flag. */
        vdev->remove = 1;
        while (vdev->ready != DEVICE_SAFE_REMOVE) {
                rte_pause();
        }

        TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev, next);
        TAILQ_REMOVE(&vhost_dev_list, vdev, next);

        /* Set the dev_removal_flag on each lcore. */
        RTE_LCORE_FOREACH_SLAVE(lcore)
                lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

        /*
         * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
         * we can be sure that they can no longer access the device removed
         * from the linked lists and that the devices are no longer in use.
         */
        RTE_LCORE_FOREACH_SLAVE(lcore) {
                while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
                        rte_pause();
        }

        lcore_info[vdev->coreid].device_num--;

        RTE_LOG(INFO, VHOST_DATA,
                "(%" PRIu64 ") Device has been removed from data core\n",
                dev->device_fh);

        rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(struct virtio_net *dev)
{
        int lcore, core_add = 0;
        uint32_t device_num_min = num_devices;
        struct vhost_dev *vdev;

        vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
        if (vdev == NULL) {
                RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
                        dev->device_fh);
                return -1;
        }
        vdev->dev = dev;
        dev->priv = vdev;

        TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, next);
        vdev->vmdq_rx_q
                = dev->device_fh * queues_per_pool + vmdq_queue_base;

        /* Reset ready flag */
        vdev->ready = DEVICE_MAC_LEARNING;
        vdev->remove = 0;

        /* Find the least-loaded lcore to add the device to. */
        RTE_LCORE_FOREACH_SLAVE(lcore) {
                if (lcore_info[lcore].device_num < device_num_min) {
                        device_num_min = lcore_info[lcore].device_num;
                        core_add = lcore;
                }
        }
        vdev->coreid = core_add;

        TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev, next);
        lcore_info[vdev->coreid].device_num++;

        /* Initialize device stats */
        memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));

        /* Disable notifications. */
        rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
        rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
        dev->flags |= VIRTIO_DEV_RUNNING;

        RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);

        return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops =
{
        .new_device = new_device,
        .destroy_device = destroy_device,
};

/*
 * This thread wakes up periodically to print stats if the user has
 * enabled them.
 */
static void
print_stats(void)
{
        struct vhost_dev *vdev;
        uint64_t tx_dropped, rx_dropped;
        uint64_t tx, tx_total, rx, rx_total;
        uint32_t device_fh;
        const char clr[] = { 27, '[', '2', 'J', '\0' };
        const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

        while (1) {
                sleep(enable_stats);

                /* Clear screen and move to top left */
                printf("%s%s", clr, top_left);

                printf("\nDevice statistics ====================================");

                TAILQ_FOREACH(vdev, &vhost_dev_list, next) {
                        device_fh = vdev->dev->device_fh;
                        tx_total = dev_statistics[device_fh].tx_total;
                        tx = dev_statistics[device_fh].tx;
                        tx_dropped = tx_total - tx;
                        rx_total = rte_atomic64_read(
                                &dev_statistics[device_fh].rx_total_atomic);
                        rx = rte_atomic64_read(
                                &dev_statistics[device_fh].rx_atomic);
                        rx_dropped = rx_total - rx;

                        printf("\nStatistics for device %"PRIu32" ------------------------------"
                                        "\nTX total:            %"PRIu64""
                                        "\nTX dropped:          %"PRIu64""
                                        "\nTX successful:       %"PRIu64""
                                        "\nRX total:            %"PRIu64""
                                        "\nRX dropped:          %"PRIu64""
                                        "\nRX successful:       %"PRIu64"",
                                        device_fh,
                                        tx_total,
                                        tx_dropped,
                                        tx,
                                        rx_total,
                                        rx_dropped,
                                        rx);
                }
                printf("\n======================================================\n");
        }
}

/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
        /* Unregister vhost driver. */
        int ret = rte_vhost_driver_unregister((char *)&dev_basename);
        if (ret != 0)
                rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
        exit(0);
}
1334
1335 /*
1336  * While creating an mbuf pool, one key thing is to figure out how
1337  * many mbuf entries are enough for our use. FYI, here are some
1338  * guidelines:
1339  *
1340  * - Each rx queue reserves @nr_rx_desc mbufs at queue setup stage
1341  *
1342  * - For each switch core (a CPU core that does the packet switching),
1343  *   we also need to reserve some mbufs for receiving the packets from
1344  *   the virtio Tx queue. How many are enough depends on the usage;
1345  *   it is normally a simple calculation like the following:
1346  *
1347  *       MAX_PKT_BURST * max packet size / mbuf size
1348  *
1349  *   So we definitely need to allocate more mbufs when TSO is enabled.
1350  *
1351  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1352  *   mbufs for receiving the packets from the physical NIC device.
1353  *
1354  * - We also need to make sure that, for each switch core, we have
1355  *   allocated enough mbufs to fill up the mbuf cache.
1356  */
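/*
 * A worked instance of the guidelines above. This is an illustrative
 * sketch only, assuming this file's defaults: MBUF_DATA_SIZE is
 * RTE_MBUF_DEFAULT_BUF_SIZE (2048 bytes of data room plus the default
 * RTE_PKTMBUF_HEADROOM of 128, i.e. 2176 bytes), MAX_PKT_BURST is 32,
 * nr_rx_desc is 1024 and mergeable buffers are on (mtu = 9000):
 *
 *     burst reservation: (9000 + 2176) * 32 / (2176 - 128) = ~174 mbufs
 *     plus the NIC rx descriptors: 174 + 1024 = 1198 mbufs per core
 *
 * 1198 already exceeds MBUF_CACHE_SIZE (128), so the RTE_MAX() below
 * leaves it unchanged. With TSO on (mtu = 64K) the burst term alone
 * grows to (65536 + 2176) * 32 / 2048 = 1058 mbufs, which is why more
 * mbufs must be allocated when TSO is enabled.
 */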
1357 static void
1358 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1359         uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1360 {
1361         uint32_t nr_mbufs;
1362         uint32_t nr_mbufs_per_core;
1363         uint32_t mtu = 1500;
1364
1365         if (mergeable)
1366                 mtu = 9000;
1367         if (enable_tso)
1368                 mtu = 64 * 1024;
1369
1370         nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1371                         (mbuf_size - RTE_PKTMBUF_HEADROOM);
1372         nr_mbufs_per_core += nr_rx_desc;
1373         nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1374
1375         nr_mbufs  = nr_queues * nr_rx_desc;
1376         nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1377         nr_mbufs *= nr_port;
1378
1379         mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1380                                             nr_mbuf_cache, 0, mbuf_size,
1381                                             rte_socket_id());
1382         if (mbuf_pool == NULL)
1383                 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1384 }
1385
1386 /*
1387  * Main function: does initialisation and calls the per-lcore functions. The
1388  * CUSE device is also registered here to handle the IOCTLs.
1389  */
1390 int
1391 main(int argc, char *argv[])
1392 {
1393         unsigned lcore_id, core_id = 0;
1394         unsigned nb_ports, valid_num_ports;
1395         int ret;
1396         uint8_t portid;
1397         static pthread_t tid;
1398         char thread_name[RTE_MAX_THREAD_NAME_LEN];
1399
1400         signal(SIGINT, sigint_handler);
1401
1402         /* init EAL */
1403         ret = rte_eal_init(argc, argv);
1404         if (ret < 0)
1405                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
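        /*
         * rte_eal_init() returns the number of arguments it consumed;
         * skip past them so only the application's own options remain.
         */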
1406         argc -= ret;
1407         argv += ret;
1408
1409         /* parse app arguments */
1410         ret = us_vhost_parse_args(argc, argv);
1411         if (ret < 0)
1412                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1413
1414         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1415                 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1416                 if (rte_lcore_is_enabled(lcore_id))
1417                         lcore_ids[core_id++] = lcore_id;
1418         }
1419
1420         if (rte_lcore_count() > RTE_MAX_LCORE)
1421                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1422
1423         /* Get the number of physical ports. */
1424         nb_ports = rte_eth_dev_count();
1425         if (nb_ports > RTE_MAX_ETHPORTS)
1426                 nb_ports = RTE_MAX_ETHPORTS;
1427
1428         /*
1429          * Update the global variable num_ports and the global array ports,
1430          * and derive valid_num_ports from the number of ports in the system.
1431          */
1432         valid_num_ports = check_ports_num(nb_ports);
1433
1434         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1435                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1436                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1437                 return -1;
1438         }
1439
1440         /*
1441          * FIXME: here we allocate enough mbufs for @MAX_QUEUES queues, but
1442          * in truth we are never going to use that many queues here. We
1443          * should probably only allocate for the queues we are actually
1444          * going to use.
1445          */
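        /* The switch-core count passed below is rte_lcore_count() - 1:
         * the master lcore is excluded, since switch_worker() is launched
         * on slave lcores only (see RTE_LCORE_FOREACH_SLAVE below).
         */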
1446         create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1447                          MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1448
1449         if (vm2vm_mode == VM2VM_HARDWARE) {
1450                 /* Enable VT loopback so the NIC's L2 switch can forward VM2VM traffic. */
1451                 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1452                 RTE_LOG(DEBUG, VHOST_CONFIG,
1453                         "Enable loop back for L2 switch in vmdq.\n");
1454         }
1455
1456         /* initialize all ports */
1457         for (portid = 0; portid < nb_ports; portid++) {
1458                 /* skip ports that are not enabled */
1459                 if ((enabled_port_mask & (1 << portid)) == 0) {
1460                         RTE_LOG(INFO, VHOST_PORT,
1461                                 "Skipping disabled port %d\n", portid);
1462                         continue;
1463                 }
1464                 if (port_init(portid) != 0)
1465                         rte_exit(EXIT_FAILURE,
1466                                 "Cannot initialize network ports\n");
1467         }
1468
1469         /* Initialize device stats */
1470         memset(&dev_statistics, 0, sizeof(dev_statistics));
1471
1472         /* Enable stats if the user option is set. */
1473         if (enable_stats) {
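                /* print_stats() loops forever; the cast quiets the
                 * signature mismatch with pthread's void *(*)(void *).
                 */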
1474                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
1475                 if (ret != 0)
1476                         rte_exit(EXIT_FAILURE,
1477                                 "Cannot create print-stats thread\n");
1478
1479                 /* Set thread_name to aid in debugging. */
1480                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1481                 ret = rte_thread_setname(tid, thread_name);
1482                 if (ret != 0)
1483                         RTE_LOG(ERR, VHOST_CONFIG,
1484                                 "Cannot set print-stats name\n");
1485         }
1486
1487         /* Launch all data cores. */
1488         RTE_LCORE_FOREACH_SLAVE(lcore_id)
1489                 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1490
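        /* If the user disabled mergeable RX buffers, mask out the
         * corresponding virtio feature bit so guests do not negotiate it.
         */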
1491         if (mergeable == 0)
1492                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
1493
1494         /* Register the vhost (CUSE or user) driver to handle vhost messages. */
1495         ret = rte_vhost_driver_register((char *)&dev_basename);
1496         if (ret != 0)
1497                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
1498
1499         rte_vhost_driver_callback_register(&virtio_net_device_ops);
1500
1501         /* Start the vhost driver session loop (CUSE or vhost-user). */
1502         rte_vhost_driver_session_start();
1503         return 0;
1504 }