examples/vhost: remove IPv4 header definition
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54
55 #include "main.h"
56
57 #ifndef MAX_QUEUES
58 #define MAX_QUEUES 128
59 #endif
60
61 /* the maximum number of external ports supported */
62 #define MAX_SUP_PORTS 1
63
64 /*
65  * Calculate the number of buffers needed per port
66  */
67 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +       \
68                             (num_switching_cores * MAX_PKT_BURST) +         \
69                             (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
70                             (num_switching_cores * MBUF_CACHE_SIZE))
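/*
 * Worked example (illustrative, assuming num_switching_cores == 1):
 * 128 * 1024 + 1 * 32 + 1 * 512 + 1 * 128 = 131,744 mbufs for the port.
 */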
71
72 #define MBUF_CACHE_SIZE 128
73 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
74
75 /*
76  * No frame data buffers allocated from the host are required for the zero
77  * copy implementation; the guest allocates the frame data buffer, and vhost
78  * uses it directly.
79  */
80 #define VIRTIO_DESCRIPTOR_LEN_ZCP       RTE_MBUF_DEFAULT_DATAROOM
81 #define MBUF_DATA_SIZE_ZCP              RTE_MBUF_DEFAULT_BUF_SIZE
82 #define MBUF_CACHE_SIZE_ZCP 0
83
84 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
85 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
86
87 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
88 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
89
90 #define JUMBO_FRAME_MAX_SIZE    0x2600
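/* 0x2600 = 9728 bytes: room for a 9000-byte jumbo payload plus headers. */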
91
92 /* State of virtio device. */
93 #define DEVICE_MAC_LEARNING 0
94 #define DEVICE_RX                       1
95 #define DEVICE_SAFE_REMOVE      2
96
97 /* Config_core_flag status definitions. */
98 #define REQUEST_DEV_REMOVAL 1
99 #define ACK_DEV_REMOVAL 0
100
101 /* Configurable number of RX/TX ring descriptors */
102 #define RTE_TEST_RX_DESC_DEFAULT 1024
103 #define RTE_TEST_TX_DESC_DEFAULT 512
104
105 /*
106  * These two macros need refining for the legacy and DPDK-based front ends:
107  * take the max vring avail descriptors/entries from the guest, subtract
108  * MAX_PKT_BURST, and then round to a power of 2.
109  */
110 /*
111  * For legacy front end, 128 descriptors,
112  * half for virtio header, another half for mbuf.
113  */
114 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
115 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
116
117 /* Get first 4 bytes in mbuf headroom. */
118 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
119                 + sizeof(struct rte_mbuf)))
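/*
 * Used by the zero-copy path to stash per-mbuf metadata in the headroom,
 * e.g. (illustrative) saving the vring descriptor index of an attached
 * buffer:
 *     MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
 */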
120
121 /* true if x is a power of 2 */
122 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
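/* e.g. POWEROF2(64): (63 & 64) == 0 -> true; POWEROF2(48): (47 & 48) != 0 -> false. */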
123
124 #define INVALID_PORT_ID 0xFF
125
126 /* Max number of devices. Limited by vmdq. */
127 #define MAX_DEVICES 64
128
129 /* Size of buffers used for snprintfs. */
130 #define MAX_PRINT_BUFF 6072
131
132 /* Maximum character device basename size. */
133 #define MAX_BASENAME_SZ 10
134
135 /* Maximum long option length for option parsing. */
136 #define MAX_LONG_OPT_SZ 64
137
138 /* Used to compare MAC addresses. */
139 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
140
141 /* Number of descriptors per cacheline. */
142 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
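/*
 * e.g. with a 64-byte cache line and a 16-byte struct vring_desc, this
 * evaluates to 64 / 16 = 4 descriptors per cache line.
 */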
143
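/* True if mb is an indirect mbuf, i.e. its data buffer lives in another (direct) mbuf. */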
144 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
145
146 /* mask of enabled ports */
147 static uint32_t enabled_port_mask = 0;
148
149 /* Promiscuous mode */
150 static uint32_t promiscuous;
151
152 /*Number of switching cores enabled*/
153 static uint32_t num_switching_cores = 0;
154
155 /* number of devices/queues to support*/
156 static uint32_t num_queues = 0;
157 static uint32_t num_devices;
158
159 /*
160  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
161  * descriptors. Disabled by default.
162  */
163 static uint32_t zero_copy;
164 static int mergeable;
165
166 /* Do vlan strip on the host, enabled by default */
167 static uint32_t vlan_strip = 1;
168
169 /* number of descriptors to apply*/
170 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
171 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
172
173 /* Max ring descriptors; ixgbe, i40e and e1000 all support up to 4096. */
174 #define MAX_RING_DESC 4096
175
176 struct vpool {
177         struct rte_mempool *pool;
178         struct rte_ring *ring;
179         uint32_t buf_size;
180 } vpool_array[MAX_QUEUES+MAX_QUEUES];
181
182 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
183 typedef enum {
184         VM2VM_DISABLED = 0,
185         VM2VM_SOFTWARE = 1,
186         VM2VM_HARDWARE = 2,
187         VM2VM_LAST
188 } vm2vm_type;
189 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
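/*
 * VM2VM_SOFTWARE compares MACs and forwards locally in this application
 * (see virtio_tx_local() below); VM2VM_HARDWARE presumably relies on the
 * NIC's VMDQ loopback, with the VLAN tag selecting the destination pool.
 */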
190
191 /* The type of host physical address translated from guest physical address. */
192 typedef enum {
193         PHYS_ADDR_CONTINUOUS = 0,
194         PHYS_ADDR_CROSS_SUBREG = 1,
195         PHYS_ADDR_INVALID = 2,
196         PHYS_ADDR_LAST
197 } hpa_type;
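/*
 * gpa_to_hpa() below sets PHYS_ADDR_CROSS_SUBREG when a buffer starts inside
 * a region but (guest_pa + buf_len - 1) runs past that region's end.
 */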
198
199 /* Enable stats. */
200 static uint32_t enable_stats = 0;
201 /* Enable retries on RX. */
202 static uint32_t enable_retry = 1;
203 /* Specify timeout (in useconds) between retries on RX. */
204 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
205 /* Specify the number of retries on RX. */
206 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
207
208 /* Character device basename. Can be set by user. */
209 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
210
211 /* Empty VMDQ configuration structure. Filled in programmatically. */
212 static struct rte_eth_conf vmdq_conf_default = {
213         .rxmode = {
214                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
215                 .split_hdr_size = 0,
216                 .header_split   = 0, /**< Header Split disabled */
217                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
218                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
219                 /*
220                  * This is necessary for 1G NICs such as the I350; it
221                  * fixes a bug where IPv4 forwarding in the guest cannot
222                  * forward packets from one virtio device to another.
223                  */
224                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
225                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
226                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
227         },
228
229         .txmode = {
230                 .mq_mode = ETH_MQ_TX_NONE,
231         },
232         .rx_adv_conf = {
233                 /*
234                  * should be overridden separately in code with
235                  * appropriate values
236                  */
237                 .vmdq_rx_conf = {
238                         .nb_queue_pools = ETH_8_POOLS,
239                         .enable_default_pool = 0,
240                         .default_pool = 0,
241                         .nb_pool_maps = 0,
242                         .pool_map = {{0, 0},},
243                 },
244         },
245 };
246
247 static unsigned lcore_ids[RTE_MAX_LCORE];
248 static uint8_t ports[RTE_MAX_ETHPORTS];
249 static unsigned num_ports = 0; /**< The number of ports specified in command line */
250 static uint16_t num_pf_queues, num_vmdq_queues;
251 static uint16_t vmdq_pool_base, vmdq_queue_base;
252 static uint16_t queues_per_pool;
253
254 static const uint16_t external_pkt_default_vlan_tag = 2000;
255 const uint16_t vlan_tags[] = {
256         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
257         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
258         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
259         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
260         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
261         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
262         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
263         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
264 };
265
266 /* ethernet addresses of ports */
267 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
268
269 /* heads for the main used and free linked lists for the data path. */
270 static struct virtio_net_data_ll *ll_root_used = NULL;
271 static struct virtio_net_data_ll *ll_root_free = NULL;
272
273 /* Array of data core structures containing information on individual core linked lists. */
274 static struct lcore_info lcore_info[RTE_MAX_LCORE];
275
276 /* Used for queueing bursts of TX packets. */
277 struct mbuf_table {
278         unsigned len;
279         unsigned txq_id;
280         struct rte_mbuf *m_table[MAX_PKT_BURST];
281 };
282
283 /* TX queue for each data core. */
284 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
285
286 /* TX queue for each virtio device for zero copy. */
287 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
288
289 /* Vlan header struct used to insert vlan tags on TX. */
290 struct vlan_ethhdr {
291         unsigned char   h_dest[ETH_ALEN];
292         unsigned char   h_source[ETH_ALEN];
293         __be16          h_vlan_proto;
294         __be16          h_vlan_TCI;
295         __be16          h_vlan_encapsulated_proto;
296 };
297
298 /* Header lengths. */
299 #define VLAN_HLEN       4
300 #define VLAN_ETH_HLEN   18
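/*
 * Illustrative sketch only (not part of the original datapath): software
 * VLAN insertion using struct vlan_ethhdr above. The TX fast path in this
 * file requests hardware insertion via PKT_TX_VLAN_PKT instead. The helper
 * name sw_vlan_insert() is hypothetical; it assumes <string.h> (memmove)
 * and rte_byteorder.h are available through the includes above.
 */
static inline int
sw_vlan_insert(struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct vlan_ethhdr *vh;
	char *p = rte_pktmbuf_prepend(m, VLAN_HLEN);

	if (p == NULL)
		return -1;	/* not enough headroom */

	/*
	 * Shift the dst/src MACs to the new frame start; the original
	 * EtherType then becomes h_vlan_encapsulated_proto.
	 */
	memmove(p, p + VLAN_HLEN, 2 * ETH_ALEN);
	vh = (struct vlan_ethhdr *)p;
	vh->h_vlan_proto = rte_cpu_to_be_16(ETH_P_8021Q);
	vh->h_vlan_TCI = rte_cpu_to_be_16(vlan_tag);
	return 0;
}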
301
302 /* Per-device statistics struct */
303 struct device_statistics {
304         uint64_t tx_total;
305         rte_atomic64_t rx_total_atomic;
306         uint64_t rx_total;
307         uint64_t tx;
308         rte_atomic64_t rx_atomic;
309         uint64_t rx;
310 } __rte_cache_aligned;
311 struct device_statistics dev_statistics[MAX_DEVICES];
312
313 /*
314  * Builds up the correct configuration for VMDQ VLAN pool map
315  * according to the pool & queue limits.
316  */
317 static inline int
318 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
319 {
320         struct rte_eth_vmdq_rx_conf conf;
321         struct rte_eth_vmdq_rx_conf *def_conf =
322                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
323         unsigned i;
324
325         memset(&conf, 0, sizeof(conf));
326         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
327         conf.nb_pool_maps = num_devices;
328         conf.enable_loop_back = def_conf->enable_loop_back;
329         conf.rx_mode = def_conf->rx_mode;
330
331         for (i = 0; i < conf.nb_pool_maps; i++) {
332                 conf.pool_map[i].vlan_id = vlan_tags[i];
333                 conf.pool_map[i].pools = (1UL << i);
334         }
335
336         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
337         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
338                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
339         return 0;
340 }
341
342 /*
343  * Validate the device number against the max pool number obtained from
344  * dev_info. If the device number is invalid, give the error message and
345  * return -1. Each device must have its own pool.
346  */
347 static inline int
348 validate_num_devices(uint32_t max_nb_devices)
349 {
350         if (num_devices > max_nb_devices) {
351                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
352                 return -1;
353         }
354         return 0;
355 }
356
357 /*
358  * Initialises a given port using global settings and with the rx buffers
359  * coming from the mbuf_pool passed as a parameter
360  */
361 static inline int
362 port_init(uint8_t port)
363 {
364         struct rte_eth_dev_info dev_info;
365         struct rte_eth_conf port_conf;
366         struct rte_eth_rxconf *rxconf;
367         struct rte_eth_txconf *txconf;
368         int16_t rx_rings, tx_rings;
369         uint16_t rx_ring_size, tx_ring_size;
370         int retval;
371         uint16_t q;
372
373         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
374         rte_eth_dev_info_get(port, &dev_info);
375
376         if (dev_info.max_rx_queues > MAX_QUEUES) {
377                 rte_exit(EXIT_FAILURE,
378                         "please define MAX_QUEUES no less than %u in %s\n",
379                         dev_info.max_rx_queues, __FILE__);
380         }
381
382         rxconf = &dev_info.default_rxconf;
383         txconf = &dev_info.default_txconf;
384         rxconf->rx_drop_en = 1;
385
386         /* Enable vlan offload */
387         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
388
389         /*
390          * Zero copy defers queue RX/TX start to the time when guest
391          * finishes its startup and packet buffers from that guest are
392          * available.
393          */
394         if (zero_copy) {
395                 rxconf->rx_deferred_start = 1;
396                 rxconf->rx_drop_en = 0;
397                 txconf->tx_deferred_start = 1;
398         }
399
400         /* Configure the number of supported virtio devices based on VMDQ limits. */
401         num_devices = dev_info.max_vmdq_pools;
402
403         if (zero_copy) {
404                 rx_ring_size = num_rx_descriptor;
405                 tx_ring_size = num_tx_descriptor;
406                 tx_rings = dev_info.max_tx_queues;
407         } else {
408                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
409                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
410                 tx_rings = (uint16_t)rte_lcore_count();
411         }
412
413         retval = validate_num_devices(MAX_DEVICES);
414         if (retval < 0)
415                 return retval;
416
417         /* Get port configuration. */
418         retval = get_eth_conf(&port_conf, num_devices);
419         if (retval < 0)
420                 return retval;
421         /* NIC queues are divided into pf queues and vmdq queues.  */
422         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
423         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
424         num_vmdq_queues = num_devices * queues_per_pool;
425         num_queues = num_pf_queues + num_vmdq_queues;
426         vmdq_queue_base = dev_info.vmdq_queue_base;
427         vmdq_pool_base  = dev_info.vmdq_pool_base;
428         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
429                 num_pf_queues, num_devices, queues_per_pool);
430
431         if (port >= rte_eth_dev_count()) return -1;
432
433         rx_rings = (uint16_t)dev_info.max_rx_queues;
434         /* Configure ethernet device. */
435         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
436         if (retval != 0)
437                 return retval;
438
439         /* Setup the queues. */
440         for (q = 0; q < rx_rings; q ++) {
441                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
442                                                 rte_eth_dev_socket_id(port),
443                                                 rxconf,
444                                                 vpool_array[q].pool);
445                 if (retval < 0)
446                         return retval;
447         }
448         for (q = 0; q < tx_rings; q ++) {
449                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
450                                                 rte_eth_dev_socket_id(port),
451                                                 txconf);
452                 if (retval < 0)
453                         return retval;
454         }
455
456         /* Start the device. */
457         retval  = rte_eth_dev_start(port);
458         if (retval < 0) {
459                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
460                 return retval;
461         }
462
463         if (promiscuous)
464                 rte_eth_promiscuous_enable(port);
465
466         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
467         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
468         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
469                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
470                         (unsigned)port,
471                         vmdq_ports_eth_addr[port].addr_bytes[0],
472                         vmdq_ports_eth_addr[port].addr_bytes[1],
473                         vmdq_ports_eth_addr[port].addr_bytes[2],
474                         vmdq_ports_eth_addr[port].addr_bytes[3],
475                         vmdq_ports_eth_addr[port].addr_bytes[4],
476                         vmdq_ports_eth_addr[port].addr_bytes[5]);
477
478         return 0;
479 }
480
481 /*
482  * Set character device basename.
483  */
484 static int
485 us_vhost_parse_basename(const char *q_arg)
486 {
487         /* Validate the basename length, leaving room for the NUL terminator. */
488
489         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
490                 return -1;
491         else
492                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
493
494         return 0;
495 }
496
497 /*
498  * Parse the portmask provided at run time.
499  */
500 static int
501 parse_portmask(const char *portmask)
502 {
503         char *end = NULL;
504         unsigned long pm;
505
506         errno = 0;
507
508         /* parse hexadecimal string */
509         pm = strtoul(portmask, &end, 16);
510         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
511                 return -1;
512
513         if (pm == 0)
514                 return -1;
515
516         return pm;
517
518 }
519
520 /*
521  * Parse num options at run time.
522  */
523 static int
524 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
525 {
526         char *end = NULL;
527         unsigned long num;
528
529         errno = 0;
530
531         /* parse unsigned int string */
532         num = strtoul(q_arg, &end, 10);
533         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
534                 return -1;
535
536         if (num > max_valid_value)
537                 return -1;
538
539         return num;
540
541 }
542
543 /*
544  * Display usage
545  */
546 static void
547 us_vhost_usage(const char *prgname)
548 {
549         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
550         "               --vm2vm [0|1|2]\n"
551         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
552         "               --dev-basename <name>\n"
553         "               --vlan-strip [0|1] --zero-copy [0|1]\n"
554         "               -p PORTMASK: Set mask for ports to be used by application\n"
555         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
556         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
557         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
558         "               --rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
559         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
560         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
561         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
562         "               --dev-basename: The basename to be used for the character device.\n"
563         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
564                         "zero copy\n"
565         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
566                         "used only when zero copy is enabled.\n"
567         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
568                         "used only when zero copy is enabled.\n",
569                prgname);
570 }
571
572 /*
573  * Parse the arguments given in the command line of the application.
574  */
575 static int
576 us_vhost_parse_args(int argc, char **argv)
577 {
578         int opt, ret;
579         int option_index;
580         unsigned i;
581         const char *prgname = argv[0];
582         static struct option long_option[] = {
583                 {"vm2vm", required_argument, NULL, 0},
584                 {"rx-retry", required_argument, NULL, 0},
585                 {"rx-retry-delay", required_argument, NULL, 0},
586                 {"rx-retry-num", required_argument, NULL, 0},
587                 {"mergeable", required_argument, NULL, 0},
588                 {"vlan-strip", required_argument, NULL, 0},
589                 {"stats", required_argument, NULL, 0},
590                 {"dev-basename", required_argument, NULL, 0},
591                 {"zero-copy", required_argument, NULL, 0},
592                 {"rx-desc-num", required_argument, NULL, 0},
593                 {"tx-desc-num", required_argument, NULL, 0},
594                 {NULL, 0, 0, 0},
595         };
596
597         /* Parse command line */
598         while ((opt = getopt_long(argc, argv, "p:P",
599                         long_option, &option_index)) != EOF) {
600                 switch (opt) {
601                 /* Portmask */
602                 case 'p':
603                         enabled_port_mask = parse_portmask(optarg);
604                         if (enabled_port_mask == (uint32_t)-1) {
605                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
606                                 us_vhost_usage(prgname);
607                                 return -1;
608                         }
609                         break;
610
611                 case 'P':
612                         promiscuous = 1;
613                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
614                                 ETH_VMDQ_ACCEPT_BROADCAST |
615                                 ETH_VMDQ_ACCEPT_MULTICAST;
616                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
617
618                         break;
619
620                 case 0:
621                         /* Enable/disable vm2vm comms. */
622                         if (!strncmp(long_option[option_index].name, "vm2vm",
623                                 MAX_LONG_OPT_SZ)) {
624                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
625                                 if (ret == -1) {
626                                         RTE_LOG(INFO, VHOST_CONFIG,
627                                                 "Invalid argument for "
628                                                 "vm2vm [0|1|2]\n");
629                                         us_vhost_usage(prgname);
630                                         return -1;
631                                 } else {
632                                         vm2vm_mode = (vm2vm_type)ret;
633                                 }
634                         }
635
636                         /* Enable/disable retries on RX. */
637                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
638                                 ret = parse_num_opt(optarg, 1);
639                                 if (ret == -1) {
640                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
641                                         us_vhost_usage(prgname);
642                                         return -1;
643                                 } else {
644                                         enable_retry = ret;
645                                 }
646                         }
647
648                         /* Specify the retries delay time (in useconds) on RX. */
649                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
650                                 ret = parse_num_opt(optarg, INT32_MAX);
651                                 if (ret == -1) {
652                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
653                                         us_vhost_usage(prgname);
654                                         return -1;
655                                 } else {
656                                         burst_rx_delay_time = ret;
657                                 }
658                         }
659
660                         /* Specify the retries number on RX. */
661                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
662                                 ret = parse_num_opt(optarg, INT32_MAX);
663                                 if (ret == -1) {
664                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
665                                         us_vhost_usage(prgname);
666                                         return -1;
667                                 } else {
668                                         burst_rx_retry_num = ret;
669                                 }
670                         }
671
672                         /* Enable/disable RX mergeable buffers. */
673                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
674                                 ret = parse_num_opt(optarg, 1);
675                                 if (ret == -1) {
676                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
677                                         us_vhost_usage(prgname);
678                                         return -1;
679                                 } else {
680                                         mergeable = !!ret;
681                                         if (ret) {
682                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
683                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
684                                                         = JUMBO_FRAME_MAX_SIZE;
685                                         }
686                                 }
687                         }
688
689                         /* Enable/disable RX VLAN strip on host. */
690                         if (!strncmp(long_option[option_index].name,
691                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
692                                 ret = parse_num_opt(optarg, 1);
693                                 if (ret == -1) {
694                                         RTE_LOG(INFO, VHOST_CONFIG,
695                                                 "Invalid argument for VLAN strip [0|1]\n");
696                                         us_vhost_usage(prgname);
697                                         return -1;
698                                 } else {
699                                         vlan_strip = !!ret;
700                                         vmdq_conf_default.rxmode.hw_vlan_strip =
701                                                 vlan_strip;
702                                 }
703                         }
704
705                         /* Enable/disable stats. */
706                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
707                                 ret = parse_num_opt(optarg, INT32_MAX);
708                                 if (ret == -1) {
709                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
710                                         us_vhost_usage(prgname);
711                                         return -1;
712                                 } else {
713                                         enable_stats = ret;
714                                 }
715                         }
716
717                         /* Set character device basename. */
718                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
719                                 if (us_vhost_parse_basename(optarg) == -1) {
720                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ - 1);
721                                         us_vhost_usage(prgname);
722                                         return -1;
723                                 }
724                         }
725
726                         /* Enable/disable rx/tx zero copy. */
727                         if (!strncmp(long_option[option_index].name,
728                                 "zero-copy", MAX_LONG_OPT_SZ)) {
729                                 ret = parse_num_opt(optarg, 1);
730                                 if (ret == -1) {
731                                         RTE_LOG(INFO, VHOST_CONFIG,
732                                                 "Invalid argument"
733                                                 " for zero-copy [0|1]\n");
734                                         us_vhost_usage(prgname);
735                                         return -1;
736                                 } else
737                                         zero_copy = ret;
738                         }
739
740                         /* Specify the descriptor number on RX. */
741                         if (!strncmp(long_option[option_index].name,
742                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
743                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
744                                 if ((ret == -1) || (!POWEROF2(ret))) {
745                                         RTE_LOG(INFO, VHOST_CONFIG,
746                                         "Invalid argument for rx-desc-num [0-N], "
747                                         "power of 2 required.\n");
748                                         us_vhost_usage(prgname);
749                                         return -1;
750                                 } else {
751                                         num_rx_descriptor = ret;
752                                 }
753                         }
754
755                         /* Specify the descriptor number on TX. */
756                         if (!strncmp(long_option[option_index].name,
757                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
758                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
759                                 if ((ret == -1) || (!POWEROF2(ret))) {
760                                         RTE_LOG(INFO, VHOST_CONFIG,
761                                         "Invalid argument for tx-desc-num [0-N], "
762                                         "power of 2 required.\n");
763                                         us_vhost_usage(prgname);
764                                         return -1;
765                                 } else {
766                                         num_tx_descriptor = ret;
767                                 }
768                         }
769
770                         break;
771
772                         /* Invalid option - print options. */
773                 default:
774                         us_vhost_usage(prgname);
775                         return -1;
776                 }
777         }
778
779         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
780                 if (enabled_port_mask & (1 << i))
781                         ports[num_ports++] = (uint8_t)i;
782         }
783
784         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
785                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
786                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
787                 return -1;
788         }
789
790         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
791                 RTE_LOG(INFO, VHOST_PORT,
792                         "Vhost zero copy doesn't support software vm2vm, "
793                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
794                 return -1;
795         }
796
797         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
798                 RTE_LOG(INFO, VHOST_PORT,
799                         "Vhost zero copy doesn't support jumbo frames, "
800                         "please specify '--mergeable 0' to disable the "
801                         "mergeable feature.\n");
802                 return -1;
803         }
804
805         return 0;
806 }
807
808 /*
809  * Update the global variable num_ports and the ports[] array according to
810  * the number of system ports, and return the number of valid ports.
811  */
812 static unsigned check_ports_num(unsigned nb_ports)
813 {
814         unsigned valid_num_ports = num_ports;
815         unsigned portid;
816
817         if (num_ports > nb_ports) {
818                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
819                         num_ports, nb_ports);
820                 num_ports = nb_ports;
821         }
822
823         for (portid = 0; portid < num_ports; portid ++) {
824                 if (ports[portid] >= nb_ports) {
825                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
826                                 ports[portid], (nb_ports - 1));
827                         ports[portid] = INVALID_PORT_ID;
828                         valid_num_ports--;
829                 }
830         }
831         return valid_num_ports;
832 }
833
834 /*
835  * Macro to print out packet contents. Wrapped in debug define so that the
836  * data path is not affected when debug is disabled.
837  */
838 #ifdef DEBUG
839 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
840         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
841         unsigned int index;                                                                                                                                                                                             \
842         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
843                                                                                                                                                                                                                                         \
844         if ((header))                                                                                                                                                                                                   \
845                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
846         else                                                                                                                                                                                                                    \
847                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
848         for (index = 0; index < (size); index++) {                                                                                                                                              \
849                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
850                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
851         }                                                                                                                                                                                                                               \
852         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
853                                                                                                                                                                                                                                         \
854         LOG_DEBUG(VHOST_DATA, "%s", packet);                                                                                                                                                                    \
855 } while(0)
856 #else
857 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
858 #endif
859
860 /*
861  * Function to convert guest physical addresses to vhost physical addresses.
862  * This is used to convert virtio buffer addresses.
863  */
864 static inline uint64_t __attribute__((always_inline))
865 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
866         uint32_t buf_len, hpa_type *addr_type)
867 {
868         struct virtio_memory_regions_hpa *region;
869         uint32_t regionidx;
870         uint64_t vhost_pa = 0;
871
872         *addr_type = PHYS_ADDR_INVALID;
873
874         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
875                 region = &vdev->regions_hpa[regionidx];
876                 if ((guest_pa >= region->guest_phys_address) &&
877                         (guest_pa <= region->guest_phys_address_end)) {
878                         vhost_pa = region->host_phys_addr_offset + guest_pa;
879                         if (likely((guest_pa + buf_len - 1)
880                                 <= region->guest_phys_address_end))
881                                 *addr_type = PHYS_ADDR_CONTINUOUS;
882                         else
883                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
884                         break;
885                 }
886         }
887
888         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
889                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
890                 (void *)(uintptr_t)vhost_pa);
891
892         return vhost_pa;
893 }
894
895 /*
896  * Compares a packet destination MAC address to a device MAC address.
897  */
898 static inline int __attribute__((always_inline))
899 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
900 {
901         return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
902 }
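/*
 * Note: the comparison reads a full 8 bytes from each 6-byte address and,
 * on little-endian hosts, MAC_ADDR_CMP masks the XOR down to the low 48
 * bits, so only the 6 MAC bytes take part. The 2-byte over-read is benign
 * here because both addresses are embedded in larger structures.
 */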
903
904 /*
905  * This function learns the MAC address of the device and registers this along with a
906  * vlan tag to a VMDQ.
907  */
908 static int
909 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
910 {
911         struct ether_hdr *pkt_hdr;
912         struct virtio_net_data_ll *dev_ll;
913         struct virtio_net *dev = vdev->dev;
914         int i, ret;
915
916         /* Learn MAC address of guest device from packet */
917         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
918
919         dev_ll = ll_root_used;
920
921         while (dev_ll != NULL) {
922                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
923                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
924                         return -1;
925                 }
926                 dev_ll = dev_ll->next;
927         }
928
929         for (i = 0; i < ETHER_ADDR_LEN; i++)
930                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
931
932         /* vlan_tag currently uses the device_id. */
933         vdev->vlan_tag = vlan_tags[dev->device_fh];
934
935         /* Print out VMDQ registration info. */
936         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
937                 dev->device_fh,
938                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
939                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
940                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
941                 vdev->vlan_tag);
942
943         /* Register the MAC address. */
944         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
945                                 (uint32_t)dev->device_fh + vmdq_pool_base);
946         if (ret)
947                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
948                                         dev->device_fh);
949
950         /* Enable stripping of the vlan tag as we handle routing. */
951         if (vlan_strip)
952                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
953                         (uint16_t)vdev->vmdq_rx_q, 1);
954
955         /* Set device as ready for RX. */
956         vdev->ready = DEVICE_RX;
957
958         return 0;
959 }
960
961 /*
962  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
963  * queue before disabling RX on the device.
964  */
965 static inline void
966 unlink_vmdq(struct vhost_dev *vdev)
967 {
968         unsigned i = 0;
969         unsigned rx_count;
970         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
971
972         if (vdev->ready == DEVICE_RX) {
973                 /*clear MAC and VLAN settings*/
974                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
975                 for (i = 0; i < 6; i++)
976                         vdev->mac_address.addr_bytes[i] = 0;
977
978                 vdev->vlan_tag = 0;
979
980                 /*Clear out the receive buffers*/
981                 rx_count = rte_eth_rx_burst(ports[0],
982                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
983
984                 while (rx_count) {
985                         for (i = 0; i < rx_count; i++)
986                                 rte_pktmbuf_free(pkts_burst[i]);
987
988                         rx_count = rte_eth_rx_burst(ports[0],
989                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
990                 }
991
992                 vdev->ready = DEVICE_MAC_LEARNING;
993         }
994 }
995
996 /*
997  * Check if the packet destination MAC address is for a local device. If so
998  * then put the packet on that device's RX queue. If not then return.
999  */
1000 static inline int __attribute__((always_inline))
1001 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1002 {
1003         struct virtio_net_data_ll *dev_ll;
1004         struct ether_hdr *pkt_hdr;
1005         uint64_t ret = 0;
1006         struct virtio_net *dev = vdev->dev;
1007         struct virtio_net *tdev; /* destination virtio device */
1008
1009         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1010
1011         /*get the used devices list*/
1012         dev_ll = ll_root_used;
1013
1014         while (dev_ll != NULL) {
1015                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1016                                           &dev_ll->vdev->mac_address)) {
1017
1018                         /* Drop the packet if the TX packet is destined for the TX device. */
1019                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1020                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1021                                                         dev->device_fh);
1022                                 return 0;
1023                         }
1024                         tdev = dev_ll->vdev->dev;
1025
1026
1027                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1028
1029                         if (unlikely(dev_ll->vdev->remove)) {
1030                                 /*drop the packet if the device is marked for removal*/
1031                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1032                         } else {
1033                                 /*send the packet to the local virtio device*/
1034                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1035                                 if (enable_stats) {
1036                                         rte_atomic64_add(
1037                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1038                                         1);
1039                                         rte_atomic64_add(
1040                                         &dev_statistics[tdev->device_fh].rx_atomic,
1041                                         ret);
1042                                         dev_statistics[dev->device_fh].tx_total++;
1043                                         dev_statistics[dev->device_fh].tx += ret;
1044                                 }
1045                         }
1046
1047                         return 0;
1048                 }
1049                 dev_ll = dev_ll->next;
1050         }
1051
1052         return -1;
1053 }
1054
1055 /*
1056  * Check if the destination MAC of a packet belongs to a local VM, and if
1057  * so, retrieve its vlan tag and the packet length offset.
1058  */
1059 static inline int __attribute__((always_inline))
1060 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1061         uint32_t *offset, uint16_t *vlan_tag)
1062 {
1063         struct virtio_net_data_ll *dev_ll = ll_root_used;
1064         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1065
1066         while (dev_ll != NULL) {
1067                 if ((dev_ll->vdev->ready == DEVICE_RX)
1068                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1069                 &dev_ll->vdev->mac_address)) {
1070                         /*
1071                          * Drop the packet if the TX packet is
1072                          * destined for the TX device.
1073                          */
1074                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1075                                 LOG_DEBUG(VHOST_DATA,
1076                                 "(%"PRIu64") TX: Source and destination"
1077                                 " MAC addresses are the same. Dropping "
1078                                 "packet.\n",
1079                                 dev_ll->vdev->dev->device_fh);
1080                                 return -1;
1081                         }
1082
1083                         /*
1084                          * HW vlan strip will reduce the packet length
1085                          * by the length of the vlan tag, so we need to
1086                          * restore the packet length by adding it back.
1087                          */
1088                         *offset = VLAN_HLEN;
1089                         *vlan_tag =
1090                         (uint16_t)
1091                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1092
1093                         LOG_DEBUG(VHOST_DATA,
1094                         "(%"PRIu64") TX: pkt to local VM device id:"
1095                         "(%"PRIu64") vlan tag: %d.\n",
1096                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1097                         (int)*vlan_tag);
1098
1099                         break;
1100                 }
1101                 dev_ll = dev_ll->next;
1102         }
1103         return 0;
1104 }
1105
1106 /*
1107  * This function routes the TX packet to the correct interface. This may be a local device
1108  * or the physical port.
1109  */
1110 static inline void __attribute__((always_inline))
1111 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1112 {
1113         struct mbuf_table *tx_q;
1114         struct rte_mbuf **m_table;
1115         unsigned len, ret, offset = 0;
1116         const uint16_t lcore_id = rte_lcore_id();
1117         struct virtio_net *dev = vdev->dev;
1118         struct ether_hdr *nh;
1119
1120         /*check if destination is local VM*/
1121         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1122                 rte_pktmbuf_free(m);
1123                 return;
1124         }
1125
1126         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1127                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1128                         rte_pktmbuf_free(m);
1129                         return;
1130                 }
1131         }
1132
1133         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1134
1135         /*Add packet to the port tx queue*/
1136         tx_q = &lcore_tx_queue[lcore_id];
1137         len = tx_q->len;
1138
1139         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1140         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1141                 /* Guest has inserted the vlan tag. */
1142                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1143                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1144                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1145                         (vh->vlan_tci != vlan_tag_be))
1146                         vh->vlan_tci = vlan_tag_be;
1147         } else {
1148                 m->ol_flags = PKT_TX_VLAN_PKT;
1149
1150                 /*
1151                  * Find the right seg to adjust the data len when offset is
1152                  * bigger than tail room size.
1153                  */
1154                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1155                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1156                                 m->data_len += offset;
1157                         else {
1158                                 struct rte_mbuf *seg = m;
1159
1160                                 while ((seg->next != NULL) &&
1161                                         (offset > rte_pktmbuf_tailroom(seg)))
1162                                         seg = seg->next;
1163
1164                                 seg->data_len += offset;
1165                         }
1166                         m->pkt_len += offset;
1167                 }
1168
1169                 m->vlan_tci = vlan_tag;
1170         }
1171
1172         tx_q->m_table[len] = m;
1173         len++;
1174         if (enable_stats) {
1175                 dev_statistics[dev->device_fh].tx_total++;
1176                 dev_statistics[dev->device_fh].tx++;
1177         }
1178
1179         if (unlikely(len == MAX_PKT_BURST)) {
1180                 m_table = (struct rte_mbuf **)tx_q->m_table;
1181                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1182                 /* Free any buffers not handled by TX and update the port stats. */
1183                 if (unlikely(ret < len)) {
1184                         do {
1185                                 rte_pktmbuf_free(m_table[ret]);
1186                         } while (++ret < len);
1187                 }
1188
1189                 len = 0;
1190         }
1191
1192         tx_q->len = len;
1193         return;
1194 }
1195 /*
1196  * This function is called by each data core. It handles all RX/TX registered with the
1197  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1198  * with all devices in the main linked list.
1199  */
1200 static int
1201 switch_worker(void *arg)
1202 {
1203         struct rte_mempool *mbuf_pool = arg;
1204         struct virtio_net *dev = NULL;
1205         struct vhost_dev *vdev = NULL;
1206         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1207         struct virtio_net_data_ll *dev_ll;
1208         struct mbuf_table *tx_q;
1209         volatile struct lcore_ll_info *lcore_ll;
1210         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
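        /* e.g. with a 2 GHz TSC: ceil(2e9 / 1e6) * 100 = 200,000 cycles, i.e. ~100 us */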
1211         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1212         unsigned ret, i;
1213         const uint16_t lcore_id = rte_lcore_id();
1214         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1215         uint16_t rx_count = 0;
1216         uint16_t tx_count;
1217         uint32_t retry = 0;
1218
1219         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1220         lcore_ll = lcore_info[lcore_id].lcore_ll;
1221         prev_tsc = 0;
1222
1223         tx_q = &lcore_tx_queue[lcore_id];
1224         for (i = 0; i < num_cores; i ++) {
1225                 if (lcore_ids[i] == lcore_id) {
1226                         tx_q->txq_id = i;
1227                         break;
1228                 }
1229         }
1230
1231         while(1) {
1232                 cur_tsc = rte_rdtsc();
1233                 /*
1234                  * TX burst queue drain
1235                  */
1236                 diff_tsc = cur_tsc - prev_tsc;
1237                 if (unlikely(diff_tsc > drain_tsc)) {
1238
1239                         if (tx_q->len) {
1240                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1241
1242                                 /*Tx any packets in the queue*/
1243                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1244                                                                            (struct rte_mbuf **)tx_q->m_table,
1245                                                                            (uint16_t)tx_q->len);
1246                                 if (unlikely(ret < tx_q->len)) {
1247                                         do {
1248                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1249                                         } while (++ret < tx_q->len);
1250                                 }
1251
1252                                 tx_q->len = 0;
1253                         }
1254
1255                         prev_tsc = cur_tsc;
1256
1257                 }
1258
1259                 rte_prefetch0(lcore_ll->ll_root_used);
1260                 /*
1261                  * Inform the configuration core that we have exited the linked list and that no devices are
1262                  * in use if requested.
1263                  */
1264                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1265                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1266
1267                 /*
1268                  * Process devices
1269                  */
1270                 dev_ll = lcore_ll->ll_root_used;
1271
1272                 while (dev_ll != NULL) {
1273                         /*get virtio device ID*/
1274                         vdev = dev_ll->vdev;
1275                         dev = vdev->dev;
1276
1277                         if (unlikely(vdev->remove)) {
1278                                 dev_ll = dev_ll->next;
1279                                 unlink_vmdq(vdev);
1280                                 vdev->ready = DEVICE_SAFE_REMOVE;
1281                                 continue;
1282                         }
1283                         if (likely(vdev->ready == DEVICE_RX)) {
1284                                 /*Handle guest RX*/
1285                                 rx_count = rte_eth_rx_burst(ports[0],
1286                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1287
1288                                 if (rx_count) {
1289                                         /*
1290                                         * If retry is enabled and the queue is full, wait and retry to avoid packet loss.
1291                                         * Here MAX_PKT_BURST must be less than the virtio queue size.
1292                                         */
1293                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1294                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1295                                                         rte_delay_us(burst_rx_delay_time);
1296                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1297                                                                 break;
1298                                                 }
1299                                         }
1300                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1301                                         if (enable_stats) {
1302                                                 rte_atomic64_add(
1303                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1304                                                 rx_count);
1305                                                 rte_atomic64_add(
1306                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1307                                         }
1308                                         while (likely(rx_count)) {
1309                                                 rx_count--;
1310                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1311                                         }
1312
1313                                 }
1314                         }
1315
1316                         if (likely(!vdev->remove)) {
1317                                 /* Handle guest TX*/
1318                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1319                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1320                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1321                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1322                                                 while (tx_count)
1323                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1324                                         }
1325                                 }
1326                                 while (tx_count)
1327                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1328                         }
1329
1330                         /*move to the next device in the list*/
1331                         dev_ll = dev_ll->next;
1332                 }
1333         }
1334
1335         return 0;
1336 }
1337
1338 /*
1339  * This function gets the number of available ring entries for zero copy RX.
1340  * Only one thread will call this function for a particular virtio device,
1341  * so it is designed as a non-thread-safe function.
1342  */
1343 static inline uint32_t __attribute__((always_inline))
1344 get_available_ring_num_zcp(struct virtio_net *dev)
1345 {
1346         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1347         uint16_t avail_idx;
1348
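        /*
         * avail->idx and last_used_idx_res are free-running uint16_t
         * counters, so the subtraction below stays correct across index
         * wrap-around.
         */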
1349         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1350         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1351 }
1352
1353 /*
1354  * This function gets the available ring index for zero copy RX;
1355  * it retries 'burst_rx_retry_num' times until it reserves enough entries.
1356  * Only one thread will call this function for a particular virtio device,
1357  * so it is designed as a non-thread-safe function.
1358  */
1359 static inline uint32_t __attribute__((always_inline))
1360 get_available_ring_index_zcp(struct virtio_net *dev,
1361         uint16_t *res_base_idx, uint32_t count)
1362 {
1363         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1364         uint16_t avail_idx;
1365         uint32_t retry = 0;
1366         uint16_t free_entries;
1367
1368         *res_base_idx = vq->last_used_idx_res;
1369         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1370         free_entries = (avail_idx - *res_base_idx);
1371
1372         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1373                         "avail idx: %d, "
1374                         "res base idx:%d, free entries:%d\n",
1375                         dev->device_fh, avail_idx, *res_base_idx,
1376                         free_entries);
1377
1378         /*
1379          * If retry is enabled and the queue is full then we wait
1380          * and retry to avoid packet loss.
1381          */
1382         if (enable_retry && unlikely(count > free_entries)) {
1383                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1384                         rte_delay_us(burst_rx_delay_time);
1385                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1386                         free_entries = (avail_idx - *res_base_idx);
1387                         if (count <= free_entries)
1388                                 break;
1389                 }
1390         }
1391
1392         /* Check that we have enough buffers. */
1393         if (unlikely(count > free_entries))
1394                 count = free_entries;
1395
1396         if (unlikely(count == 0)) {
1397                 LOG_DEBUG(VHOST_DATA,
1398                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1399                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1400                         dev->device_fh, avail_idx,
1401                         *res_base_idx, free_entries);
1402                 return 0;
1403         }
1404
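        /* Reserve the entries; the next caller starts after this batch. */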
1405         vq->last_used_idx_res = *res_base_idx + count;
1406
1407         return count;
1408 }
1409
1410 /*
1411  * This function puts a descriptor back on the used list.
1412  */
1413 static inline void __attribute__((always_inline))
1414 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1415 {
1416         uint16_t res_cur_idx = vq->last_used_idx;
1417         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1418         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
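        /*
         * The compiler barrier ensures the used ring entry above is written
         * before the guest-visible used index is incremented.
         */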
1419         rte_compiler_barrier();
1420         *(volatile uint16_t *)&vq->used->idx += 1;
1421         vq->last_used_idx += 1;
1422
1423         /* Kick the guest if necessary. */
1424         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1425                 eventfd_write(vq->callfd, (eventfd_t)1);
1426 }
1427
1428 /*
1429  * This function gets an available descriptor from the virtio vring and an
1430  * unattached mbuf from vpool->ring, then attaches them together. The offsets
1431  * for buff_addr and phys_addr must be adjusted to match the PMD
1432  * implementation, otherwise frame data may land at the wrong mbuf location.
1433  */
1434 static inline void __attribute__((always_inline))
1435 attach_rxmbuf_zcp(struct virtio_net *dev)
1436 {
1437         uint16_t res_base_idx, desc_idx;
1438         uint64_t buff_addr, phys_addr;
1439         struct vhost_virtqueue *vq;
1440         struct vring_desc *desc;
1441         void *obj = NULL;
1442         struct rte_mbuf *mbuf;
1443         struct vpool *vpool;
1444         hpa_type addr_type;
1445         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1446
1447         vpool = &vpool_array[vdev->vmdq_rx_q];
1448         vq = dev->virtqueue[VIRTIO_RXQ];
1449
1450         do {
1451                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1452                                 1) != 1))
1453                         return;
1454                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1455
1456                 desc = &vq->desc[desc_idx];
1457                 if (desc->flags & VRING_DESC_F_NEXT) {
1458                         desc = &vq->desc[desc->next];
1459                         buff_addr = gpa_to_vva(dev, desc->addr);
1460                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1461                                         &addr_type);
1462                 } else {
1463                         buff_addr = gpa_to_vva(dev,
1464                                         desc->addr + vq->vhost_hlen);
1465                         phys_addr = gpa_to_hpa(vdev,
1466                                         desc->addr + vq->vhost_hlen,
1467                                         desc->len, &addr_type);
1468                 }
1469
1470                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1471                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1472                                 " address found when attaching RX frame buffer"
1473                                 " address!\n", dev->device_fh);
1474                         put_desc_to_used_list_zcp(vq, desc_idx);
1475                         continue;
1476                 }
1477
1478                 /*
1479                  * Check if the frame buffer address from guest crosses
1480                  * sub-region or not.
1481                  */
1482                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1483                         RTE_LOG(ERR, VHOST_DATA,
1484                                 "(%"PRIu64") Frame buffer address crossing "
1485                                 "sub-region found when attaching RX frame "
1486                                 "buffer address!\n",
1487                                 dev->device_fh);
1488                         put_desc_to_used_list_zcp(vq, desc_idx);
1489                         continue;
1490                 }
1491         } while (unlikely(phys_addr == 0));
1492
1493         rte_ring_sc_dequeue(vpool->ring, &obj);
1494         mbuf = obj;
1495         if (unlikely(mbuf == NULL)) {
1496                 LOG_DEBUG(VHOST_DATA,
1497                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1498                         "ring_sc_dequeue fail.\n",
1499                         dev->device_fh);
1500                 put_desc_to_used_list_zcp(vq, desc_idx);
1501                 return;
1502         }
1503
1504         if (unlikely(vpool->buf_size > desc->len)) {
1505                 LOG_DEBUG(VHOST_DATA,
1506                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1507                         "length(%d) of descriptor idx: %d less than room "
1508                         "size required: %d\n",
1509                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1510                 put_desc_to_used_list_zcp(vq, desc_idx);
1511                 rte_ring_sp_enqueue(vpool->ring, obj);
1512                 return;
1513         }
1514
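        /*
         * Zero copy: rewire the mbuf onto the guest-supplied buffer. The
         * addresses are moved back by RTE_PKTMBUF_HEADROOM so that data_off
         * lands exactly on the guest frame data.
         */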
1515         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1516         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1517         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1518         mbuf->data_len = desc->len;
1519         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1520
1521         LOG_DEBUG(VHOST_DATA,
1522                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1523                 "descriptor idx:%d\n",
1524                 dev->device_fh, res_base_idx, desc_idx);
1525
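        /*
         * Return the attached mbuf to vpool->pool with a raw free. In this
         * zero-copy setup the same pool backs the device's RX queue, so the
         * PMD can re-allocate the mbuf and the NIC can then DMA a frame
         * straight into the guest buffer attached above.
         */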
1526         __rte_mbuf_raw_free(mbuf);
1527
1528         return;
1529 }
1530
1531 /*
1532  * Detach an attached packet mbuf -
1533  *  - restore original mbuf address and length values.
1534  *  - reset pktmbuf data and data_len to their default values.
1535  *  All other fields of the given packet mbuf will be left intact.
1536  *
1537  * @param m
1538  *   The attached packet mbuf.
1539  */
1540 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1541 {
1542         const struct rte_mempool *mp = m->pool;
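        /*
         * The original buffer sits immediately after the mbuf header in the
         * mempool element, so its address and length are recomputed from the
         * pool metadata alone (no private area between header and buffer is
         * assumed here).
         */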
1543         void *buf = rte_mbuf_to_baddr(m);
1544         uint32_t buf_ofs;
1545         uint32_t buf_len = mp->elt_size - sizeof(*m);
1546         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1547
1548         m->buf_addr = buf;
1549         m->buf_len = (uint16_t)buf_len;
1550
1551         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1552                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1553         m->data_off = buf_ofs;
1554
1555         m->data_len = 0;
1556 }
1557
1558 /*
1559  * This function is called after packets have been transmitted. It fetches
1560  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1561  * also updates the used index and kicks the guest if necessary.
1562  */
1563 static inline uint32_t __attribute__((always_inline))
1564 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1565 {
1566         struct rte_mbuf *mbuf;
1567         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1568         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1569         uint32_t index = 0;
1570         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1571
1572         LOG_DEBUG(VHOST_DATA,
1573                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1574                 "clean is: %d\n",
1575                 dev->device_fh, mbuf_count);
1576         LOG_DEBUG(VHOST_DATA,
1577                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1578                 "clean is: %d\n",
1579                 dev->device_fh, rte_ring_count(vpool->ring));
1580
1581         for (index = 0; index < mbuf_count; index++) {
1582                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1583                 if (likely(MBUF_EXT_MEM(mbuf)))
1584                         pktmbuf_detach_zcp(mbuf);
1585                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1586
1587                 /* Update used index buffer information. */
1588                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1589                 vq->used->ring[used_idx].len = 0;
1590
1591                 used_idx = (used_idx + 1) & (vq->size - 1);
1592         }
1593
1594         LOG_DEBUG(VHOST_DATA,
1595                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1596                 "clean is: %d\n",
1597                 dev->device_fh, rte_mempool_count(vpool->pool));
1598         LOG_DEBUG(VHOST_DATA,
1599                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1600                 "clean is: %d\n",
1601                 dev->device_fh, rte_ring_count(vpool->ring));
1602         LOG_DEBUG(VHOST_DATA,
1603                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1604                 "vq->last_used_idx:%d\n",
1605                 dev->device_fh, vq->last_used_idx);
1606
1607         vq->last_used_idx += mbuf_count;
1608
1609         LOG_DEBUG(VHOST_DATA,
1610                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1611                 "vq->last_used_idx:%d\n",
1612                 dev->device_fh, vq->last_used_idx);
1613
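        /*
         * Barrier, then publish: advance the guest-visible used index by the
         * number of descriptors recycled above.
         */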
1614         rte_compiler_barrier();
1615
1616         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1617
1618         /* Kick guest if required. */
1619         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1620                 eventfd_write(vq->callfd, (eventfd_t)1);
1621
1622         return 0;
1623 }
1624
1625 /*
1626  * This function is called when a virtio device is destroyed.
1627  * It fetches mbufs from vpool->pool, detaches them and puts them into vpool->ring.
1628  */
1629 static void mbuf_destroy_zcp(struct vpool *vpool)
1630 {
1631         struct rte_mbuf *mbuf = NULL;
1632         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1633
1634         LOG_DEBUG(VHOST_CONFIG,
1635                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1636                 "mbuf_destroy_zcp is: %d\n",
1637                 mbuf_count);
1638         LOG_DEBUG(VHOST_CONFIG,
1639                 "in mbuf_destroy_zcp: mbuf count in ring before "
1640                 "mbuf_destroy_zcp is: %d\n",
1641                 rte_ring_count(vpool->ring));
1642
1643         for (index = 0; index < mbuf_count; index++) {
1644                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1645                 if (likely(mbuf != NULL)) {
1646                         if (likely(MBUF_EXT_MEM(mbuf)))
1647                                 pktmbuf_detach_zcp(mbuf);
1648                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1649                 }
1650         }
1651
1652         LOG_DEBUG(VHOST_CONFIG,
1653                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1654                 "mbuf_destroy_zcp is: %d\n",
1655                 rte_mempool_count(vpool->pool));
1656         LOG_DEBUG(VHOST_CONFIG,
1657                 "in mbuf_destroy_zcp: mbuf count in ring after "
1658                 "mbuf_destroy_zcp is : %d\n",
1659                 rte_ring_count(vpool->ring));
1660 }
1661
1662 /*
1663  * This function enqueues received packets to the guest RX queue (zero copy) and updates the used ring.
1664  */
1665 static inline uint32_t __attribute__((always_inline))
1666 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1667         uint32_t count)
1668 {
1669         struct vhost_virtqueue *vq;
1670         struct vring_desc *desc;
1671         struct rte_mbuf *buff;
1672         /* The virtio_hdr is initialised to 0. */
1673         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1674                 = {{0, 0, 0, 0, 0, 0}, 0};
1675         uint64_t buff_hdr_addr = 0;
1676         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1677         uint32_t head_idx, packet_success = 0;
1678         uint16_t res_cur_idx;
1679
1680         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1681
1682         if (count == 0)
1683                 return 0;
1684
1685         vq = dev->virtqueue[VIRTIO_RXQ];
1686         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1687
1688         res_cur_idx = vq->last_used_idx;
1689         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1690                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1691
1692         /* Retrieve all of the head indexes first to avoid caching issues. */
1693         for (head_idx = 0; head_idx < count; head_idx++)
1694                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1695
1696         /* Prefetch descriptor index. */
1697         rte_prefetch0(&vq->desc[head[packet_success]]);
1698
1699         while (packet_success != count) {
1700                 /* Get descriptor from available ring */
1701                 desc = &vq->desc[head[packet_success]];
1702
1703                 buff = pkts[packet_success];
1704                 LOG_DEBUG(VHOST_DATA,
1705                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1706                         "pkt[%d] descriptor idx: %d\n",
1707                         dev->device_fh, packet_success,
1708                         MBUF_HEADROOM_UINT32(buff));
1709
1710                 PRINT_PACKET(dev,
1711                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1712                         + RTE_PKTMBUF_HEADROOM),
1713                         rte_pktmbuf_data_len(buff), 0);
1714
1715                 /* Buffer address translation for virtio header. */
1716                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1717                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1718
1719                 /*
1720                  * If the descriptors are chained the header and data are
1721                  * placed in separate buffers.
1722                  */
1723                 if (desc->flags & VRING_DESC_F_NEXT) {
1724                         desc->len = vq->vhost_hlen;
1725                         desc = &vq->desc[desc->next];
1726                         desc->len = rte_pktmbuf_data_len(buff);
1727                 } else {
1728                         desc->len = packet_len;
1729                 }
1730
1731                 /* Update used ring with desc information */
1732                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1733                         = head[packet_success];
1734                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1735                         = packet_len;
1736                 res_cur_idx++;
1737                 packet_success++;
1738
1739                 /* A header is required per buffer. */
1740                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1741                         (const void *)&virtio_hdr, vq->vhost_hlen);
1742
1743                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1744
1745                 if (likely(packet_success < count)) {
1746                         /* Prefetch descriptor index. */
1747                         rte_prefetch0(&vq->desc[head[packet_success]]);
1748                 }
1749         }
1750
1751         rte_compiler_barrier();
1752
1753         LOG_DEBUG(VHOST_DATA,
1754                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1755                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1756                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1757
1758         *(volatile uint16_t *)&vq->used->idx += count;
1759         vq->last_used_idx += count;
1760
1761         LOG_DEBUG(VHOST_DATA,
1762                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1763                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1764                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1765
1766         /* Kick the guest if necessary. */
1767         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1768                 eventfd_write(vq->callfd, (eventfd_t)1);
1769
1770         return count;
1771 }
1772
1773 /*
1774  * This function routes the TX packet to the correct interface.
1775  * This may be a local device or the physical port.
1776  */
1777 static inline void __attribute__((always_inline))
1778 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1779         uint32_t desc_idx, uint8_t need_copy)
1780 {
1781         struct mbuf_table *tx_q;
1782         struct rte_mbuf **m_table;
1783         void *obj = NULL;
1784         struct rte_mbuf *mbuf;
1785         unsigned len, ret, offset = 0;
1786         struct vpool *vpool;
1787         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1788         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1789
1790         /*Add packet to the port tx queue*/
1791         tx_q = &tx_queue_zcp[vmdq_rx_q];
1792         len = tx_q->len;
1793
1794         /* Allocate an mbuf and populate the structure. */
1795         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1796         rte_ring_sc_dequeue(vpool->ring, &obj);
1797         mbuf = obj;
1798         if (unlikely(mbuf == NULL)) {
1799                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1800                 RTE_LOG(ERR, VHOST_DATA,
1801                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1802                         dev->device_fh);
1803                 put_desc_to_used_list_zcp(vq, desc_idx);
1804                 return;
1805         }
1806
1807         if (vm2vm_mode == VM2VM_HARDWARE) {
1808                 /* Avoid using a VLAN tag from any VM for an external packet,
1809                  * such as vlan_tags[dev->device_fh]; otherwise it conflicts
1810                  * during pool selection: the MAC address marks it as an external
1811                  * packet that should go to the network, while the VLAN tag marks
1812                  * it as a VM-to-VM packet to be forwarded to another VM. The
1813                  * hardware cannot resolve this ambiguity, so the packet is lost.
1814                  */
1815                 vlan_tag = external_pkt_default_vlan_tag;
1816                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1817                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1818                         __rte_mbuf_raw_free(mbuf);
1819                         return;
1820                 }
1821         }
1822
1823         mbuf->nb_segs = m->nb_segs;
1824         mbuf->next = m->next;
1825         mbuf->data_len = m->data_len + offset;
1826         mbuf->pkt_len = mbuf->data_len;
1827         if (unlikely(need_copy)) {
1828                 /* Copy the packet contents to the mbuf. */
1829                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1830                         rte_pktmbuf_mtod(m, void *),
1831                         m->data_len);
1832         } else {
1833                 mbuf->data_off = m->data_off;
1834                 mbuf->buf_physaddr = m->buf_physaddr;
1835                 mbuf->buf_addr = m->buf_addr;
1836         }
1837         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1838         mbuf->vlan_tci = vlan_tag;
1839         mbuf->l2_len = sizeof(struct ether_hdr);
1840         mbuf->l3_len = sizeof(struct ipv4_hdr);
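        /*
         * Stash the guest descriptor index in the mbuf headroom so that
         * txmbuf_clean_zcp() can return it to the used ring after TX.
         */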
1841         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1842
1843         tx_q->m_table[len] = mbuf;
1844         len++;
1845
1846         LOG_DEBUG(VHOST_DATA,
1847                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1848                 dev->device_fh,
1849                 mbuf->nb_segs,
1850                 (mbuf->next == NULL) ? "null" : "non-null");
1851
1852         if (enable_stats) {
1853                 dev_statistics[dev->device_fh].tx_total++;
1854                 dev_statistics[dev->device_fh].tx++;
1855         }
1856
1857         if (unlikely(len == MAX_PKT_BURST)) {
1858                 m_table = (struct rte_mbuf **)tx_q->m_table;
1859                 ret = rte_eth_tx_burst(ports[0],
1860                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1861
1862                 /*
1863                  * Free any buffers not handled by TX and update
1864                  * the port stats.
1865                  */
1866                 if (unlikely(ret < len)) {
1867                         do {
1868                                 rte_pktmbuf_free(m_table[ret]);
1869                         } while (++ret < len);
1870                 }
1871
1872                 len = 0;
1873                 txmbuf_clean_zcp(dev, vpool);
1874         }
1875
1876         tx_q->len = len;
1877
1878         return;
1879 }
1880
1881 /*
1882  * This function transmits all available packets in the virtio TX queue of
1883  * one virtio-net device. On the first packet it learns the MAC address and
1884  * sets up VMDQ.
1885  */
1886 static inline void __attribute__((always_inline))
1887 virtio_dev_tx_zcp(struct virtio_net *dev)
1888 {
1889         struct rte_mbuf m;
1890         struct vhost_virtqueue *vq;
1891         struct vring_desc *desc;
1892         uint64_t buff_addr = 0, phys_addr;
1893         uint32_t head[MAX_PKT_BURST];
1894         uint32_t i;
1895         uint16_t free_entries, packet_success = 0;
1896         uint16_t avail_idx;
1897         uint8_t need_copy = 0;
1898         hpa_type addr_type;
1899         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1900
1901         vq = dev->virtqueue[VIRTIO_TXQ];
1902         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1903
1904         /* If there are no available buffers then return. */
1905         if (vq->last_used_idx_res == avail_idx)
1906                 return;
1907
1908         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1909
1910         /* Prefetch available ring to retrieve head indexes. */
1911         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1912
1913         /* Get the number of free entries in the ring */
1914         free_entries = (avail_idx - vq->last_used_idx_res);
1915
1916         /* Limit to MAX_PKT_BURST. */
1917         free_entries
1918                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1919
1920         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1921                 dev->device_fh, free_entries);
1922
1923         /* Retrieve all of the head indexes first to avoid caching issues. */
1924         for (i = 0; i < free_entries; i++)
1925                 head[i]
1926                         = vq->avail->ring[(vq->last_used_idx_res + i)
1927                         & (vq->size - 1)];
1928
1929         vq->last_used_idx_res += free_entries;
1930
1931         /* Prefetch descriptor index. */
1932         rte_prefetch0(&vq->desc[head[packet_success]]);
1933         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1934
1935         while (packet_success < free_entries) {
1936                 desc = &vq->desc[head[packet_success]];
1937
1938                 /* Discard first buffer as it is the virtio header */
1939                 desc = &vq->desc[desc->next];
1940
1941                 /* Buffer address translation. */
1942                 buff_addr = gpa_to_vva(dev, desc->addr);
1943                 /* Check an extra VLAN_HLEN bytes, as room is needed to insert a VLAN tag. */
1944                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1945                         &addr_type);
1946
1947                 if (likely(packet_success < (free_entries - 1)))
1948                         /* Prefetch descriptor index. */
1949                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1950
1951                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1952                         RTE_LOG(ERR, VHOST_DATA,
1953                                 "(%"PRIu64") Invalid frame buffer address found "
1954                                 "when transmitting packets!\n",
1955                                 dev->device_fh);
1956                         packet_success++;
1957                         continue;
1958                 }
1959
1960                 /* Prefetch buffer address. */
1961                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1962
1963                 /*
1964                  * Setup dummy mbuf. This is copied to a real mbuf if
1965                  * transmitted out the physical port.
1966                  */
1967                 m.data_len = desc->len;
1968                 m.nb_segs = 1;
1969                 m.next = NULL;
1970                 m.data_off = 0;
1971                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1972                 m.buf_physaddr = phys_addr;
1973
1974                 /*
1975                  * Check if the frame buffer address from guest crosses
1976                  * sub-region or not.
1977                  */
1978                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1979                         RTE_LOG(ERR, VHOST_DATA,
1980                                 "(%"PRIu64") Frame buffer address crossing "
1981                                 "sub-region found when attaching TX frame "
1982                                 "buffer address!\n",
1983                                 dev->device_fh);
1984                         need_copy = 1;
1985                 } else
1986                         need_copy = 0;
1987
1988                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1989
1990                 /*
1991                  * If this is the first received packet we need to learn
1992                  * the MAC and setup VMDQ
1993                  */
1994                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1995                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1996                                 /*
1997                                  * Discard frame if device is scheduled for
1998                                  * removal or a duplicate MAC address is found.
1999                                  */
2000                                 packet_success += free_entries;
2001                                 vq->last_used_idx += packet_success;
2002                                 break;
2003                         }
2004                 }
2005
2006                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2007                 packet_success++;
2008         }
2009 }
2010
2011 /*
2012  * This function is called by each data core. It handles all RX/TX registered
2013  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2014  * addresses are compared with all devices in the main linked list.
2015  */
2016 static int
2017 switch_worker_zcp(__attribute__((unused)) void *arg)
2018 {
2019         struct virtio_net *dev = NULL;
2020         struct vhost_dev  *vdev = NULL;
2021         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2022         struct virtio_net_data_ll *dev_ll;
2023         struct mbuf_table *tx_q;
2024         volatile struct lcore_ll_info *lcore_ll;
2025         const uint64_t drain_tsc
2026                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2027                 * BURST_TX_DRAIN_US;
2028         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2029         unsigned ret;
2030         const uint16_t lcore_id = rte_lcore_id();
2031         uint16_t count_in_ring, rx_count = 0;
2032
2033         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2034
2035         lcore_ll = lcore_info[lcore_id].lcore_ll;
2036         prev_tsc = 0;
2037
2038         while (1) {
2039                 cur_tsc = rte_rdtsc();
2040
2041                 /* TX burst queue drain */
2042                 diff_tsc = cur_tsc - prev_tsc;
2043                 if (unlikely(diff_tsc > drain_tsc)) {
2044                         /*
2045                          * Get mbuf from vpool.pool and detach mbuf and
2046                          * put back into vpool.ring.
2047                          */
2048                         dev_ll = lcore_ll->ll_root_used;
2049                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2050                                 /* Get virtio device ID */
2051                                 vdev = dev_ll->vdev;
2052                                 dev = vdev->dev;
2053
2054                                 if (likely(!vdev->remove)) {
2055                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2056                                         if (tx_q->len) {
2057                                                 LOG_DEBUG(VHOST_DATA,
2058                                                 "TX queue drained after timeout"
2059                                                 " with burst size %u\n",
2060                                                 tx_q->len);
2061
2062                                                 /*
2063                                                  * Tx any packets in the queue
2064                                                  */
2065                                                 ret = rte_eth_tx_burst(
2066                                                         ports[0],
2067                                                         (uint16_t)tx_q->txq_id,
2068                                                         (struct rte_mbuf **)
2069                                                         tx_q->m_table,
2070                                                         (uint16_t)tx_q->len);
2071                                                 if (unlikely(ret < tx_q->len)) {
2072                                                         do {
2073                                                                 rte_pktmbuf_free(
2074                                                                         tx_q->m_table[ret]);
2075                                                         } while (++ret < tx_q->len);
2076                                                 }
2077                                                 tx_q->len = 0;
2078
2079                                                 txmbuf_clean_zcp(dev,
2080                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2081                                         }
2082                                 }
2083                                 dev_ll = dev_ll->next;
2084                         }
2085                         prev_tsc = cur_tsc;
2086                 }
2087
2088                 rte_prefetch0(lcore_ll->ll_root_used);
2089
2090                 /*
2091                  * Inform the configuration core that we have exited the linked
2092                  * list and that no devices are in use if requested.
2093                  */
2094                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2095                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2096
2097                 /* Process devices */
2098                 dev_ll = lcore_ll->ll_root_used;
2099
2100                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2101                         vdev = dev_ll->vdev;
2102                         dev  = vdev->dev;
2103                         if (unlikely(vdev->remove)) {
2104                                 dev_ll = dev_ll->next;
2105                                 unlink_vmdq(vdev);
2106                                 vdev->ready = DEVICE_SAFE_REMOVE;
2107                                 continue;
2108                         }
2109
2110                         if (likely(vdev->ready == DEVICE_RX)) {
2111                                 uint32_t index = vdev->vmdq_rx_q;
2112                                 uint16_t i;
2113                                 count_in_ring
2114                                 = rte_ring_count(vpool_array[index].ring);
2115                                 uint16_t free_entries
2116                                 = (uint16_t)get_available_ring_num_zcp(dev);
2117
2118                                 /*
2119                                  * Attach all mbufs in vpool.ring and put back
2120                                  * into vpool.pool.
2121                                  */
2122                                 for (i = 0;
2123                                 i < RTE_MIN(free_entries,
2124                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2125                                 i++)
2126                                         attach_rxmbuf_zcp(dev);
2127
2128                                 /* Handle guest RX */
2129                                 rx_count = rte_eth_rx_burst(ports[0],
2130                                         vdev->vmdq_rx_q, pkts_burst,
2131                                         MAX_PKT_BURST);
2132
2133                                 if (rx_count) {
2134                                         ret_count = virtio_dev_rx_zcp(dev,
2135                                                         pkts_burst, rx_count);
2136                                         if (enable_stats) {
2137                                                 dev_statistics[dev->device_fh].rx_total
2138                                                         += rx_count;
2139                                                 dev_statistics[dev->device_fh].rx
2140                                                         += ret_count;
2141                                         }
2142                                         while (likely(rx_count)) {
2143                                                 rx_count--;
2144                                                 pktmbuf_detach_zcp(
2145                                                         pkts_burst[rx_count]);
2146                                                 rte_ring_sp_enqueue(
2147                                                         vpool_array[index].ring,
2148                                                         (void *)pkts_burst[rx_count]);
2149                                         }
2150                                 }
2151                         }
2152
2153                         if (likely(!vdev->remove))
2154                                 /* Handle guest TX */
2155                                 virtio_dev_tx_zcp(dev);
2156
2157                         /* Move to the next device in the list */
2158                         dev_ll = dev_ll->next;
2159                 }
2160         }
2161
2162         return 0;
2163 }
2164
2165
2166 /*
2167  * Add an entry to a used linked list. A free entry must first be found
2168  * in the free linked list using get_data_ll_free_entry();
2169  */
2170 static void
2171 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2172         struct virtio_net_data_ll *ll_dev)
2173 {
2174         struct virtio_net_data_ll *ll = *ll_root_addr;
2175
2176         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2177         ll_dev->next = NULL;
2178         rte_compiler_barrier();
2179
2180         /* If ll == NULL then this is the first device. */
2181         if (ll) {
2182                 /* Increment to the tail of the linked list. */
2183                 while (ll->next != NULL)
2184                         ll = ll->next;
2185
2186                 ll->next = ll_dev;
2187         } else {
2188                 *ll_root_addr = ll_dev;
2189         }
2190 }
2191
2192 /*
2193  * Remove an entry from a used linked list. The entry must then be added to
2194  * the free linked list using put_data_ll_free_entry().
2195  */
2196 static void
2197 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2198         struct virtio_net_data_ll *ll_dev,
2199         struct virtio_net_data_ll *ll_dev_last)
2200 {
2201         struct virtio_net_data_ll *ll = *ll_root_addr;
2202
2203         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2204                 return;
2205
2206         if (ll_dev == ll)
2207                 *ll_root_addr = ll_dev->next;
2208         else
2209                 if (likely(ll_dev_last != NULL))
2210                         ll_dev_last->next = ll_dev->next;
2211                 else
2212                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2213 }
2214
2215 /*
2216  * Find and return an entry from the free linked list.
2217  */
2218 static struct virtio_net_data_ll *
2219 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2220 {
2221         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2222         struct virtio_net_data_ll *ll_dev;
2223
2224         if (ll_free == NULL)
2225                 return NULL;
2226
2227         ll_dev = ll_free;
2228         *ll_root_addr = ll_free->next;
2229
2230         return ll_dev;
2231 }
2232
2233 /*
2234  * Place an entry back on to the free linked list.
2235  */
2236 static void
2237 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2238         struct virtio_net_data_ll *ll_dev)
2239 {
2240         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2241
2242         if (ll_dev == NULL)
2243                 return;
2244
2245         ll_dev->next = ll_free;
2246         *ll_root_addr = ll_dev;
2247 }
2248
2249 /*
2250  * Creates a linked list of a given size.
2251  */
2252 static struct virtio_net_data_ll *
2253 alloc_data_ll(uint32_t size)
2254 {
2255         struct virtio_net_data_ll *ll_new;
2256         uint32_t i;
2257
2258         /* Malloc and then chain the linked list. */
2259         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2260         if (ll_new == NULL) {
2261                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2262                 return NULL;
2263         }
2264
2265         for (i = 0; i < size - 1; i++) {
2266                 ll_new[i].vdev = NULL;
2267                 ll_new[i].next = &ll_new[i+1];
2268         }
2269         ll_new[i].next = NULL;
2270
2271         return ll_new;
2272 }
2273
2274 /*
2275  * Create the main linked list along with each individual core's linked list.
2276  * A used and a free list are created to manage entries.
2277  */
2278 static int
2279 init_data_ll(void)
2280 {
2281         int lcore;
2282
2283         RTE_LCORE_FOREACH_SLAVE(lcore) {
2284                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2285                 if (lcore_info[lcore].lcore_ll == NULL) {
2286                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2287                         return -1;
2288                 }
2289
2290                 lcore_info[lcore].lcore_ll->device_num = 0;
2291                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2292                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2293                 if (num_devices % num_switching_cores)
2294                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2295                 else
2296                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2297         }
2298
2299         /* Allocate devices up to a maximum of MAX_DEVICES. */
2300         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2301
2302         return 0;
2303 }
2304
2305 /*
2306  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2307  * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid
2308  * re-ordering of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
2309  */
2310 static void
2311 destroy_device(volatile struct virtio_net *dev)
2312 {
2313         struct virtio_net_data_ll *ll_lcore_dev_cur;
2314         struct virtio_net_data_ll *ll_main_dev_cur;
2315         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2316         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2317         struct vhost_dev *vdev;
2318         int lcore;
2319
2320         dev->flags &= ~VIRTIO_DEV_RUNNING;
2321
2322         vdev = (struct vhost_dev *)dev->priv;
2323         /* Set the remove flag. */
2324         vdev->remove = 1;
2325         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2326                 rte_pause();
2327         }
2328
2329         /* Search for entry to be removed from lcore ll */
2330         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2331         while (ll_lcore_dev_cur != NULL) {
2332                 if (ll_lcore_dev_cur->vdev == vdev) {
2333                         break;
2334                 } else {
2335                         ll_lcore_dev_last = ll_lcore_dev_cur;
2336                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2337                 }
2338         }
2339
2340         if (ll_lcore_dev_cur == NULL) {
2341                 RTE_LOG(ERR, VHOST_CONFIG,
2342                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2343                         dev->device_fh);
2344                 return;
2345         }
2346
2347         /* Search for entry to be removed from main ll */
2348         ll_main_dev_cur = ll_root_used;
2349         ll_main_dev_last = NULL;
2350         while (ll_main_dev_cur != NULL) {
2351                 if (ll_main_dev_cur->vdev == vdev) {
2352                         break;
2353                 } else {
2354                         ll_main_dev_last = ll_main_dev_cur;
2355                         ll_main_dev_cur = ll_main_dev_cur->next;
2356                 }
2357         }
2358
2359         /* Remove entries from the lcore and main ll. */
2360         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2361         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2362
2363         /* Set the dev_removal_flag on each lcore. */
2364         RTE_LCORE_FOREACH_SLAVE(lcore) {
2365                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2366         }
2367
2368         /*
2369          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2370          * they can no longer access the device removed from the linked lists and that the devices
2371          * are no longer in use.
2372          */
2373         RTE_LCORE_FOREACH_SLAVE(lcore) {
2374                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2375                         rte_pause();
2376                 }
2377         }
2378
2379         /* Add the entries back to the lcore and main free ll.*/
2380         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2381         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2382
2383         /* Decrement number of device on the lcore. */
2384         lcore_info[vdev->coreid].lcore_ll->device_num--;
2385
2386         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2387
2388         if (zero_copy) {
2389                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2390
2391                 /* Stop the RX queue. */
2392                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2393                         LOG_DEBUG(VHOST_CONFIG,
2394                                 "(%"PRIu64") In destroy_device: Failed to stop "
2395                                 "rx queue:%d\n",
2396                                 dev->device_fh,
2397                                 vdev->vmdq_rx_q);
2398                 }
2399
2400                 LOG_DEBUG(VHOST_CONFIG,
2401                         "(%"PRIu64") in destroy_device: Start putting mbufs in "
2402                         "mempool back to ring for RX queue: %d\n",
2403                         dev->device_fh, vdev->vmdq_rx_q);
2404
2405                 mbuf_destroy_zcp(vpool);
2406
2407                 /* Stop the TX queue. */
2408                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2409                         LOG_DEBUG(VHOST_CONFIG,
2410                                 "(%"PRIu64") In destroy_device: Failed to "
2411                                 "stop tx queue:%d\n",
2412                                 dev->device_fh, vdev->vmdq_rx_q);
2413                 }
2414
2415                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2416
2417                 LOG_DEBUG(VHOST_CONFIG,
2418                         "(%"PRIu64") destroy_device: Start putting mbufs in mempool "
2419                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2420                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2421                         dev->device_fh);
2422
2423                 mbuf_destroy_zcp(vpool);
2424                 rte_free(vdev->regions_hpa);
2425         }
2426         rte_free(vdev);
2427
2428 }
2429
2430 /*
2431  * Calculate the number of physically contiguous sub-regions for one
2432  * particular region whose vhost virtual address range is contiguous. The
2433  * region starts at vva_start and is 'size' bytes long.
2434  */
2435 static uint32_t
2436 check_hpa_regions(uint64_t vva_start, uint64_t size)
2437 {
2438         uint32_t i, nregions = 0, page_size = getpagesize();
2439         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2440         if (vva_start % page_size) {
2441                 LOG_DEBUG(VHOST_CONFIG,
2442                         "in check_continuous: vva start(%p) mod page_size(%d) "
2443                         "has remainder\n",
2444                         (void *)(uintptr_t)vva_start, page_size);
2445                 return 0;
2446         }
2447         if (size % page_size) {
2448                 LOG_DEBUG(VHOST_CONFIG,
2449                         "in check_continuous: "
2450                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2451                         size, page_size);
2452                 return 0;
2453         }
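        /*
         * Walk the region page by page; every discontinuity in the host
         * physical mapping starts a new sub-region.
         */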
2454         for (i = 0; i < size - page_size; i = i + page_size) {
2455                 cur_phys_addr
2456                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2457                 next_phys_addr = rte_mem_virt2phy(
2458                         (void *)(uintptr_t)(vva_start + i + page_size));
2459                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2460                         ++nregions;
2461                         LOG_DEBUG(VHOST_CONFIG,
2462                                 "in check_continuous: hva addr:(%p) is not "
2463                                 "continuous with hva addr:(%p), diff:%d\n",
2464                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2465                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2466                                 + page_size), page_size);
2467                         LOG_DEBUG(VHOST_CONFIG,
2468                                 "in check_continuous: hpa addr:(%p) is not "
2469                                 "continuous with hpa addr:(%p), "
2470                                 "diff:(%"PRIu64")\n",
2471                                 (void *)(uintptr_t)cur_phys_addr,
2472                                 (void *)(uintptr_t)next_phys_addr,
2473                                 (next_phys_addr-cur_phys_addr));
2474                 }
2475         }
2476         return nregions;
2477 }
2478
2479 /*
2480  * Divide each region whose vhost virtual address range is contiguous into
2481  * sub-regions, ensuring the physical addresses within each sub-region are
2482  * contiguous, and fill the offset (to GPA), size and other information of
2483  * each sub-region into regions_hpa.
2484  */
2485 static uint32_t
2486 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2487 {
2488         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2489         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2490
2491         if (mem_region_hpa == NULL)
2492                 return 0;
2493
2494         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
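                /*
                 * vva_start is the vhost virtual address backing this guest
                 * physical region (guest physical address plus the region's
                 * address offset).
                 */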
2495                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2496                         virtio_memory->regions[regionidx].address_offset;
2497                 mem_region_hpa[regionidx_hpa].guest_phys_address
2498                         = virtio_memory->regions[regionidx].guest_phys_address;
2499                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2500                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2501                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2502                 LOG_DEBUG(VHOST_CONFIG,
2503                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2504                         regionidx_hpa,
2505                         (void *)(uintptr_t)
2506                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2507                 LOG_DEBUG(VHOST_CONFIG,
2508                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2509                         regionidx_hpa,
2510                         (void *)(uintptr_t)
2511                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2512                 for (i = 0, k = 0;
2513                         i < virtio_memory->regions[regionidx].memory_size -
2514                                 page_size;
2515                         i += page_size) {
2516                         cur_phys_addr = rte_mem_virt2phy(
2517                                         (void *)(uintptr_t)(vva_start + i));
2518                         next_phys_addr = rte_mem_virt2phy(
2519                                         (void *)(uintptr_t)(vva_start +
2520                                         i + page_size));
2521                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2522                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2523                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2524                                         k + page_size;
2525                                 mem_region_hpa[regionidx_hpa].memory_size
2526                                         = k + page_size;
2527                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2528                                         "phys addr end  [%d]:(%p)\n",
2529                                         regionidx_hpa,
2530                                         (void *)(uintptr_t)
2531                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2532                                 LOG_DEBUG(VHOST_CONFIG,
2533                                         "in fill_hpa_regions: guest phys addr "
2534                                         "size [%d]:(%p)\n",
2535                                         regionidx_hpa,
2536                                         (void *)(uintptr_t)
2537                                         (mem_region_hpa[regionidx_hpa].memory_size));
2538                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2539                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2540                                 ++regionidx_hpa;
2541                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2542                                         next_phys_addr -
2543                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2544                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2545                                         " phys addr start[%d]:(%p)\n",
2546                                         regionidx_hpa,
2547                                         (void *)(uintptr_t)
2548                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2549                                 LOG_DEBUG(VHOST_CONFIG,
2550                                         "in fill_hpa_regions: host  phys addr "
2551                                         "start[%d]:(%p)\n",
2552                                         regionidx_hpa,
2553                                         (void *)(uintptr_t)
2554                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2555                                 k = 0;
2556                         } else {
2557                                 k += page_size;
2558                         }
2559                 }
2560                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2561                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2562                         + k + page_size;
2563                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2564                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2565                         "[%d]:(%p)\n", regionidx_hpa,
2566                         (void *)(uintptr_t)
2567                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2568                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2569                         "[%d]:(%p)\n", regionidx_hpa,
2570                         (void *)(uintptr_t)
2571                         (mem_region_hpa[regionidx_hpa].memory_size));
2572                 ++regionidx_hpa;
2573         }
2574         return regionidx_hpa;
2575 }
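/*
 * A minimal sketch (the helper name is ours; the struct fields are the ones
 * used above) of how the regions_hpa table built by fill_hpa_memory_regions()
 * is typically consumed: translate a guest physical address to a host
 * physical address by locating the sub-region that contains it and applying
 * that sub-region's offset.
 */
static inline uint64_t
gpa_to_hpa_sketch(struct vhost_dev *vdev, uint64_t guest_pa)
{
        uint32_t i;
        struct virtio_memory_regions_hpa *region;

        for (i = 0; i < vdev->nregions_hpa; i++) {
                region = &vdev->regions_hpa[i];
                if ((guest_pa >= region->guest_phys_address) &&
                        (guest_pa < region->guest_phys_address_end)) {
                        /* HPA = GPA + per-sub-region offset. */
                        return guest_pa + region->host_phys_addr_offset;
                }
        }
        return 0; /* No containing sub-region found. */
}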
2576
2577 /*
2578  * A new device is added to a data core. First the device is added to the
2579  * main linked list and is then allocated to a specific data core.
2580  */
2581 static int
2582 new_device(struct virtio_net *dev)
2583 {
2584         struct virtio_net_data_ll *ll_dev;
2585         int lcore, core_add = 0;
2586         uint32_t device_num_min = num_devices;
2587         struct vhost_dev *vdev;
2588         uint32_t regionidx;
2589
2590         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2591         if (vdev == NULL) {
2592                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2593                         dev->device_fh);
2594                 return -1;
2595         }
2596         vdev->dev = dev;
2597         dev->priv = vdev;
2598
2599         if (zero_copy) {
2600                 vdev->nregions_hpa = dev->mem->nregions;
2601                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2602                         vdev->nregions_hpa
2603                                 += check_hpa_regions(
2604                                         dev->mem->regions[regionidx].guest_phys_address
2605                                         + dev->mem->regions[regionidx].address_offset,
2606                                         dev->mem->regions[regionidx].memory_size);
2607
2608                 }
2609
2610                 vdev->regions_hpa = rte_calloc("vhost hpa region",
2611                                                vdev->nregions_hpa,
2612                                                sizeof(struct virtio_memory_regions_hpa),
2613                                                RTE_CACHE_LINE_SIZE);
2614                 if (vdev->regions_hpa == NULL) {
2615                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2616                         rte_free(vdev);
2617                         return -1;
2618                 }
2619
2621                 if (fill_hpa_memory_regions(vdev->regions_hpa,
2622                         dev->mem) != vdev->nregions_hpa) {
2625                         RTE_LOG(ERR, VHOST_CONFIG,
2626                                 "hpa memory regions number mismatch: "
2627                                 "[%d]\n", vdev->nregions_hpa);
2628                         rte_free(vdev->regions_hpa);
2629                         rte_free(vdev);
2630                         return -1;
2631                 }
2632         }
2633
2635         /* Add device to main ll */
2636         ll_dev = get_data_ll_free_entry(&ll_root_free);
2637         if (ll_dev == NULL) {
2638                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2639                         "of %d devices per core has been reached\n",
2640                         dev->device_fh, num_devices);
2641                 if (vdev->regions_hpa)
2642                         rte_free(vdev->regions_hpa);
2643                 rte_free(vdev);
2644                 return -1;
2645         }
2646         ll_dev->vdev = vdev;
2647         add_data_ll_entry(&ll_root_used, ll_dev);
2648         vdev->vmdq_rx_q
2649                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2650
2651         if (zero_copy) {
2652                 uint32_t index = vdev->vmdq_rx_q;
2653                 uint32_t count_in_ring, i;
2654                 struct mbuf_table *tx_q;
2655
2656                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2657
2658                 LOG_DEBUG(VHOST_CONFIG,
2659                         "(%"PRIu64") in new_device: mbuf count in mempool "
2660                         "before attach is: %d\n",
2661                         dev->device_fh,
2662                         rte_mempool_count(vpool_array[index].pool));
2663                 LOG_DEBUG(VHOST_CONFIG,
2664                         "(%"PRIu64") in new_device: mbuf count in ring "
2665                         "before attach is: %d\n",
2666                         dev->device_fh, count_in_ring);
2667
2668                 /*
2669                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2670                  */
2671                 for (i = 0; i < count_in_ring; i++)
2672                         attach_rxmbuf_zcp(dev);
2673
2674                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2675                         "mempool after attach is: %d\n",
2676                         dev->device_fh,
2677                         rte_mempool_count(vpool_array[index].pool));
2678                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2679                         "ring after attach is: %d\n",
2680                         dev->device_fh,
2681                         rte_ring_count(vpool_array[index].ring));
2682
2683                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2684                 tx_q->txq_id = vdev->vmdq_rx_q;
2685
2686                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2687                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2688
2689                         LOG_DEBUG(VHOST_CONFIG,
2690                                 "(%"PRIu64") In new_device: Failed to start "
2691                                 "tx queue:%d\n",
2692                                 dev->device_fh, vdev->vmdq_rx_q);
2693
2694                         mbuf_destroy_zcp(vpool);
2695                         rte_free(vdev->regions_hpa);
2696                         rte_free(vdev);
2697                         return -1;
2698                 }
2699
2700                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2701                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2702
2703                         LOG_DEBUG(VHOST_CONFIG,
2704                                 "(%"PRIu64") In new_device: Failed to start "
2705                                 "rx queue:%d\n",
2706                                 dev->device_fh, vdev->vmdq_rx_q);
2707
2708                         /* Stop the TX queue. */
2709                         if (rte_eth_dev_tx_queue_stop(ports[0],
2710                                 vdev->vmdq_rx_q) != 0) {
2711                                 LOG_DEBUG(VHOST_CONFIG,
2712                                         "(%"PRIu64") In new_device: Failed to "
2713                                         "stop tx queue:%d\n",
2714                                         dev->device_fh, vdev->vmdq_rx_q);
2715                         }
2716
2717                         mbuf_destroy_zcp(vpool);
2718                         rte_free(vdev->regions_hpa);
2719                         rte_free(vdev);
2720                         return -1;
2721                 }
2722
2723         }
2724
2725         /* Reset ready flag. */
2726         vdev->ready = DEVICE_MAC_LEARNING;
2727         vdev->remove = 0;
2728
2729         /* Find a suitable lcore to add the device. */
2730         RTE_LCORE_FOREACH_SLAVE(lcore) {
2731                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2732                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2733                         core_add = lcore;
2734                 }
2735         }
2736         /* Add device to lcore ll */
2737         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2738         if (ll_dev == NULL) {
2739                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2740                 vdev->ready = DEVICE_SAFE_REMOVE;
2741                 destroy_device(dev);
2742                 rte_free(vdev->regions_hpa);
2743                 rte_free(vdev);
2744                 return -1;
2745         }
2746         ll_dev->vdev = vdev;
2747         vdev->coreid = core_add;
2748
2749         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2750
2751         /* Initialize device stats */
2752         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2753
2754         /* Disable notifications. */
2755         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2756         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2757         lcore_info[vdev->coreid].lcore_ll->device_num++;
2758         dev->flags |= VIRTIO_DEV_RUNNING;
2759
2760         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2761
2762         return 0;
2763 }
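/*
 * Illustration of the vmdq_rx_q assignment above (the values here are
 * hypothetical): with queues_per_pool = 4 and vmdq_queue_base = 8, the
 * device with device_fh = 2 gets vmdq_rx_q = 2 * 4 + 8 = 16, i.e. the
 * first queue of the third VMDq pool.
 */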
2764
2765 /*
2766  * These callbacks allow devices to be added to the data core when
2767  * configuration has been fully completed.
2768  */
2769 static const struct virtio_net_device_ops virtio_net_device_ops =
2770 {
2771         .new_device =  new_device,
2772         .destroy_device = destroy_device,
2773 };
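/*
 * These ops are handed to the vhost library in main() below via
 * rte_vhost_driver_callback_register(&virtio_net_device_ops); the library
 * then invokes new_device()/destroy_device() as guests come and go.
 */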
2774
2775 /*
2776  * This is a thread that wakes up periodically to print statistics if the
2777  * user has enabled them.
2778  */
2779 static void
2780 print_stats(void)
2781 {
2782         struct virtio_net_data_ll *dev_ll;
2783         uint64_t tx_dropped, rx_dropped;
2784         uint64_t tx, tx_total, rx, rx_total;
2785         uint32_t device_fh;
2786         const char clr[] = { 27, '[', '2', 'J', '\0' };          /* ANSI ESC[2J: clear screen */
2787         const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };  /* ANSI ESC[1;1H: cursor home */
2788
2789         while (1) {
2790                 sleep(enable_stats);
2791
2792                 /* Clear screen and move to top left */
2793                 printf("%s%s", clr, top_left);
2794
2795                 printf("\nDevice statistics ====================================");
2796
2797                 dev_ll = ll_root_used;
2798                 while (dev_ll != NULL) {
2799                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2800                         tx_total = dev_statistics[device_fh].tx_total;
2801                         tx = dev_statistics[device_fh].tx;
2802                         tx_dropped = tx_total - tx;
2803                         if (zero_copy == 0) {
2804                                 rx_total = rte_atomic64_read(
2805                                         &dev_statistics[device_fh].rx_total_atomic);
2806                                 rx = rte_atomic64_read(
2807                                         &dev_statistics[device_fh].rx_atomic);
2808                         } else {
2809                                 rx_total = dev_statistics[device_fh].rx_total;
2810                                 rx = dev_statistics[device_fh].rx;
2811                         }
2812                         rx_dropped = rx_total - rx;
2813
2814                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2815                                         "\nTX total:            %"PRIu64""
2816                                         "\nTX dropped:          %"PRIu64""
2817                                         "\nTX successful:               %"PRIu64""
2818                                         "\nRX total:            %"PRIu64""
2819                                         "\nRX dropped:          %"PRIu64""
2820                                         "\nRX successful:               %"PRIu64"",
2821                                         device_fh,
2822                                         tx_total,
2823                                         tx_dropped,
2824                                         tx,
2825                                         rx_total,
2826                                         rx_dropped,
2827                                         rx);
2828
2829                         dev_ll = dev_ll->next;
2830                 }
2831                 printf("\n======================================================\n");
2832         }
2833 }
2834
2835 static void
2836 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2837         char *ring_name, uint32_t nb_mbuf)
2838 {
2839         vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2840                 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2841         if (vpool_array[index].pool != NULL) {
2842                 vpool_array[index].ring
2843                         = rte_ring_create(ring_name,
2844                                 rte_align32pow2(nb_mbuf + 1),
2845                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2846                 if (likely(vpool_array[index].ring != NULL)) {
2847                         LOG_DEBUG(VHOST_CONFIG,
2848                                 "in setup_mempool_tbl: mbuf count in "
2849                                 "mempool is: %d\n",
2850                                 rte_mempool_count(vpool_array[index].pool));
2851                         LOG_DEBUG(VHOST_CONFIG,
2852                                 "in setup_mempool_tbl: mbuf count in "
2853                                 "ring   is: %d\n",
2854                                 rte_ring_count(vpool_array[index].ring));
2855                 } else {
2856                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2857                                 ring_name);
2858                 }
2859
2860                 /* Need to take headroom into account. */
2861                 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2862         } else {
2863                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2864         }
2865 }
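/*
 * In the zero-copy path, main() below calls this helper twice per queue:
 * for the RX pools at indices [0, MAX_QUEUES) and for the TX pools at
 * indices [MAX_QUEUES, 2 * MAX_QUEUES), which is why the TX setup passes
 * (queue_id + MAX_QUEUES) as the vpool_array index.
 */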
2866
2867 /* When we receive an INT signal, unregister the vhost driver. */
2868 static void
2869 sigint_handler(__rte_unused int signum)
2870 {
2871         /* Unregister vhost driver. */
2872         int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2873         if (ret != 0)
2874                 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2875         exit(0);
2876 }
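/*
 * Note: strictly speaking, rte_vhost_driver_unregister() and exit() are not
 * async-signal-safe; the example accepts this trade-off to keep the shutdown
 * path simple.
 */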
2877
2878 /*
2879  * Main function: performs initialisation and calls the per-lcore functions.
2880  * The CUSE device is also registered here to handle the IOCTLs.
2881  */
2882 int
2883 main(int argc, char *argv[])
2884 {
2885         struct rte_mempool *mbuf_pool = NULL;
2886         unsigned lcore_id, core_id = 0;
2887         unsigned nb_ports, valid_num_ports;
2888         int ret;
2889         uint8_t portid;
2890         uint16_t queue_id;
2891         static pthread_t tid;
2892         char thread_name[RTE_MAX_THREAD_NAME_LEN];
2893
2894         signal(SIGINT, sigint_handler);
2895
2896         /* init EAL */
2897         ret = rte_eal_init(argc, argv);
2898         if (ret < 0)
2899                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2900         argc -= ret;
2901         argv += ret;
2902
2903         /* parse app arguments */
2904         ret = us_vhost_parse_args(argc, argv);
2905         if (ret < 0)
2906                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2907
2908         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2909                 if (rte_lcore_is_enabled(lcore_id))
2910                         lcore_ids[core_id++] = lcore_id;
2911
2912         if (rte_lcore_count() > RTE_MAX_LCORE)
2913                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2914
2915         /* Set the number of switching cores available. */
2916         num_switching_cores = rte_lcore_count() - 1;
2917
2918         /* Get the number of physical ports. */
2919         nb_ports = rte_eth_dev_count();
2920         if (nb_ports > RTE_MAX_ETHPORTS)
2921                 nb_ports = RTE_MAX_ETHPORTS;
2922
2923         /*
2924          * Update the global var NUM_PORTS and global array PORTS, and get
2925          * the value of VALID_NUM_PORTS according to the number of system ports.
2926          */
2927         valid_num_ports = check_ports_num(nb_ports);
2928
2929         if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2930                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2931                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2932                 return -1;
2933         }
2934
2935         if (zero_copy == 0) {
2936                 /* Create the mbuf pool. */
2937                 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
2938                         NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
2939                         0, MBUF_DATA_SIZE, rte_socket_id());
2940                 if (mbuf_pool == NULL)
2941                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2942
2943                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2944                         vpool_array[queue_id].pool = mbuf_pool;
2945
2946                 if (vm2vm_mode == VM2VM_HARDWARE) {
2947                         /* Enable VT loop back to let L2 switch to do it. */
2948                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2949                         LOG_DEBUG(VHOST_CONFIG,
2950                                 "Enable loop back for L2 switch in vmdq.\n");
2951                 }
2952         } else {
2953                 uint32_t nb_mbuf;
2954                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2955                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2956
2957                 nb_mbuf = num_rx_descriptor
2958                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2959                         + num_switching_cores * MAX_PKT_BURST;
2960
2961                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2962                         snprintf(pool_name, sizeof(pool_name),
2963                                 "rxmbuf_pool_%u", queue_id);
2964                         snprintf(ring_name, sizeof(ring_name),
2965                                 "rxmbuf_ring_%u", queue_id);
2966                         setup_mempool_tbl(rte_socket_id(), queue_id,
2967                                 pool_name, ring_name, nb_mbuf);
2968                 }
2969
2970                 nb_mbuf = num_tx_descriptor
2971                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2972                                 + num_switching_cores * MAX_PKT_BURST;
2973
2974                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2975                         snprintf(pool_name, sizeof(pool_name),
2976                                 "txmbuf_pool_%u", queue_id);
2977                         snprintf(ring_name, sizeof(ring_name),
2978                                 "txmbuf_ring_%u", queue_id);
2979                         setup_mempool_tbl(rte_socket_id(),
2980                                 (queue_id + MAX_QUEUES),
2981                                 pool_name, ring_name, nb_mbuf);
2982                 }
2983
2984                 if (vm2vm_mode == VM2VM_HARDWARE) {
2985                         /* Enable VT loop back to let L2 switch to do it. */
2986                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2987                         LOG_DEBUG(VHOST_CONFIG,
2988                                 "Enable loop back for L2 switch in vmdq.\n");
2989                 }
2990         }
2991         /* Set log level. */
2992         rte_set_log_level(LOG_LEVEL);
2993
2994         /* initialize all ports */
2995         for (portid = 0; portid < nb_ports; portid++) {
2996                 /* skip ports that are not enabled */
2997                 if ((enabled_port_mask & (1 << portid)) == 0) {
2998                         RTE_LOG(INFO, VHOST_PORT,
2999                                 "Skipping disabled port %d\n", portid);
3000                         continue;
3001                 }
3002                 if (port_init(portid) != 0)
3003                         rte_exit(EXIT_FAILURE,
3004                                 "Cannot initialize network ports\n");
3005         }
3006
3007         /* Initialise all linked lists. */
3008         if (init_data_ll() == -1)
3009                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3010
3011         /* Initialize device stats */
3012         memset(&dev_statistics, 0, sizeof(dev_statistics));
3013
3014         /* Enable stats if the user option is set. */
3015         if (enable_stats) {
3016                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3017                 if (ret != 0)
3018                         rte_exit(EXIT_FAILURE,
3019                                 "Cannot create print-stats thread\n");
3020
3021                 /* Set the thread name to aid debugging. */
3022                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3023                 ret = rte_thread_setname(tid, thread_name);
3024                 if (ret != 0)
3025                         RTE_LOG(ERR, VHOST_CONFIG,
3026                                 "Cannot set print-stats name\n");
3027         }
3028
3029         /* Launch all data cores. */
3030         if (zero_copy == 0) {
3031                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3032                         rte_eal_remote_launch(switch_worker,
3033                                 mbuf_pool, lcore_id);
3034                 }
3035         } else {
3036                 uint32_t count_in_mempool, index, i;
3037                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3038                         /* For all RX and TX queues. */
3039                         count_in_mempool
3040                                 = rte_mempool_count(vpool_array[index].pool);
3041
3042                         /*
3043                          * Transfer all unattached mbufs from vpool.pool
3044                          * to vpool.ring.
3045                          */
3046                         for (i = 0; i < count_in_mempool; i++) {
3047                                 struct rte_mbuf *mbuf
3048                                         = __rte_mbuf_raw_alloc(
3049                                                 vpool_array[index].pool);
3050                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3051                                                 (void *)mbuf);
3052                         }
3053
3054                         LOG_DEBUG(VHOST_CONFIG,
3055                                 "in main: mbuf count in mempool at initial "
3056                                 "is: %d\n", count_in_mempool);
3057                         LOG_DEBUG(VHOST_CONFIG,
3058                                 "in main: mbuf count in ring at initial "
3059                                 "is: %d\n",
3060                                 rte_ring_count(vpool_array[index].ring));
3061                 }
3062
3063                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3064                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3065                                 lcore_id);
3066         }
3067
3068         if (mergeable == 0)
3069                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3070
3071         /* Register vhost(cuse or user) driver to handle vhost messages. */
3072         ret = rte_vhost_driver_register((char *)&dev_basename);
3073         if (ret != 0)
3074                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3075
3076         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3077
3078         /* Start CUSE session. */
3079         rte_vhost_driver_session_start();
3080         return 0;
3082 }