examples/vhost: support new VMDQ API for i40e
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53
54 #include "main.h"
55
56 #define MAX_QUEUES 256
57
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
60
61 /*
62  * Calculate the number of buffers needed per port
63  */
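/* (RX descriptors for every queue, plus per-core burst, TX-staging and mempool-cache mbufs.) */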
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +             \
65                                                         (num_switching_cores*MAX_PKT_BURST) +                   \
66                                                         (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67                                                         (num_switching_cores*MBUF_CACHE_SIZE))
68
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
71
72 /*
73  * No frame data buffers allocated from the host are required for the zero
74  * copy implementation; the guest allocates the frame data buffers and vhost
75  * uses them directly.
76  */
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79         + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
81
82 /*
83  * RX and TX Prefetch, Host, and Write-back threshold values should be
84  * carefully set for optimal performance. Consult the network
85  * controller's datasheet and supporting DPDK documentation for guidance
86  * on how these parameters should be set.
87  */
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
91
92 /*
93  * These default values are optimized for use with the Intel(R) 82599 10 GbE
94  * Controller and the DPDK ixgbe PMD. Consider using other values for other
95  * network controllers and/or network drivers.
96  */
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */
100
101 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
103
104 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
106
107 #define JUMBO_FRAME_MAX_SIZE    0x2600
108
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
111 #define DEVICE_RX                       1
112 #define DEVICE_SAFE_REMOVE      2
113
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
117
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
121
122 /*
123  * These two macros need refining for the legacy and DPDK-based front ends:
124  * take the max vring avail descriptors/entries from the guest minus
125  * MAX_PKT_BURST, then round to a power of 2.
126  */
127 /*
128  * The legacy front end has 128 descriptors:
129  * half for virtio headers, the other half for mbufs.
130  */
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
133
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136                 + sizeof(struct rte_mbuf)))
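/* The zero-copy path stashes the vring descriptor index here so it can be recovered later. */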
137
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
140
141 #define INVALID_PORT_ID 0xFF
142
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
145
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
148
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
151
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
154
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
157
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
160
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
163
164 /* Promiscuous mode */
165 static uint32_t promiscuous;
166
167 /*Number of switching cores enabled*/
168 static uint32_t num_switching_cores = 0;
169
170 /* number of devices/queues to support*/
171 static uint32_t num_queues = 0;
172 static uint32_t num_devices;
173
174 /*
175  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
176  * descriptors. Disabled by default.
177  */
178 static uint32_t zero_copy;
179 static int mergeable;
180
181 /* number of RX/TX descriptors to use */
182 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
183 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
184
185 /* max number of ring descriptors; ixgbe, i40e and e1000 all support 4096. */
186 #define MAX_RING_DESC 4096
187
188 struct vpool {
189         struct rte_mempool *pool;
190         struct rte_ring *ring;
191         uint32_t buf_size;
192 } vpool_array[MAX_QUEUES+MAX_QUEUES];
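/* Sized for one pool/ring pair per RX queue and one per TX queue (2 * MAX_QUEUES); when zero copy is disabled the entries point at the shared mbuf pool instead. */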
193
194 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
195 typedef enum {
196         VM2VM_DISABLED = 0,
197         VM2VM_SOFTWARE = 1,
198         VM2VM_HARDWARE = 2,
199         VM2VM_LAST
200 } vm2vm_type;
201 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
202
203 /* The type of host physical address translated from guest physical address. */
204 typedef enum {
205         PHYS_ADDR_CONTINUOUS = 0,
206         PHYS_ADDR_CROSS_SUBREG = 1,
207         PHYS_ADDR_INVALID = 2,
208         PHYS_ADDR_LAST
209 } hpa_type;
210
211 /* Enable stats. */
212 static uint32_t enable_stats = 0;
213 /* Enable retries on RX. */
214 static uint32_t enable_retry = 1;
216 /* Specify timeout (in microseconds) between retries on RX. */
216 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
217 /* Specify the number of retries on RX. */
218 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
219
220 /* Character device basename. Can be set by user. */
221 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
222
223
224 /* Default configuration for rx and tx thresholds etc. */
225 static struct rte_eth_rxconf rx_conf_default = {
226         .rx_thresh = {
227                 .pthresh = RX_PTHRESH,
228                 .hthresh = RX_HTHRESH,
229                 .wthresh = RX_WTHRESH,
230         },
231         .rx_drop_en = 1,
232 };
233
234 /*
235  * These default values are optimized for use with the Intel(R) 82599 10 GbE
236  * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
237  * network controllers and/or network drivers.
238  */
239 static struct rte_eth_txconf tx_conf_default = {
240         .tx_thresh = {
241                 .pthresh = TX_PTHRESH,
242                 .hthresh = TX_HTHRESH,
243                 .wthresh = TX_WTHRESH,
244         },
245         .tx_free_thresh = 0, /* Use PMD default values */
246         .tx_rs_thresh = 0, /* Use PMD default values */
247 };
248
249 /* Empty VMDQ configuration structure. Filled in programmatically. */
250 static struct rte_eth_conf vmdq_conf_default = {
251         .rxmode = {
252                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
253                 .split_hdr_size = 0,
254                 .header_split   = 0, /**< Header Split disabled */
255                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
256                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
257                 /*
258                  * VLAN strip is necessary for 1G NICs such as I350;
259                  * it fixes a bug where IPv4 forwarding in the guest cannot
260                  * forward packets from one virtio device to another.
261                  */
262                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
263                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
264                 .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
265         },
266
267         .txmode = {
268                 .mq_mode = ETH_MQ_TX_NONE,
269         },
270         .rx_adv_conf = {
271                 /*
272                  * should be overridden separately in code with
273                  * appropriate values
274                  */
275                 .vmdq_rx_conf = {
276                         .nb_queue_pools = ETH_8_POOLS,
277                         .enable_default_pool = 0,
278                         .default_pool = 0,
279                         .nb_pool_maps = 0,
280                         .pool_map = {{0, 0},},
281                 },
282         },
283 };
284
285 static unsigned lcore_ids[RTE_MAX_LCORE];
286 static uint8_t ports[RTE_MAX_ETHPORTS];
287 static unsigned num_ports = 0; /**< The number of ports specified in command line */
288 static uint16_t num_pf_queues, num_vmdq_queues;
289 static uint16_t vmdq_pool_base, vmdq_queue_base;
290 static uint16_t queues_per_pool;
291
292 static const uint16_t external_pkt_default_vlan_tag = 2000;
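/* One VLAN tag per VMDQ pool; a device's device_fh indexes this table (see link_vmdq()). */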
293 const uint16_t vlan_tags[] = {
294         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
295         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
296         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
297         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
298         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
299         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
300         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
301         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
302 };
303
304 /* ethernet addresses of ports */
305 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
306
307 /* heads for the main used and free linked lists for the data path. */
308 static struct virtio_net_data_ll *ll_root_used = NULL;
309 static struct virtio_net_data_ll *ll_root_free = NULL;
310
311 /* Array of data core structures containing information on individual core linked lists. */
312 static struct lcore_info lcore_info[RTE_MAX_LCORE];
313
314 /* Used for queueing bursts of TX packets. */
315 struct mbuf_table {
316         unsigned len;
317         unsigned txq_id;
318         struct rte_mbuf *m_table[MAX_PKT_BURST];
319 };
320
321 /* TX queue for each data core. */
322 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
323
324 /* TX queue for each virtio device for zero copy. */
325 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
326
327 /* Vlan header struct used to insert vlan tags on TX. */
328 struct vlan_ethhdr {
329         unsigned char   h_dest[ETH_ALEN];
330         unsigned char   h_source[ETH_ALEN];
331         __be16          h_vlan_proto;
332         __be16          h_vlan_TCI;
333         __be16          h_vlan_encapsulated_proto;
334 };
335
336 /* IPv4 Header */
337 struct ipv4_hdr {
338         uint8_t  version_ihl;           /**< version and header length */
339         uint8_t  type_of_service;       /**< type of service */
340         uint16_t total_length;          /**< length of packet */
341         uint16_t packet_id;             /**< packet ID */
342         uint16_t fragment_offset;       /**< fragmentation offset */
343         uint8_t  time_to_live;          /**< time to live */
344         uint8_t  next_proto_id;         /**< protocol ID */
345         uint16_t hdr_checksum;          /**< header checksum */
346         uint32_t src_addr;              /**< source address */
347         uint32_t dst_addr;              /**< destination address */
348 } __attribute__((__packed__));
349
350 /* Header lengths. */
351 #define VLAN_HLEN       4
352 #define VLAN_ETH_HLEN   18
353
354 /* Per-device statistics struct */
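/* The RX counters are atomic because a device's RX side may be updated by other cores during VM2VM switching. */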
355 struct device_statistics {
356         uint64_t tx_total;
357         rte_atomic64_t rx_total_atomic;
358         uint64_t rx_total;
359         uint64_t tx;
360         rte_atomic64_t rx_atomic;
361         uint64_t rx;
362 } __rte_cache_aligned;
363 struct device_statistics dev_statistics[MAX_DEVICES];
364
365 /*
366  * Builds up the correct configuration for VMDQ VLAN pool map
367  * according to the pool & queue limits.
368  */
369 static inline int
370 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
371 {
372         struct rte_eth_vmdq_rx_conf conf;
373         struct rte_eth_vmdq_rx_conf *def_conf =
374                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
375         unsigned i;
376
377         memset(&conf, 0, sizeof(conf));
378         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
379         conf.nb_pool_maps = num_devices;
380         conf.enable_loop_back = def_conf->enable_loop_back;
381         conf.rx_mode = def_conf->rx_mode;
382
383         for (i = 0; i < conf.nb_pool_maps; i++) {
384                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
385                 conf.pool_map[i].pools = (1UL << i);
386         }
387
388         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
389         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
390                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
391         return 0;
392 }
393
394 /*
395  * Validate the device number against the max pool number obtained from
396  * dev_info. If the device number is invalid, log an error and
397  * return -1. Each device must have its own pool.
398  */
399 static inline int
400 validate_num_devices(uint32_t max_nb_devices)
401 {
402         if (num_devices > max_nb_devices) {
403                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
404                 return -1;
405         }
406         return 0;
407 }
408
409 /*
410  * Initialises a given port using global settings, with the RX buffers
411  * coming from the per-queue vpool_array mempools.
412  */
413 static inline int
414 port_init(uint8_t port)
415 {
416         struct rte_eth_dev_info dev_info;
417         struct rte_eth_conf port_conf;
418         uint16_t rx_rings, tx_rings;
419         uint16_t rx_ring_size, tx_ring_size;
420         int retval;
421         uint16_t q;
422
423         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
424         rte_eth_dev_info_get (port, &dev_info);
425
426         /*configure the number of supported virtio devices based on VMDQ limits */
427         num_devices = dev_info.max_vmdq_pools;
428
429         if (zero_copy) {
430                 rx_ring_size = num_rx_descriptor;
431                 tx_ring_size = num_tx_descriptor;
432                 tx_rings = dev_info.max_tx_queues;
433         } else {
434                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
435                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
436                 tx_rings = (uint16_t)rte_lcore_count();
437         }
438
439         retval = validate_num_devices(MAX_DEVICES);
440         if (retval < 0)
441                 return retval;
442
443         /* Get port configuration. */
444         retval = get_eth_conf(&port_conf, num_devices);
445         if (retval < 0)
446                 return retval;
447         /* NIC queues are divided into PF queues and VMDQ queues. */
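        /*
         * With the new VMDQ API the VMDQ region need not start at queue/pool 0:
         * dev_info reports its base offsets (on i40e, for example, the PF queues
         * come first), so vmdq_queue_base and vmdq_pool_base record where the
         * VMDQ queues and pools actually begin.
         */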
448         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
449         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
450         num_vmdq_queues = num_devices * queues_per_pool;
451         num_queues = num_pf_queues + num_vmdq_queues;
452         vmdq_queue_base = dev_info.vmdq_queue_base;
453         vmdq_pool_base  = dev_info.vmdq_pool_base;
454         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
455                 num_pf_queues, num_devices, queues_per_pool);
456
457         if (port >= rte_eth_dev_count()) return -1;
458
459         rx_rings = (uint16_t)dev_info.max_rx_queues;
460         /* Configure ethernet device. */
461         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
462         if (retval != 0)
463                 return retval;
464
465         /* Setup the queues. */
466         for (q = 0; q < rx_rings; q ++) {
467                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
468                                                 rte_eth_dev_socket_id(port), &rx_conf_default,
469                                                 vpool_array[q].pool);
470                 if (retval < 0)
471                         return retval;
472         }
473         for (q = 0; q < tx_rings; q ++) {
474                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
475                                                 rte_eth_dev_socket_id(port), &tx_conf_default);
476                 if (retval < 0)
477                         return retval;
478         }
479
480         /* Start the device. */
481         retval  = rte_eth_dev_start(port);
482         if (retval < 0) {
483                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
484                 return retval;
485         }
486
487         if (promiscuous)
488                 rte_eth_promiscuous_enable(port);
489
490         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
491         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
492         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
493                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
494                         (unsigned)port,
495                         vmdq_ports_eth_addr[port].addr_bytes[0],
496                         vmdq_ports_eth_addr[port].addr_bytes[1],
497                         vmdq_ports_eth_addr[port].addr_bytes[2],
498                         vmdq_ports_eth_addr[port].addr_bytes[3],
499                         vmdq_ports_eth_addr[port].addr_bytes[4],
500                         vmdq_ports_eth_addr[port].addr_bytes[5]);
501
502         return 0;
503 }
504
505 /*
506  * Set character device basename.
507  */
508 static int
509 us_vhost_parse_basename(const char *q_arg)
510 {
511         /* basename must fit in dev_basename, including the NUL terminator */
512
513         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
514                 return -1;
515         else
516                 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
517
518         return 0;
519 }
520
521 /*
522  * Parse the portmask provided at run time.
523  */
524 static int
525 parse_portmask(const char *portmask)
526 {
527         char *end = NULL;
528         unsigned long pm;
529
530         errno = 0;
531
532         /* parse hexadecimal string */
533         pm = strtoul(portmask, &end, 16);
534         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
535                 return -1;
536
537         if (pm == 0)
538                 return -1;
539
540         return pm;
541
542 }
543
544 /*
545  * Parse num options at run time.
546  */
547 static int
548 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
549 {
550         char *end = NULL;
551         unsigned long num;
552
553         errno = 0;
554
555         /* parse unsigned int string */
556         num = strtoul(q_arg, &end, 10);
557         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
558                 return -1;
559
560         if (num > max_valid_value)
561                 return -1;
562
563         return num;
564
565 }
566
567 /*
568  * Display usage
569  */
570 static void
571 us_vhost_usage(const char *prgname)
572 {
573         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
574         "               --vm2vm [0|1|2]\n"
575         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
576         "               --dev-basename <name>\n"
577         "               --nb-devices ND\n"
578         "               -p PORTMASK: Set mask for ports to be used by application\n"
579         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
580         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
581         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
582         "               --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if RX retries are enabled\n"
583         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
584         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
585         "               --dev-basename: The basename to be used for the character device.\n"
586         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
587                         "zero copy\n"
588         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
589                         "used only when zero copy is enabled.\n"
590         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
591                         "used only when zero copy is enabled.\n",
592                prgname);
593 }
594
595 /*
596  * Parse the arguments given in the command line of the application.
597  */
598 static int
599 us_vhost_parse_args(int argc, char **argv)
600 {
601         int opt, ret;
602         int option_index;
603         unsigned i;
604         const char *prgname = argv[0];
605         static struct option long_option[] = {
606                 {"vm2vm", required_argument, NULL, 0},
607                 {"rx-retry", required_argument, NULL, 0},
608                 {"rx-retry-delay", required_argument, NULL, 0},
609                 {"rx-retry-num", required_argument, NULL, 0},
610                 {"mergeable", required_argument, NULL, 0},
611                 {"stats", required_argument, NULL, 0},
612                 {"dev-basename", required_argument, NULL, 0},
613                 {"zero-copy", required_argument, NULL, 0},
614                 {"rx-desc-num", required_argument, NULL, 0},
615                 {"tx-desc-num", required_argument, NULL, 0},
616                 {NULL, 0, 0, 0},
617         };
618
619         /* Parse command line */
620         while ((opt = getopt_long(argc, argv, "p:P",
621                         long_option, &option_index)) != EOF) {
622                 switch (opt) {
623                 /* Portmask */
624                 case 'p':
625                         enabled_port_mask = parse_portmask(optarg);
626                         if (enabled_port_mask == 0) {
627                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
628                                 us_vhost_usage(prgname);
629                                 return -1;
630                         }
631                         break;
632
633                 case 'P':
634                         promiscuous = 1;
635                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
636                                 ETH_VMDQ_ACCEPT_BROADCAST |
637                                 ETH_VMDQ_ACCEPT_MULTICAST;
638                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
639
640                         break;
641
642                 case 0:
643                         /* Enable/disable vm2vm comms. */
644                         if (!strncmp(long_option[option_index].name, "vm2vm",
645                                 MAX_LONG_OPT_SZ)) {
646                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
647                                 if (ret == -1) {
648                                         RTE_LOG(INFO, VHOST_CONFIG,
649                                                 "Invalid argument for "
650                                                 "vm2vm [0|1|2]\n");
651                                         us_vhost_usage(prgname);
652                                         return -1;
653                                 } else {
654                                         vm2vm_mode = (vm2vm_type)ret;
655                                 }
656                         }
657
658                         /* Enable/disable retries on RX. */
659                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
660                                 ret = parse_num_opt(optarg, 1);
661                                 if (ret == -1) {
662                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
663                                         us_vhost_usage(prgname);
664                                         return -1;
665                                 } else {
666                                         enable_retry = ret;
667                                 }
668                         }
669
670                         /* Specify the retry delay time (in microseconds) on RX. */
671                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
672                                 ret = parse_num_opt(optarg, INT32_MAX);
673                                 if (ret == -1) {
674                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
675                                         us_vhost_usage(prgname);
676                                         return -1;
677                                 } else {
678                                         burst_rx_delay_time = ret;
679                                 }
680                         }
681
682                         /* Specify the number of retries on RX. */
683                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
684                                 ret = parse_num_opt(optarg, INT32_MAX);
685                                 if (ret == -1) {
686                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
687                                         us_vhost_usage(prgname);
688                                         return -1;
689                                 } else {
690                                         burst_rx_retry_num = ret;
691                                 }
692                         }
693
694                         /* Enable/disable RX mergeable buffers. */
695                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
696                                 ret = parse_num_opt(optarg, 1);
697                                 if (ret == -1) {
698                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
699                                         us_vhost_usage(prgname);
700                                         return -1;
701                                 } else {
702                                         mergeable = !!ret;
703                                         if (ret) {
704                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
705                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
706                                                         = JUMBO_FRAME_MAX_SIZE;
707                                         }
708                                 }
709                         }
710
711                         /* Enable/disable stats. */
712                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
713                                 ret = parse_num_opt(optarg, INT32_MAX);
714                                 if (ret == -1) {
715                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
716                                         us_vhost_usage(prgname);
717                                         return -1;
718                                 } else {
719                                         enable_stats = ret;
720                                 }
721                         }
722
723                         /* Set character device basename. */
724                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
725                                 if (us_vhost_parse_basename(optarg) == -1) {
726                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
727                                         us_vhost_usage(prgname);
728                                         return -1;
729                                 }
730                         }
731
732                         /* Enable/disable rx/tx zero copy. */
733                         if (!strncmp(long_option[option_index].name,
734                                 "zero-copy", MAX_LONG_OPT_SZ)) {
735                                 ret = parse_num_opt(optarg, 1);
736                                 if (ret == -1) {
737                                         RTE_LOG(INFO, VHOST_CONFIG,
738                                                 "Invalid argument"
739                                                 " for zero-copy [0|1]\n");
740                                         us_vhost_usage(prgname);
741                                         return -1;
742                                 } else
743                                         zero_copy = ret;
744
745                                 if (zero_copy) {
746 #ifdef RTE_MBUF_REFCNT
747                                         RTE_LOG(ERR, VHOST_CONFIG, "Before running "
748                                         "zero copy vhost APP, please "
749                                         "disable RTE_MBUF_REFCNT\n"
750                                         "in config file and then rebuild DPDK "
751                                         "core lib!\n"
752                                         "Otherwise please disable zero copy "
753                                         "flag in command line!\n");
754                                         return -1;
755 #endif
756                                 }
757                         }
758
759                         /* Specify the descriptor number on RX. */
760                         if (!strncmp(long_option[option_index].name,
761                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
762                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
763                                 if ((ret == -1) || (!POWEROF2(ret))) {
764                                         RTE_LOG(INFO, VHOST_CONFIG,
765                                         "Invalid argument for rx-desc-num[0-N],"
766                                         "power of 2 required.\n");
767                                         us_vhost_usage(prgname);
768                                         return -1;
769                                 } else {
770                                         num_rx_descriptor = ret;
771                                 }
772                         }
773
774                         /* Specify the descriptor number on TX. */
775                         if (!strncmp(long_option[option_index].name,
776                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
777                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
778                                 if ((ret == -1) || (!POWEROF2(ret))) {
779                                         RTE_LOG(INFO, VHOST_CONFIG,
780                                         "Invalid argument for tx-desc-num [0-N],"
781                                         "power of 2 required.\n");
782                                         us_vhost_usage(prgname);
783                                         return -1;
784                                 } else {
785                                         num_tx_descriptor = ret;
786                                 }
787                         }
788
789                         break;
790
791                         /* Invalid option - print options. */
792                 default:
793                         us_vhost_usage(prgname);
794                         return -1;
795                 }
796         }
797
798         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
799                 if (enabled_port_mask & (1 << i))
800                         ports[num_ports++] = (uint8_t)i;
801         }
802
803         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
804                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
805                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
806                 return -1;
807         }
808
809         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
810                 RTE_LOG(INFO, VHOST_PORT,
811                         "Vhost zero copy doesn't support software vm2vm, "
812                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
813                 return -1;
814         }
815
816         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
817                 RTE_LOG(INFO, VHOST_PORT,
818                         "Vhost zero copy doesn't support jumbo frame, "
819                         "please specify '--mergeable 0' to disable the "
820                         "mergeable feature.\n");
821                 return -1;
822         }
823
824         return 0;
825 }
826
827 /*
828  * Update the global variable num_ports and the ports array according to the
829  * number of ports in the system, and return the number of valid ports.
830  */
831 static unsigned check_ports_num(unsigned nb_ports)
832 {
833         unsigned valid_num_ports = num_ports;
834         unsigned portid;
835
836         if (num_ports > nb_ports) {
837                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
838                         num_ports, nb_ports);
839                 num_ports = nb_ports;
840         }
841
842         for (portid = 0; portid < num_ports; portid ++) {
843                 if (ports[portid] >= nb_ports) {
844                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
845                                 ports[portid], (nb_ports - 1));
846                         ports[portid] = INVALID_PORT_ID;
847                         valid_num_ports--;
848                 }
849         }
850         return valid_num_ports;
851 }
852
853 /*
854  * Macro to print out packet contents. Wrapped in debug define so that the
855  * data path is not affected when debug is disabled.
856  */
857 #ifdef DEBUG
858 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
859         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
860         unsigned int index;                                                                                                                                                                                             \
861         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
862                                                                                                                                                                                                                                         \
863         if ((header))                                                                                                                                                                                                   \
864                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
865         else                                                                                                                                                                                                                    \
866                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
867         for (index = 0; index < (size); index++) {                                                                                                                                              \
868                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
869                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
870         }                                                                                                                                                                                                                               \
871         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
872                                                                                                                                                                                                                                         \
873         LOG_DEBUG(VHOST_DATA, "%s", packet);                                                                                                                                                                    \
874 } while(0)
875 #else
876 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
877 #endif
878
879 /*
880  * Function to convert guest physical addresses to vhost physical addresses.
881  * This is used to convert virtio buffer addresses.
882  */
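/* addr_type reports whether the translated buffer is physically contiguous, crosses a sub-region boundary, or could not be translated at all. */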
883 static inline uint64_t __attribute__((always_inline))
884 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
885         uint32_t buf_len, hpa_type *addr_type)
886 {
887         struct virtio_memory_regions_hpa *region;
888         uint32_t regionidx;
889         uint64_t vhost_pa = 0;
890
891         *addr_type = PHYS_ADDR_INVALID;
892
893         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
894                 region = &vdev->regions_hpa[regionidx];
895                 if ((guest_pa >= region->guest_phys_address) &&
896                         (guest_pa <= region->guest_phys_address_end)) {
897                         vhost_pa = region->host_phys_addr_offset + guest_pa;
898                         if (likely((guest_pa + buf_len - 1)
899                                 <= region->guest_phys_address_end))
900                                 *addr_type = PHYS_ADDR_CONTINUOUS;
901                         else
902                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
903                         break;
904                 }
905         }
906
907         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
908                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
909                 (void *)(uintptr_t)vhost_pa);
910
911         return vhost_pa;
912 }
913
914 /*
915  * Compares a packet destination MAC address to a device MAC address.
916  */
917 static inline int __attribute__((always_inline))
918 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
919 {
920         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
921 }
922
923 /*
924  * This function learns the MAC address of the device and registers it, along with a
925  * VLAN tag, in a VMDQ pool.
926  */
927 static int
928 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
929 {
930         struct ether_hdr *pkt_hdr;
931         struct virtio_net_data_ll *dev_ll;
932         struct virtio_net *dev = vdev->dev;
933         int i, ret;
934
935         /* Learn MAC address of guest device from packet */
936         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
937
938         dev_ll = ll_root_used;
939
940         while (dev_ll != NULL) {
941                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
942                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
943                         return -1;
944                 }
945                 dev_ll = dev_ll->next;
946         }
947
948         for (i = 0; i < ETHER_ADDR_LEN; i++)
949                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
950
951         /* The VLAN tag is currently selected by the device_fh. */
952         vdev->vlan_tag = vlan_tags[dev->device_fh];
953
954         /* Print out VMDQ registration info. */
955         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
956                 dev->device_fh,
957                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
958                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
959                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
960                 vdev->vlan_tag);
961
962         /* Register the MAC address. */
963         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
964                                 (uint32_t)dev->device_fh + vmdq_pool_base);
965         if (ret)
966                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
967                                         dev->device_fh);
968
969         /* Enable stripping of the vlan tag as we handle routing. */
970         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
971
972         /* Set device as ready for RX. */
973         vdev->ready = DEVICE_RX;
974
975         return 0;
976 }
977
978 /*
979  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
980  * queue before disabling RX on the device.
981  */
982 static inline void
983 unlink_vmdq(struct vhost_dev *vdev)
984 {
985         unsigned i = 0;
986         unsigned rx_count;
987         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
988
989         if (vdev->ready == DEVICE_RX) {
990                 /*clear MAC and VLAN settings*/
991                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
992                 for (i = 0; i < 6; i++)
993                         vdev->mac_address.addr_bytes[i] = 0;
994
995                 vdev->vlan_tag = 0;
996
997                 /*Clear out the receive buffers*/
998                 rx_count = rte_eth_rx_burst(ports[0],
999                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1000
1001                 while (rx_count) {
1002                         for (i = 0; i < rx_count; i++)
1003                                 rte_pktmbuf_free(pkts_burst[i]);
1004
1005                         rx_count = rte_eth_rx_burst(ports[0],
1006                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1007                 }
1008
1009                 vdev->ready = DEVICE_MAC_LEARNING;
1010         }
1011 }
1012
1013 /*
1014  * Check if the packet destination MAC address is for a local device. If so then put
1015  * the packet on that device's RX queue. If not then return.
1016  */
1017 static inline int __attribute__((always_inline))
1018 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1019 {
1020         struct virtio_net_data_ll *dev_ll;
1021         struct ether_hdr *pkt_hdr;
1022         uint64_t ret = 0;
1023         struct virtio_net *dev = vdev->dev;
1024         struct virtio_net *tdev; /* destination virtio device */
1025
1026         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1027
1028         /*get the used devices list*/
1029         dev_ll = ll_root_used;
1030
1031         while (dev_ll != NULL) {
1032                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1033                                           &dev_ll->vdev->mac_address)) {
1034
1035                         /* Drop the packet if the TX packet is destined for the TX device. */
1036                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1037                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1038                                                         dev->device_fh);
1039                                 return 0;
1040                         }
1041                         tdev = dev_ll->vdev->dev;
1042
1043
1044                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1045
1046                         if (unlikely(dev_ll->vdev->remove)) {
1047                                 /*drop the packet if the device is marked for removal*/
1048                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1049                         } else {
1050                                 /*send the packet to the local virtio device*/
1051                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1052                                 if (enable_stats) {
1053                                         rte_atomic64_add(
1054                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1055                                         1);
1056                                         rte_atomic64_add(
1057                                         &dev_statistics[tdev->device_fh].rx_atomic,
1058                                         ret);
1059                                         dev_statistics[tdev->device_fh].tx_total++;
1060                                         dev_statistics[tdev->device_fh].tx += ret;
1061                                 }
1062                         }
1063
1064                         return 0;
1065                 }
1066                 dev_ll = dev_ll->next;
1067         }
1068
1069         return -1;
1070 }
1071
1072 /*
1073  * Check if the destination MAC of a packet belongs to a local VM;
1074  * if so, return its VLAN tag and the length offset to apply.
1075  */
1076 static inline int __attribute__((always_inline))
1077 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1078         uint32_t *offset, uint16_t *vlan_tag)
1079 {
1080         struct virtio_net_data_ll *dev_ll = ll_root_used;
1081         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1082
1083         while (dev_ll != NULL) {
1084                 if ((dev_ll->vdev->ready == DEVICE_RX)
1085                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1086                 &dev_ll->vdev->mac_address)) {
1087                         /*
1088                          * Drop the packet if the TX packet is
1089                          * destined for the TX device.
1090                          */
1091                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1092                                 LOG_DEBUG(VHOST_DATA,
1093                                 "(%"PRIu64") TX: Source and destination"
1094                                 " MAC addresses are the same. Dropping "
1095                                 "packet.\n",
1096                                 dev_ll->vdev->dev->device_fh);
1097                                 return -1;
1098                         }
1099
1100                         /*
1101                          * HW VLAN strip reduces the packet length by the
1102                          * length of the VLAN tag, so restore the packet
1103                          * length by adding it back.
1104                          */
1105                         *offset = VLAN_HLEN;
1106                         *vlan_tag =
1107                         (uint16_t)
1108                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1109
1110                         LOG_DEBUG(VHOST_DATA,
1111                         "(%"PRIu64") TX: pkt to local VM device id:"
1112                         "(%"PRIu64") vlan tag: %d.\n",
1113                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1114                         *vlan_tag);
1115
1116                         break;
1117                 }
1118                 dev_ll = dev_ll->next;
1119         }
1120         return 0;
1121 }
1122
1123 /*
1124  * This function routes the TX packet to the correct interface. This may be a local device
1125  * or the physical port.
1126  */
1127 static inline void __attribute__((always_inline))
1128 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1129 {
1130         struct mbuf_table *tx_q;
1131         struct rte_mbuf **m_table;
1132         unsigned len, ret, offset = 0;
1133         const uint16_t lcore_id = rte_lcore_id();
1134         struct virtio_net *dev = vdev->dev;
1135
1136         /*check if destination is local VM*/
1137         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1138                 rte_pktmbuf_free(m);
1139                 return;
1140         }
1141
1142         if (vm2vm_mode == VM2VM_HARDWARE) {
1143                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
1144                         offset > rte_pktmbuf_tailroom(m)) {
1145                         rte_pktmbuf_free(m);
1146                         return;
1147                 }
1148         }
1149
1150         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1151
1152         /*Add packet to the port tx queue*/
1153         tx_q = &lcore_tx_queue[lcore_id];
1154         len = tx_q->len;
1155
1156         m->ol_flags = PKT_TX_VLAN_PKT;
1157
1158         m->data_len += offset;
1159         m->pkt_len += offset;
1160
1161         m->vlan_tci = vlan_tag;
1162
1163         tx_q->m_table[len] = m;
1164         len++;
1165         if (enable_stats) {
1166                 dev_statistics[dev->device_fh].tx_total++;
1167                 dev_statistics[dev->device_fh].tx++;
1168         }
1169
1170         if (unlikely(len == MAX_PKT_BURST)) {
1171                 m_table = (struct rte_mbuf **)tx_q->m_table;
1172                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1173                 /* Free any buffers not handled by TX and update the port stats. */
1174                 if (unlikely(ret < len)) {
1175                         do {
1176                                 rte_pktmbuf_free(m_table[ret]);
1177                         } while (++ret < len);
1178                 }
1179
1180                 len = 0;
1181         }
1182
1183         tx_q->len = len;
1184         return;
1185 }
1186 /*
1187  * This function is called by each data core. It handles all RX/TX registered with the
1188  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1189  * with all devices in the main linked list.
1190  */
1191 static int
1192 switch_worker(void *arg)
1193 {
1194         struct rte_mempool *mbuf_pool = arg;
1195         struct virtio_net *dev = NULL;
1196         struct vhost_dev *vdev = NULL;
1197         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1198         struct virtio_net_data_ll *dev_ll;
1199         struct mbuf_table *tx_q;
1200         volatile struct lcore_ll_info *lcore_ll;
1201         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
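        /* Drain interval converted from microseconds into TSC cycles, rounding the cycles-per-microsecond up. */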
1202         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1203         unsigned ret, i;
1204         const uint16_t lcore_id = rte_lcore_id();
1205         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1206         uint16_t rx_count = 0;
1207         uint16_t tx_count;
1208         uint32_t retry = 0;
1209
1210         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1211         lcore_ll = lcore_info[lcore_id].lcore_ll;
1212         prev_tsc = 0;
1213
1214         tx_q = &lcore_tx_queue[lcore_id];
1215         for (i = 0; i < num_cores; i ++) {
1216                 if (lcore_ids[i] == lcore_id) {
1217                         tx_q->txq_id = i;
1218                         break;
1219                 }
1220         }
1221
1222         while(1) {
1223                 cur_tsc = rte_rdtsc();
1224                 /*
1225                  * TX burst queue drain
1226                  */
1227                 diff_tsc = cur_tsc - prev_tsc;
1228                 if (unlikely(diff_tsc > drain_tsc)) {
1229
1230                         if (tx_q->len) {
1231                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1232
1233                                 /*Tx any packets in the queue*/
1234                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1235                                                                            (struct rte_mbuf **)tx_q->m_table,
1236                                                                            (uint16_t)tx_q->len);
1237                                 if (unlikely(ret < tx_q->len)) {
1238                                         do {
1239                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1240                                         } while (++ret < tx_q->len);
1241                                 }
1242
1243                                 tx_q->len = 0;
1244                         }
1245
1246                         prev_tsc = cur_tsc;
1247
1248                 }
1249
1250                 rte_prefetch0(lcore_ll->ll_root_used);
1251                 /*
1252                  * Inform the configuration core that we have exited the linked list and that no devices are
1253                  * in use if requested.
1254                  */
1255                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1256                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1257
1258                 /*
1259                  * Process devices
1260                  */
1261                 dev_ll = lcore_ll->ll_root_used;
1262
1263                 while (dev_ll != NULL) {
1264                         /*get virtio device ID*/
1265                         vdev = dev_ll->vdev;
1266                         dev = vdev->dev;
1267
1268                         if (unlikely(vdev->remove)) {
1269                                 dev_ll = dev_ll->next;
1270                                 unlink_vmdq(vdev);
1271                                 vdev->ready = DEVICE_SAFE_REMOVE;
1272                                 continue;
1273                         }
1274                         if (likely(vdev->ready == DEVICE_RX)) {
1275                                 /*Handle guest RX*/
1276                                 rx_count = rte_eth_rx_burst(ports[0],
1277                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1278
1279                                 if (rx_count) {
1280                                         /*
1281                                          * If retry is enabled and the queue is full, wait and retry to avoid packet loss.
1282                                          * Here MAX_PKT_BURST must be less than the virtio queue size.
1283                                          */
1284                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1285                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1286                                                         rte_delay_us(burst_rx_delay_time);
1287                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1288                                                                 break;
1289                                                 }
1290                                         }
1291                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
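                                        /*
                                         * ret_count is how many packets the guest ring actually
                                         * accepted; anything beyond that is effectively dropped.
                                         * All host mbufs are freed below because the enqueue
                                         * copies the data into guest buffers.
                                         */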
1292                                         if (enable_stats) {
1293                                                 rte_atomic64_add(
1294                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1295                                                 rx_count);
1296                                                 rte_atomic64_add(
1297                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1298                                         }
1299                                         while (likely(rx_count)) {
1300                                                 rx_count--;
1301                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1302                                         }
1303
1304                                 }
1305                         }
1306
1307                         if (likely(!vdev->remove)) {
1308                                 /* Handle guest TX*/
1309                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1310                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1311                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1312                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1313                                                 while (tx_count--)
1314                                                         rte_pktmbuf_free(pkts_burst[tx_count]);
1315                                         }
1316                                 }
1317                                 while (tx_count)
1318                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1319                         }
1320
1321                         /*move to the next device in the list*/
1322                         dev_ll = dev_ll->next;
1323                 }
1324         }
1325
1326         return 0;
1327 }
1328
1329 /*
1330  * This function gets the number of available ring entries for zero copy RX.
1331  * Only one thread will call this function for a particular virtio device,
1332  * so it is designed as a non-thread-safe function.
1333  */
1334 static inline uint32_t __attribute__((always_inline))
1335 get_available_ring_num_zcp(struct virtio_net *dev)
1336 {
1337         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1338         uint16_t avail_idx;
1339
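        /*
         * The guest publishes avail->idx as a free-running 16-bit counter, so the
         * subtraction below is wrap-around safe: e.g. avail_idx == 3 and
         * last_used_idx_res == 65533 still yields 6 pending entries.
         */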
1340         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1341         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1342 }
1343
1344 /*
1345  * This function gets the available ring index for zero copy RX;
1346  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1347  * Only one thread will call this function for a particular virtio device,
1348  * so it is designed as a non-thread-safe function.
1349  */
1350 static inline uint32_t __attribute__((always_inline))
1351 get_available_ring_index_zcp(struct virtio_net *dev,
1352         uint16_t *res_base_idx, uint32_t count)
1353 {
1354         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1355         uint16_t avail_idx;
1356         uint32_t retry = 0;
1357         uint16_t free_entries;
1358
1359         *res_base_idx = vq->last_used_idx_res;
1360         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1361         free_entries = (avail_idx - *res_base_idx);
1362
1363         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1364                         "avail idx: %d, "
1365                         "res base idx:%d, free entries:%d\n",
1366                         dev->device_fh, avail_idx, *res_base_idx,
1367                         free_entries);
1368
1369         /*
1370          * If retry is enabled and the queue is full then we wait
1371          * and retry to avoid packet loss.
1372          */
1373         if (enable_retry && unlikely(count > free_entries)) {
1374                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1375                         rte_delay_us(burst_rx_delay_time);
1376                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1377                         free_entries = (avail_idx - *res_base_idx);
1378                         if (count <= free_entries)
1379                                 break;
1380                 }
1381         }
1382
1383         /*check that we have enough buffers*/
1384         if (unlikely(count > free_entries))
1385                 count = free_entries;
1386
1387         if (unlikely(count == 0)) {
1388                 LOG_DEBUG(VHOST_DATA,
1389                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1390                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1391                         dev->device_fh, avail_idx,
1392                         *res_base_idx, free_entries);
1393                 return 0;
1394         }
1395
1396         vq->last_used_idx_res = *res_base_idx + count;
1397
1398         return count;
1399 }
1400
1401 /*
1402  * This function puts a descriptor back to the used list.
1403  */
1404 static inline void __attribute__((always_inline))
1405 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1406 {
1407         uint16_t res_cur_idx = vq->last_used_idx;
1408         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1409         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
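        /*
         * The compiler barrier below keeps the used-ring entry writes above from
         * being reordered after the used->idx update, so the guest never sees an
         * index that points at a not-yet-written entry.
         */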
1410         rte_compiler_barrier();
1411         *(volatile uint16_t *)&vq->used->idx += 1;
1412         vq->last_used_idx += 1;
1413
1414         /* Kick the guest if necessary. */
1415         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1416                 eventfd_write((int)vq->kickfd, 1);
1417 }
1418
1419 /*
1420  * This function gets an available descriptor from the virtio vring and an
1421  * un-attached mbuf from vpool->ring, and then attaches them together. It needs
1422  * to adjust the offset for buff_addr and phys_addr according to the PMD
1423  * implementation, otherwise the frame data may be put in the wrong location in the mbuf.
1424  */
1425 static inline void __attribute__((always_inline))
1426 attach_rxmbuf_zcp(struct virtio_net *dev)
1427 {
1428         uint16_t res_base_idx, desc_idx;
1429         uint64_t buff_addr, phys_addr;
1430         struct vhost_virtqueue *vq;
1431         struct vring_desc *desc;
1432         struct rte_mbuf *mbuf = NULL;
1433         struct vpool *vpool;
1434         hpa_type addr_type;
1435         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1436
1437         vpool = &vpool_array[vdev->vmdq_rx_q];
1438         vq = dev->virtqueue[VIRTIO_RXQ];
1439
1440         do {
1441                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1442                                 1) != 1))
1443                         return;
1444                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1445
1446                 desc = &vq->desc[desc_idx];
1447                 if (desc->flags & VRING_DESC_F_NEXT) {
1448                         desc = &vq->desc[desc->next];
1449                         buff_addr = gpa_to_vva(dev, desc->addr);
1450                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1451                                         &addr_type);
1452                 } else {
1453                         buff_addr = gpa_to_vva(dev,
1454                                         desc->addr + vq->vhost_hlen);
1455                         phys_addr = gpa_to_hpa(vdev,
1456                                         desc->addr + vq->vhost_hlen,
1457                                         desc->len, &addr_type);
1458                 }
1459
1460                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1461                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1462                                 " address found when attaching RX frame buffer"
1463                                 " address!\n", dev->device_fh);
1464                         put_desc_to_used_list_zcp(vq, desc_idx);
1465                         continue;
1466                 }
1467
1468                 /*
1469                  * Check if the frame buffer address from guest crosses
1470                  * sub-region or not.
1471                  */
1472                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1473                         RTE_LOG(ERR, VHOST_DATA,
1474                                 "(%"PRIu64") Frame buffer address cross "
1475                                 "sub-regioin found when attaching RX frame "
1476                                 "buffer address!\n",
1477                                 dev->device_fh);
1478                         put_desc_to_used_list_zcp(vq, desc_idx);
1479                         continue;
1480                 }
1481         } while (unlikely(phys_addr == 0));
1482
1483         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1484         if (unlikely(mbuf == NULL)) {
1485                 LOG_DEBUG(VHOST_DATA,
1486                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1487                         "ring_sc_dequeue fail.\n",
1488                         dev->device_fh);
1489                 put_desc_to_used_list_zcp(vq, desc_idx);
1490                 return;
1491         }
1492
1493         if (unlikely(vpool->buf_size > desc->len)) {
1494                 LOG_DEBUG(VHOST_DATA,
1495                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1496                         "length(%d) of descriptor idx: %d less than room "
1497                         "size required: %d\n",
1498                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1499                 put_desc_to_used_list_zcp(vq, desc_idx);
1500                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1501                 return;
1502         }
1503
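        /*
         * Point the mbuf at the guest-provided buffer below: buf_addr and
         * buf_physaddr are rebased by RTE_PKTMBUF_HEADROOM so that when the PMD
         * writes the frame at buf_addr + data_off it lands exactly at the
         * descriptor address supplied by the guest.
         */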
1504         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1505         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1506         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1507         mbuf->data_len = desc->len;
1508         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1509
1510         LOG_DEBUG(VHOST_DATA,
1511                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1512                 "descriptor idx:%d\n",
1513                 dev->device_fh, res_base_idx, desc_idx);
1514
1515         __rte_mbuf_raw_free(mbuf);
1516
1517         return;
1518 }
1519
1520 /*
1521  * Detach an attached packet mbuf -
1522  *  - restore original mbuf address and length values.
1523  *  - reset pktmbuf data and data_len to their default values.
1524  *  All other fields of the given packet mbuf will be left intact.
1525  *
1526  * @param m
1527  *   The attached packet mbuf.
1528  */
1529 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1530 {
1531         const struct rte_mempool *mp = m->pool;
1532         void *buf = RTE_MBUF_TO_BADDR(m);
1533         uint32_t buf_ofs;
1534         uint32_t buf_len = mp->elt_size - sizeof(*m);
1535         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1536
1537         m->buf_addr = buf;
1538         m->buf_len = (uint16_t)buf_len;
1539
1540         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1541                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1542         m->data_off = buf_ofs;
1543
1544         m->data_len = 0;
1545 }
1546
1547 /*
1548  * This function is called after packets have been transmitted. It fetches mbufs
1549  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
1550  * the used index and kicks the guest if necessary.
1551  */
1552 static inline uint32_t __attribute__((always_inline))
1553 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1554 {
1555         struct rte_mbuf *mbuf;
1556         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1557         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1558         uint32_t index = 0;
1559         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1560
1561         LOG_DEBUG(VHOST_DATA,
1562                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1563                 "clean is: %d\n",
1564                 dev->device_fh, mbuf_count);
1565         LOG_DEBUG(VHOST_DATA,
1566                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1567                 "clean  is : %d\n",
1568                 dev->device_fh, rte_ring_count(vpool->ring));
1569
1570         for (index = 0; index < mbuf_count; index++) {
1571                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1572                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1573                         pktmbuf_detach_zcp(mbuf);
1574                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1575
1576                 /* Update used index buffer information. */
1577                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1578                 vq->used->ring[used_idx].len = 0;
1579
1580                 used_idx = (used_idx + 1) & (vq->size - 1);
1581         }
1582
1583         LOG_DEBUG(VHOST_DATA,
1584                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1585                 "clean is: %d\n",
1586                 dev->device_fh, rte_mempool_count(vpool->pool));
1587         LOG_DEBUG(VHOST_DATA,
1588                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1589                 "clean  is : %d\n",
1590                 dev->device_fh, rte_ring_count(vpool->ring));
1591         LOG_DEBUG(VHOST_DATA,
1592                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1593                 "vq->last_used_idx:%d\n",
1594                 dev->device_fh, vq->last_used_idx);
1595
1596         vq->last_used_idx += mbuf_count;
1597
1598         LOG_DEBUG(VHOST_DATA,
1599                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1600                 "vq->last_used_idx:%d\n",
1601                 dev->device_fh, vq->last_used_idx);
1602
1603         rte_compiler_barrier();
1604
1605         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1606
1607         /* Kick guest if required. */
1608         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1609                 eventfd_write((int)vq->kickfd, 1);
1610
1611         return 0;
1612 }
1613
1614 /*
1615  * This function is called when a virtio device is destroyed.
1616  * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1617  */
1618 static void mbuf_destroy_zcp(struct vpool *vpool)
1619 {
1620         struct rte_mbuf *mbuf = NULL;
1621         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1622
1623         LOG_DEBUG(VHOST_CONFIG,
1624                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1625                 "mbuf_destroy_zcp is: %d\n",
1626                 mbuf_count);
1627         LOG_DEBUG(VHOST_CONFIG,
1628                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1629                 "mbuf_destroy_zcp  is : %d\n",
1630                 rte_ring_count(vpool->ring));
1631
1632         for (index = 0; index < mbuf_count; index++) {
1633                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1634                 if (likely(mbuf != NULL)) {
1635                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
1636                                 pktmbuf_detach_zcp(mbuf);
1637                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1638                 }
1639         }
1640
1641         LOG_DEBUG(VHOST_CONFIG,
1642                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1643                 "mbuf_destroy_zcp is: %d\n",
1644                 rte_mempool_count(vpool->pool));
1645         LOG_DEBUG(VHOST_CONFIG,
1646                 "in mbuf_destroy_zcp: mbuf count in ring after "
1647                 "mbuf_destroy_zcp is : %d\n",
1648                 rte_ring_count(vpool->ring));
1649 }
1650
1651 /*
1652  * This function does the zero copy guest RX: it updates the used ring and copies a virtio header for each packet.
1653  */
1654 static inline uint32_t __attribute__((always_inline))
1655 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1656         uint32_t count)
1657 {
1658         struct vhost_virtqueue *vq;
1659         struct vring_desc *desc;
1660         struct rte_mbuf *buff;
1661         /* The virtio_hdr is initialised to 0. */
1662         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1663                 = {{0, 0, 0, 0, 0, 0}, 0};
1664         uint64_t buff_hdr_addr = 0;
1665         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1666         uint32_t head_idx, packet_success = 0;
1667         uint16_t res_cur_idx;
1668
1669         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1670
1671         if (count == 0)
1672                 return 0;
1673
1674         vq = dev->virtqueue[VIRTIO_RXQ];
1675         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1676
1677         res_cur_idx = vq->last_used_idx;
1678         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1679                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1680
1681         /* Retrieve all of the head indexes first to avoid caching issues. */
1682         for (head_idx = 0; head_idx < count; head_idx++)
1683                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1684
1685         /*Prefetch descriptor index. */
1686         rte_prefetch0(&vq->desc[head[packet_success]]);
1687
1688         while (packet_success != count) {
1689                 /* Get descriptor from available ring */
1690                 desc = &vq->desc[head[packet_success]];
1691
1692                 buff = pkts[packet_success];
1693                 LOG_DEBUG(VHOST_DATA,
1694                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1695                         "pkt[%d] descriptor idx: %d\n",
1696                         dev->device_fh, packet_success,
1697                         MBUF_HEADROOM_UINT32(buff));
1698
1699                 PRINT_PACKET(dev,
1700                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1701                         + RTE_PKTMBUF_HEADROOM),
1702                         rte_pktmbuf_data_len(buff), 0);
1703
1704                 /* Buffer address translation for virtio header. */
1705                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1706                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1707
1708                 /*
1709                  * If the descriptors are chained the header and data are
1710                  * placed in separate buffers.
1711                  */
1712                 if (desc->flags & VRING_DESC_F_NEXT) {
1713                         desc->len = vq->vhost_hlen;
1714                         desc = &vq->desc[desc->next];
1715                         desc->len = rte_pktmbuf_data_len(buff);
1716                 } else {
1717                         desc->len = packet_len;
1718                 }
1719
1720                 /* Update used ring with desc information */
1721                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1722                         = head[packet_success];
1723                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1724                         = packet_len;
1725                 res_cur_idx++;
1726                 packet_success++;
1727
1728                 /* A header is required per buffer. */
1729                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1730                         (const void *)&virtio_hdr, vq->vhost_hlen);
1731
1732                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1733
1734                 if (likely(packet_success < count)) {
1735                         /* Prefetch descriptor index. */
1736                         rte_prefetch0(&vq->desc[head[packet_success]]);
1737                 }
1738         }
1739
1740         rte_compiler_barrier();
1741
1742         LOG_DEBUG(VHOST_DATA,
1743                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1744                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1745                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1746
1747         *(volatile uint16_t *)&vq->used->idx += count;
1748         vq->last_used_idx += count;
1749
1750         LOG_DEBUG(VHOST_DATA,
1751                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1752                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1753                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1754
1755         /* Kick the guest if necessary. */
1756         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1757                 eventfd_write((int)vq->kickfd, 1);
1758
1759         return count;
1760 }
1761
1762 /*
1763  * This function routes the TX packet to the correct interface.
1764  * This may be a local device or the physical port.
1765  */
1766 static inline void __attribute__((always_inline))
1767 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1768         uint32_t desc_idx, uint8_t need_copy)
1769 {
1770         struct mbuf_table *tx_q;
1771         struct rte_mbuf **m_table;
1772         struct rte_mbuf *mbuf = NULL;
1773         unsigned len, ret, offset = 0;
1774         struct vpool *vpool;
1775         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1776         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1777
1778         /*Add packet to the port tx queue*/
1779         tx_q = &tx_queue_zcp[vmdq_rx_q];
1780         len = tx_q->len;
1781
1782         /* Allocate an mbuf and populate the structure. */
1783         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1784         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1785         if (unlikely(mbuf == NULL)) {
1786                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1787                 RTE_LOG(ERR, VHOST_DATA,
1788                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1789                         dev->device_fh);
1790                 put_desc_to_used_list_zcp(vq, desc_idx);
1791                 return;
1792         }
1793
1794         if (vm2vm_mode == VM2VM_HARDWARE) {
1795                 /* Avoid using a vlan tag from any vm for an external pkt, such as
1796                  * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1797                  * selection: the MAC address marks it as an external pkt that
1798                  * should go to the network, while the vlan tag marks it as a
1799                  * vm2vm pkt that should be forwarded to another vm. The hardware
1800                  * cannot resolve such an ambiguous situation, so the pkt will be lost.
1801                  */
1802                 vlan_tag = external_pkt_default_vlan_tag;
1803                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1804                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1805                         __rte_mbuf_raw_free(mbuf);
1806                         return;
1807                 }
1808         }
1809
1810         mbuf->nb_segs = m->nb_segs;
1811         mbuf->next = m->next;
1812         mbuf->data_len = m->data_len + offset;
1813         mbuf->pkt_len = mbuf->data_len;
1814         if (unlikely(need_copy)) {
1815                 /* Copy the packet contents to the mbuf. */
1816                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1817                         rte_pktmbuf_mtod(m, void *),
1818                         m->data_len);
1819         } else {
1820                 mbuf->data_off = m->data_off;
1821                 mbuf->buf_physaddr = m->buf_physaddr;
1822                 mbuf->buf_addr = m->buf_addr;
1823         }
1824         mbuf->ol_flags = PKT_TX_VLAN_PKT;
1825         mbuf->vlan_tci = vlan_tag;
1826         mbuf->l2_len = sizeof(struct ether_hdr);
1827         mbuf->l3_len = sizeof(struct ipv4_hdr);
1828         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1829
1830         tx_q->m_table[len] = mbuf;
1831         len++;
1832
1833         LOG_DEBUG(VHOST_DATA,
1834                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1835                 dev->device_fh,
1836                 mbuf->nb_segs,
1837                 (mbuf->next == NULL) ? "null" : "non-null");
1838
1839         if (enable_stats) {
1840                 dev_statistics[dev->device_fh].tx_total++;
1841                 dev_statistics[dev->device_fh].tx++;
1842         }
1843
1844         if (unlikely(len == MAX_PKT_BURST)) {
1845                 m_table = (struct rte_mbuf **)tx_q->m_table;
1846                 ret = rte_eth_tx_burst(ports[0],
1847                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1848
1849                 /*
1850                  * Free any buffers not handled by TX and update
1851                  * the port stats.
1852                  */
1853                 if (unlikely(ret < len)) {
1854                         do {
1855                                 rte_pktmbuf_free(m_table[ret]);
1856                         } while (++ret < len);
1857                 }
1858
1859                 len = 0;
1860                 txmbuf_clean_zcp(dev, vpool);
1861         }
1862
1863         tx_q->len = len;
1864
1865         return;
1866 }
1867
1868 /*
1869  * This function TXes all available packets in the virtio TX queue for one
1870  * virtio-net device. If it is the first packet, it learns the MAC address
1871  * and sets up VMDQ.
1872  */
1873 static inline void __attribute__((always_inline))
1874 virtio_dev_tx_zcp(struct virtio_net *dev)
1875 {
1876         struct rte_mbuf m;
1877         struct vhost_virtqueue *vq;
1878         struct vring_desc *desc;
1879         uint64_t buff_addr = 0, phys_addr;
1880         uint32_t head[MAX_PKT_BURST];
1881         uint32_t i;
1882         uint16_t free_entries, packet_success = 0;
1883         uint16_t avail_idx;
1884         uint8_t need_copy = 0;
1885         hpa_type addr_type;
1886         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1887
1888         vq = dev->virtqueue[VIRTIO_TXQ];
1889         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1890
1891         /* If there are no available buffers then return. */
1892         if (vq->last_used_idx_res == avail_idx)
1893                 return;
1894
1895         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1896
1897         /* Prefetch available ring to retrieve head indexes. */
1898         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1899
1900         /* Get the number of free entries in the ring */
1901         free_entries = (avail_idx - vq->last_used_idx_res);
1902
1903         /* Limit to MAX_PKT_BURST. */
1904         free_entries
1905                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1906
1907         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1908                 dev->device_fh, free_entries);
1909
1910         /* Retrieve all of the head indexes first to avoid caching issues. */
1911         for (i = 0; i < free_entries; i++)
1912                 head[i]
1913                         = vq->avail->ring[(vq->last_used_idx_res + i)
1914                         & (vq->size - 1)];
1915
1916         vq->last_used_idx_res += free_entries;
1917
1918         /* Prefetch descriptor index. */
1919         rte_prefetch0(&vq->desc[head[packet_success]]);
1920         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1921
1922         while (packet_success < free_entries) {
1923                 desc = &vq->desc[head[packet_success]];
1924
1925                 /* Discard first buffer as it is the virtio header */
1926                 desc = &vq->desc[desc->next];
1927
1928                 /* Buffer address translation. */
1929                 buff_addr = gpa_to_vva(dev, desc->addr);
1930                 /* Need check extra VLAN_HLEN size for inserting VLAN tag */
1931                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1932                         &addr_type);
1933
1934                 if (likely(packet_success < (free_entries - 1)))
1935                         /* Prefetch descriptor index. */
1936                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1937
1938                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1939                         RTE_LOG(ERR, VHOST_DATA,
1940                                 "(%"PRIu64") Invalid frame buffer address found"
1941                                 "when TX packets!\n",
1942                                 dev->device_fh);
1943                         packet_success++;
1944                         continue;
1945                 }
1946
1947                 /* Prefetch buffer address. */
1948                 rte_prefetch0((void *)(uintptr_t)buff_addr);
1949
1950                 /*
1951                  * Setup dummy mbuf. This is copied to a real mbuf if
1952                  * transmitted out the physical port.
1953                  */
1954                 m.data_len = desc->len;
1955                 m.nb_segs = 1;
1956                 m.next = NULL;
1957                 m.data_off = 0;
1958                 m.buf_addr = (void *)(uintptr_t)buff_addr;
1959                 m.buf_physaddr = phys_addr;
1960
1961                 /*
1962                  * Check if the frame buffer address from guest crosses
1963                  * sub-region or not.
1964                  */
1965                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1966                         RTE_LOG(ERR, VHOST_DATA,
1967                                 "(%"PRIu64") Frame buffer address cross "
1968                                 "sub-regioin found when attaching TX frame "
1969                                 "buffer address!\n",
1970                                 dev->device_fh);
1971                         need_copy = 1;
1972                 } else
1973                         need_copy = 0;
1974
1975                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1976
1977                 /*
1978                  * If this is the first received packet we need to learn
1979                  * the MAC and setup VMDQ
1980                  */
1981                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1982                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1983                                 /*
1984                                  * Discard frame if device is scheduled for
1985                                  * removal or a duplicate MAC address is found.
1986                                  */
1987                                 packet_success += free_entries;
1988                                 vq->last_used_idx += packet_success;
1989                                 break;
1990                         }
1991                 }
1992
1993                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1994                 packet_success++;
1995         }
1996 }
1997
1998 /*
1999  * This function is called by each data core. It handles all RX/TX registered
2000  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2001  * addresses are compared with all devices in the main linked list.
2002  */
2003 static int
2004 switch_worker_zcp(__attribute__((unused)) void *arg)
2005 {
2006         struct virtio_net *dev = NULL;
2007         struct vhost_dev  *vdev = NULL;
2008         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2009         struct virtio_net_data_ll *dev_ll;
2010         struct mbuf_table *tx_q;
2011         volatile struct lcore_ll_info *lcore_ll;
2012         const uint64_t drain_tsc
2013                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2014                 * BURST_TX_DRAIN_US;
2015         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2016         unsigned ret;
2017         const uint16_t lcore_id = rte_lcore_id();
2018         uint16_t count_in_ring, rx_count = 0;
2019
2020         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2021
2022         lcore_ll = lcore_info[lcore_id].lcore_ll;
2023         prev_tsc = 0;
2024
2025         while (1) {
2026                 cur_tsc = rte_rdtsc();
2027
2028                 /* TX burst queue drain */
2029                 diff_tsc = cur_tsc - prev_tsc;
2030                 if (unlikely(diff_tsc > drain_tsc)) {
2031                         /*
2032                          * Get mbuf from vpool.pool and detach mbuf and
2033                          * put back into vpool.ring.
2034                          */
2035                         dev_ll = lcore_ll->ll_root_used;
2036                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2037                                 /* Get virtio device ID */
2038                                 vdev = dev_ll->vdev;
2039                                 dev = vdev->dev;
2040
2041                                 if (likely(!vdev->remove)) {
2042                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2043                                         if (tx_q->len) {
2044                                                 LOG_DEBUG(VHOST_DATA,
2045                                                 "TX queue drained after timeout"
2046                                                 " with burst size %u\n",
2047                                                 tx_q->len);
2048
2049                                                 /*
2050                                                  * Tx any packets in the queue
2051                                                  */
2052                                                 ret = rte_eth_tx_burst(
2053                                                         ports[0],
2054                                                         (uint16_t)tx_q->txq_id,
2055                                                         (struct rte_mbuf **)
2056                                                         tx_q->m_table,
2057                                                         (uint16_t)tx_q->len);
2058                                                 if (unlikely(ret < tx_q->len)) {
2059                                                         do {
2060                                                                 rte_pktmbuf_free(
2061                                                                         tx_q->m_table[ret]);
2062                                                         } while (++ret < tx_q->len);
2063                                                 }
2064                                                 tx_q->len = 0;
2065
2066                                                 txmbuf_clean_zcp(dev,
2067                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2068                                         }
2069                                 }
2070                                 dev_ll = dev_ll->next;
2071                         }
2072                         prev_tsc = cur_tsc;
2073                 }
2074
2075                 rte_prefetch0(lcore_ll->ll_root_used);
2076
2077                 /*
2078                  * Inform the configuration core that we have exited the linked
2079                  * list and that no devices are in use if requested.
2080                  */
2081                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2082                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2083
2084                 /* Process devices */
2085                 dev_ll = lcore_ll->ll_root_used;
2086
2087                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2088                         vdev = dev_ll->vdev;
2089                         dev  = vdev->dev;
2090                         if (unlikely(vdev->remove)) {
2091                                 dev_ll = dev_ll->next;
2092                                 unlink_vmdq(vdev);
2093                                 vdev->ready = DEVICE_SAFE_REMOVE;
2094                                 continue;
2095                         }
2096
2097                         if (likely(vdev->ready == DEVICE_RX)) {
2098                                 uint32_t index = vdev->vmdq_rx_q;
2099                                 uint16_t i;
2100                                 count_in_ring
2101                                 = rte_ring_count(vpool_array[index].ring);
2102                                 uint16_t free_entries
2103                                 = (uint16_t)get_available_ring_num_zcp(dev);
2104
2105                                 /*
2106                                  * Attach all mbufs in vpool.ring and put back
2107                                  * into vpool.pool.
2108                                  */
2109                                 for (i = 0;
2110                                 i < RTE_MIN(free_entries,
2111                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2112                                 i++)
2113                                         attach_rxmbuf_zcp(dev);
2114
2115                                 /* Handle guest RX */
2116                                 rx_count = rte_eth_rx_burst(ports[0],
2117                                         vdev->vmdq_rx_q, pkts_burst,
2118                                         MAX_PKT_BURST);
2119
2120                                 if (rx_count) {
2121                                         ret_count = virtio_dev_rx_zcp(dev,
2122                                                         pkts_burst, rx_count);
2123                                         if (enable_stats) {
2124                                                 dev_statistics[dev->device_fh].rx_total
2125                                                         += rx_count;
2126                                                 dev_statistics[dev->device_fh].rx
2127                                                         += ret_count;
2128                                         }
2129                                         while (likely(rx_count)) {
2130                                                 rx_count--;
2131                                                 pktmbuf_detach_zcp(
2132                                                         pkts_burst[rx_count]);
2133                                                 rte_ring_sp_enqueue(
2134                                                         vpool_array[index].ring,
2135                                                         (void *)pkts_burst[rx_count]);
2136                                         }
2137                                 }
2138                         }
2139
2140                         if (likely(!vdev->remove))
2141                                 /* Handle guest TX */
2142                                 virtio_dev_tx_zcp(dev);
2143
2144                         /* Move to the next device in the list */
2145                         dev_ll = dev_ll->next;
2146                 }
2147         }
2148
2149         return 0;
2150 }
2151
2152
2153 /*
2154  * Add an entry to a used linked list. A free entry must first be found
2155  * in the free linked list using get_data_ll_free_entry();
2156  */
2157 static void
2158 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2159         struct virtio_net_data_ll *ll_dev)
2160 {
2161         struct virtio_net_data_ll *ll = *ll_root_addr;
2162
2163         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2164         ll_dev->next = NULL;
2165         rte_compiler_barrier();
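        /*
         * Data cores walk this list without locks, so the new entry must already
         * have a NULL next pointer (enforced by the barrier above) before it
         * becomes reachable from the tail of the used list.
         */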
2166
2167         /* If ll == NULL then this is the first device. */
2168         if (ll) {
2169                 /* Increment to the tail of the linked list. */
2170                 while (ll->next != NULL)
2171                         ll = ll->next;
2172
2173                 ll->next = ll_dev;
2174         } else {
2175                 *ll_root_addr = ll_dev;
2176         }
2177 }
2178
2179 /*
2180  * Remove an entry from a used linked list. The entry must then be added to
2181  * the free linked list using put_data_ll_free_entry().
2182  */
2183 static void
2184 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2185         struct virtio_net_data_ll *ll_dev,
2186         struct virtio_net_data_ll *ll_dev_last)
2187 {
2188         struct virtio_net_data_ll *ll = *ll_root_addr;
2189
2190         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2191                 return;
2192
2193         if (ll_dev == ll)
2194                 *ll_root_addr = ll_dev->next;
2195         else
2196                 if (likely(ll_dev_last != NULL))
2197                         ll_dev_last->next = ll_dev->next;
2198                 else
2199                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
2200 }
2201
2202 /*
2203  * Find and return an entry from the free linked list.
2204  */
2205 static struct virtio_net_data_ll *
2206 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2207 {
2208         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2209         struct virtio_net_data_ll *ll_dev;
2210
2211         if (ll_free == NULL)
2212                 return NULL;
2213
2214         ll_dev = ll_free;
2215         *ll_root_addr = ll_free->next;
2216
2217         return ll_dev;
2218 }
2219
2220 /*
2221  * Place an entry back on to the free linked list.
2222  */
2223 static void
2224 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2225         struct virtio_net_data_ll *ll_dev)
2226 {
2227         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2228
2229         if (ll_dev == NULL)
2230                 return;
2231
2232         ll_dev->next = ll_free;
2233         *ll_root_addr = ll_dev;
2234 }
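/*
 * Typical lifecycle of a linked list entry (illustrative sketch only):
 *
 *   struct virtio_net_data_ll *ll_dev = get_data_ll_free_entry(&ll_root_free);
 *   ll_dev->vdev = vdev;
 *   add_data_ll_entry(&ll_root_used, ll_dev);
 *   ...
 *   rm_data_ll_entry(&ll_root_used, ll_dev, ll_dev_last);
 *   put_data_ll_free_entry(&ll_root_free, ll_dev);
 */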
2235
2236 /*
2237  * Creates a linked list of a given size.
2238  */
2239 static struct virtio_net_data_ll *
2240 alloc_data_ll(uint32_t size)
2241 {
2242         struct virtio_net_data_ll *ll_new;
2243         uint32_t i;
2244
2245         /* Malloc and then chain the linked list. */
2246         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2247         if (ll_new == NULL) {
2248                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2249                 return NULL;
2250         }
2251
2252         for (i = 0; i < size - 1; i++) {
2253                 ll_new[i].vdev = NULL;
2254                 ll_new[i].next = &ll_new[i+1];
2255         }
2256         ll_new[i].next = NULL;
2257
2258         return (ll_new);
2259 }
2260
2261 /*
2262  * Create the main linked list along with each individual core's linked list. A used and a free list
2263  * are created to manage entries.
2264  */
2265 static int
2266 init_data_ll (void)
2267 {
2268         int lcore;
2269
2270         RTE_LCORE_FOREACH_SLAVE(lcore) {
2271                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2272                 if (lcore_info[lcore].lcore_ll == NULL) {
2273                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2274                         return -1;
2275                 }
2276
2277                 lcore_info[lcore].lcore_ll->device_num = 0;
2278                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2279                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
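                /*
                 * Size each core's free list as ceil(num_devices / num_switching_cores):
                 * e.g. 5 devices over 2 switching cores gives 3 entries per core.
                 */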
2280                 if (num_devices % num_switching_cores)
2281                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2282                 else
2283                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2284         }
2285
2286         /* Allocate devices up to a maximum of MAX_DEVICES. */
2287         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2288
2289         return 0;
2290 }
2291
2292 /*
2293  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2294  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2295  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2296  */
2297 static void
2298 destroy_device (volatile struct virtio_net *dev)
2299 {
2300         struct virtio_net_data_ll *ll_lcore_dev_cur;
2301         struct virtio_net_data_ll *ll_main_dev_cur;
2302         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2303         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2304         struct vhost_dev *vdev;
2305         int lcore;
2306
2307         dev->flags &= ~VIRTIO_DEV_RUNNING;
2308
2309         vdev = (struct vhost_dev *)dev->priv;
2310         /*set the remove flag. */
2311         vdev->remove = 1;
2312         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2313                 rte_pause();
2314         }
2315
2316         /* Search for entry to be removed from lcore ll */
2317         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2318         while (ll_lcore_dev_cur != NULL) {
2319                 if (ll_lcore_dev_cur->vdev == vdev) {
2320                         break;
2321                 } else {
2322                         ll_lcore_dev_last = ll_lcore_dev_cur;
2323                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2324                 }
2325         }
2326
2327         if (ll_lcore_dev_cur == NULL) {
2328                 RTE_LOG(ERR, VHOST_CONFIG,
2329                         "(%"PRIu64") Failed to find the dev to be destroy.\n",
2330                         dev->device_fh);
2331                 return;
2332         }
2333
2334         /* Search for entry to be removed from main ll */
2335         ll_main_dev_cur = ll_root_used;
2336         ll_main_dev_last = NULL;
2337         while (ll_main_dev_cur != NULL) {
2338                 if (ll_main_dev_cur->vdev == vdev) {
2339                         break;
2340                 } else {
2341                         ll_main_dev_last = ll_main_dev_cur;
2342                         ll_main_dev_cur = ll_main_dev_cur->next;
2343                 }
2344         }
2345
2346         /* Remove entries from the lcore and main ll. */
2347         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2348         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2349
2350         /* Set the dev_removal_flag on each lcore. */
2351         RTE_LCORE_FOREACH_SLAVE(lcore) {
2352                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2353         }
2354
2355         /*
2356          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2357          * they can no longer access the device removed from the linked lists and that the devices
2358          * are no longer in use.
2359          */
2360         RTE_LCORE_FOREACH_SLAVE(lcore) {
2361                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2362                         rte_pause();
2363                 }
2364         }
2365
2366         /* Add the entries back to the lcore and main free ll.*/
2367         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2368         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2369
2370         /* Decrement number of device on the lcore. */
2371         lcore_info[vdev->coreid].lcore_ll->device_num--;
2372
2373         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2374
2375         if (zero_copy) {
2376                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2377
2378                 /* Stop the RX queue. */
2379                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2380                         LOG_DEBUG(VHOST_CONFIG,
2381                                 "(%"PRIu64") In destroy_device: Failed to stop "
2382                                 "rx queue:%d\n",
2383                                 dev->device_fh,
2384                                 vdev->vmdq_rx_q);
2385                 }
2386
2387                 LOG_DEBUG(VHOST_CONFIG,
2388                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2389                         "mempool back to ring for RX queue: %d\n",
2390                         dev->device_fh, vdev->vmdq_rx_q);
2391
2392                 mbuf_destroy_zcp(vpool);
2393
2394                 /* Stop the TX queue. */
2395                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2396                         LOG_DEBUG(VHOST_CONFIG,
2397                                 "(%"PRIu64") In destroy_device: Failed to "
2398                                 "stop tx queue:%d\n",
2399                                 dev->device_fh, vdev->vmdq_rx_q);
2400                 }
2401
2402                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2403
2404                 LOG_DEBUG(VHOST_CONFIG,
2405                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2406                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2407                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2408                         dev->device_fh);
2409
2410                 mbuf_destroy_zcp(vpool);
2411                 rte_free(vdev->regions_hpa);
2412         }
2413         rte_free(vdev);
2414
2415 }
2416
2417 /*
2418  * Calculate the region count of physically contiguous regions for one particular
2419  * region whose vhost virtual address is contiguous. The particular region
2420  * starts from vva_start, with the size given by the 'size' argument.
2421  */
2422 static uint32_t
2423 check_hpa_regions(uint64_t vva_start, uint64_t size)
2424 {
2425         uint32_t i, nregions = 0, page_size = getpagesize();
2426         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2427         if (vva_start % page_size) {
2428                 LOG_DEBUG(VHOST_CONFIG,
2429                         "in check_countinous: vva start(%p) mod page_size(%d) "
2430                         "has remainder\n",
2431                         (void *)(uintptr_t)vva_start, page_size);
2432                 return 0;
2433         }
2434         if (size % page_size) {
2435                 LOG_DEBUG(VHOST_CONFIG,
2436                         "in check_countinous: "
2437                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2438                         size, page_size);
2439                 return 0;
2440         }
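        /*
         * Walk the region one page at a time; each time two consecutive pages are
         * not physically adjacent, count one more discontinuity, i.e. one extra
         * physically contiguous sub-region beyond the first.
         */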
2441         for (i = 0; i < size - page_size; i = i + page_size) {
2442                 cur_phys_addr
2443                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2444                 next_phys_addr = rte_mem_virt2phy(
2445                         (void *)(uintptr_t)(vva_start + i + page_size));
2446                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2447                         ++nregions;
2448                         LOG_DEBUG(VHOST_CONFIG,
2449                                 "in check_continuous: hva addr:(%p) is not "
2450                                 "continuous with hva addr:(%p), diff:%d\n",
2451                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2452                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2453                                 + page_size), page_size);
2454                         LOG_DEBUG(VHOST_CONFIG,
2455                                 "in check_continuous: hpa addr:(%p) is not "
2456                                 "continuous with hpa addr:(%p), "
2457                                 "diff:(%"PRIu64")\n",
2458                                 (void *)(uintptr_t)cur_phys_addr,
2459                                 (void *)(uintptr_t)next_phys_addr,
2460                                 (next_phys_addr-cur_phys_addr));
2461                 }
2462         }
2463         return nregions;
2464 }
2465
2466 /*
2467  * Divide each region whose vhost virtual address is contiguous into a few
2468  * sub-regions, making sure the physical addresses within each sub-region are
2469  * contiguous, and fill the offset (to GPA), size and other information of each
2470  * sub-region into regions_hpa.
2471  */
2472 static uint32_t
2473 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2474 {
2475         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2476         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2477
2478         if (mem_region_hpa == NULL)
2479                 return 0;
2480
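        /*
         * For each guest memory region: record the region start and its host
         * physical address offset, then scan page by page; whenever the host
         * physical addresses stop being contiguous, close the current sub-region
         * (start, end, size) and open a new one at the next page.
         */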
2481         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2482                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2483                         virtio_memory->regions[regionidx].address_offset;
2484                 mem_region_hpa[regionidx_hpa].guest_phys_address
2485                         = virtio_memory->regions[regionidx].guest_phys_address;
2486                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2487                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2488                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2489                 LOG_DEBUG(VHOST_CONFIG,
2490                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2491                         regionidx_hpa,
2492                         (void *)(uintptr_t)
2493                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2494                 LOG_DEBUG(VHOST_CONFIG,
2495                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2496                         regionidx_hpa,
2497                         (void *)(uintptr_t)
2498                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2499                 for (i = 0, k = 0;
2500                         i < virtio_memory->regions[regionidx].memory_size -
2501                                 page_size;
2502                         i += page_size) {
2503                         cur_phys_addr = rte_mem_virt2phy(
2504                                         (void *)(uintptr_t)(vva_start + i));
2505                         next_phys_addr = rte_mem_virt2phy(
2506                                         (void *)(uintptr_t)(vva_start +
2507                                         i + page_size));
2508                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2509                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2510                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2511                                         k + page_size;
2512                                 mem_region_hpa[regionidx_hpa].memory_size
2513                                         = k + page_size;
2514                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2515                                         "phys addr end  [%d]:(%p)\n",
2516                                         regionidx_hpa,
2517                                         (void *)(uintptr_t)
2518                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2519                                 LOG_DEBUG(VHOST_CONFIG,
2520                                         "in fill_hpa_regions: guest phys addr "
2521                                         "size [%d]:(%p)\n",
2522                                         regionidx_hpa,
2523                                         (void *)(uintptr_t)
2524                                         (mem_region_hpa[regionidx_hpa].memory_size));
2525                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2526                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2527                                 ++regionidx_hpa;
2528                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2529                                         next_phys_addr -
2530                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2531                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2532                                         " phys addr start[%d]:(%p)\n",
2533                                         regionidx_hpa,
2534                                         (void *)(uintptr_t)
2535                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2536                                 LOG_DEBUG(VHOST_CONFIG,
2537                                         "in fill_hpa_regions: host  phys addr "
2538                                         "start[%d]:(%p)\n",
2539                                         regionidx_hpa,
2540                                         (void *)(uintptr_t)
2541                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2542                                 k = 0;
2543                         } else {
2544                                 k += page_size;
2545                         }
2546                 }
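                /*
                 * Close out the last sub-region of this guest region: it runs
                 * from the most recent discontinuity (or the region start) up
                 * to the end of the region.
                 */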
2547                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2548                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2549                         + k + page_size;
2550                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2551                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2552                         "[%d]:(%p)\n", regionidx_hpa,
2553                         (void *)(uintptr_t)
2554                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2555                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2556                         "[%d]:(%p)\n", regionidx_hpa,
2557                         (void *)(uintptr_t)
2558                         (mem_region_hpa[regionidx_hpa].memory_size));
2559                 ++regionidx_hpa;
2560         }
2561         return regionidx_hpa;
2562 }
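
/*
 * Minimal sketch (illustrative only, not called elsewhere in this file) of how
 * the table filled by fill_hpa_memory_regions() can be consumed: locate the
 * sub-region containing a guest physical address and apply its offset.  The
 * zero-copy data path performs an equivalent lookup.
 */
static inline uint64_t
gpa_to_hpa_sketch(struct virtio_memory_regions_hpa *regions, uint32_t nregions,
	uint64_t guest_pa)
{
	uint32_t i;

	for (i = 0; i < nregions; i++) {
		/* Each sub-region covers [guest_phys_address, guest_phys_address_end). */
		if (guest_pa >= regions[i].guest_phys_address &&
			guest_pa < regions[i].guest_phys_address_end)
			return guest_pa + regions[i].host_phys_addr_offset;
	}
	return 0; /* Not found; the caller must treat 0 as invalid. */
}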
2563
2564 /*
2565  * A new device is added to a data core. First the device is added to the main linked list
2566  * and then allocated to a specific data core.
2567  */
2568 static int
2569 new_device (struct virtio_net *dev)
2570 {
2571         struct virtio_net_data_ll *ll_dev;
2572         int lcore, core_add = 0;
2573         uint32_t device_num_min = num_devices;
2574         struct vhost_dev *vdev;
2575         uint32_t regionidx;
2576
2577         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2578         if (vdev == NULL) {
2579                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2580                         dev->device_fh);
2581                 return -1;
2582         }
2583         vdev->dev = dev;
2584         dev->priv = vdev;
2585
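        /*
         * For zero copy, the guest's memory regions are split into sub-regions
         * that are physically continuous on the host; first count how many
         * sub-regions are needed, then allocate and fill the table.
         */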
2586         if (zero_copy) {
2587                 vdev->nregions_hpa = dev->mem->nregions;
2588                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2589                         vdev->nregions_hpa
2590                                 += check_hpa_regions(
2591                                         dev->mem->regions[regionidx].guest_phys_address
2592                                         + dev->mem->regions[regionidx].address_offset,
2593                                         dev->mem->regions[regionidx].memory_size);
2594
2595                 }
2596
2597                 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2598                         sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2599                         RTE_CACHE_LINE_SIZE);
2600                 if (vdev->regions_hpa == NULL) {
2601                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2602                         rte_free(vdev);
2603                         return -1;
2604                 }
2605
2606
2607                 if (fill_hpa_memory_regions(
2608                         vdev->regions_hpa, dev->mem
2609                         ) != vdev->nregions_hpa) {
2610
2611                         RTE_LOG(ERR, VHOST_CONFIG,
2612                                 "hpa memory regions number mismatch: "
2613                                 "[%d]\n", vdev->nregions_hpa);
2614                         rte_free(vdev->regions_hpa);
2615                         rte_free(vdev);
2616                         return -1;
2617                 }
2618         }
2619
2620
2621         /* Add device to main ll */
2622         ll_dev = get_data_ll_free_entry(&ll_root_free);
2623         if (ll_dev == NULL) {
2624                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2625                         "of %d devices per core has been reached\n",
2626                         dev->device_fh, num_devices);
2627                 if (vdev->regions_hpa)
2628                         rte_free(vdev->regions_hpa);
2629                 rte_free(vdev);
2630                 return -1;
2631         }
2632         ll_dev->vdev = vdev;
2633         add_data_ll_entry(&ll_root_used, ll_dev);
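        /*
         * With the new VMDQ API the pool queues may not start at queue 0, so
         * the device's RX queue index is its device_fh scaled by the number of
         * queues per pool plus the VMDQ base queue offset.
         */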
2634         vdev->vmdq_rx_q
2635                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2636
2637         if (zero_copy) {
2638                 uint32_t index = vdev->vmdq_rx_q;
2639                 uint32_t count_in_ring, i;
2640                 struct mbuf_table *tx_q;
2641
2642                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2643
2644                 LOG_DEBUG(VHOST_CONFIG,
2645                         "(%"PRIu64") in new_device: mbuf count in mempool "
2646                         "before attach is: %d\n",
2647                         dev->device_fh,
2648                         rte_mempool_count(vpool_array[index].pool));
2649                 LOG_DEBUG(VHOST_CONFIG,
2650                         "(%"PRIu64") in new_device: mbuf count in  ring "
2651                         "before attach  is : %d\n",
2652                         dev->device_fh, count_in_ring);
2653
2654                 /*
2655                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2656                  */
2657                 for (i = 0; i < count_in_ring; i++)
2658                         attach_rxmbuf_zcp(dev);
2659
2660                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2661                         "mempool after attach is: %d\n",
2662                         dev->device_fh,
2663                         rte_mempool_count(vpool_array[index].pool));
2664                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2665                         "ring after attach  is : %d\n",
2666                         dev->device_fh,
2667                         rte_ring_count(vpool_array[index].ring));
2668
2669                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2670                 tx_q->txq_id = vdev->vmdq_rx_q;
2671
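                /*
                 * The RX/TX queues were configured with deferred start (see
                 * main()), so start them only now that the guest's buffers
                 * have been attached.
                 */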
2672                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2673                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2674
2675                         LOG_DEBUG(VHOST_CONFIG,
2676                                 "(%"PRIu64") In new_device: Failed to start "
2677                                 "tx queue:%d\n",
2678                                 dev->device_fh, vdev->vmdq_rx_q);
2679
2680                         mbuf_destroy_zcp(vpool);
2681                         rte_free(vdev->regions_hpa);
2682                         rte_free(vdev);
2683                         return -1;
2684                 }
2685
2686                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2687                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2688
2689                         LOG_DEBUG(VHOST_CONFIG,
2690                                 "(%"PRIu64") In new_device: Failed to start "
2691                                 "rx queue:%d\n",
2692                                 dev->device_fh, vdev->vmdq_rx_q);
2693
2694                         /* Stop the TX queue. */
2695                         if (rte_eth_dev_tx_queue_stop(ports[0],
2696                                 vdev->vmdq_rx_q) != 0) {
2697                                 LOG_DEBUG(VHOST_CONFIG,
2698                                         "(%"PRIu64") In new_device: Failed to "
2699                                         "stop tx queue:%d\n",
2700                                         dev->device_fh, vdev->vmdq_rx_q);
2701                         }
2702
2703                         mbuf_destroy_zcp(vpool);
2704                         rte_free(vdev->regions_hpa);
2705                         rte_free(vdev);
2706                         return -1;
2707                 }
2708
2709         }
2710
2711         /*reset ready flag*/
2712         vdev->ready = DEVICE_MAC_LEARNING;
2713         vdev->remove = 0;
2714
2715         /* Find a suitable lcore to add the device. */
2716         RTE_LCORE_FOREACH_SLAVE(lcore) {
2717                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2718                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2719                         core_add = lcore;
2720                 }
2721         }
2722         /* Add device to lcore ll */
2723         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2724         if (ll_dev == NULL) {
2725                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2726                 vdev->ready = DEVICE_SAFE_REMOVE;
2727                 destroy_device(dev);
2728                 if (vdev->regions_hpa)
2729                         rte_free(vdev->regions_hpa);
2730                 rte_free(vdev);
2731                 return -1;
2732         }
2733         ll_dev->vdev = vdev;
2734         vdev->coreid = core_add;
2735
2736         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2737
2738         /* Initialize device stats */
2739         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2740
2741         /* Disable notifications. */
2742         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2743         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2744         lcore_info[vdev->coreid].lcore_ll->device_num++;
2745         dev->flags |= VIRTIO_DEV_RUNNING;
2746
2747         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2748
2749         return 0;
2750 }
2751
2752 /*
2753  * These callbacks allow devices to be added to the data core when configuration
2754  * has fully completed.
2755  */
2756 static const struct virtio_net_device_ops virtio_net_device_ops =
2757 {
2758         .new_device =  new_device,
2759         .destroy_device = destroy_device,
2760 };
2761
2762 /*
2763  * This is a thread that wakes up periodically to print stats if the user has
2764  * enabled them.
2765  */
2766 static void
2767 print_stats(void)
2768 {
2769         struct virtio_net_data_ll *dev_ll;
2770         uint64_t tx_dropped, rx_dropped;
2771         uint64_t tx, tx_total, rx, rx_total;
2772         uint32_t device_fh;
2773         const char clr[] = { 27, '[', '2', 'J', '\0' };
2774         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2775
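        /*
         * clr and top_left are the ANSI escape sequences ESC[2J (clear screen)
         * and ESC[1;1H (move the cursor to the top-left corner).
         */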
2776         while(1) {
2777                 sleep(enable_stats);
2778
2779                 /* Clear screen and move to top left */
2780                 printf("%s%s", clr, top_left);
2781
2782                 printf("\nDevice statistics ====================================");
2783
2784                 dev_ll = ll_root_used;
2785                 while (dev_ll != NULL) {
2786                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2787                         tx_total = dev_statistics[device_fh].tx_total;
2788                         tx = dev_statistics[device_fh].tx;
2789                         tx_dropped = tx_total - tx;
2790                         if (zero_copy == 0) {
2791                                 rx_total = rte_atomic64_read(
2792                                         &dev_statistics[device_fh].rx_total_atomic);
2793                                 rx = rte_atomic64_read(
2794                                         &dev_statistics[device_fh].rx_atomic);
2795                         } else {
2796                                 rx_total = dev_statistics[device_fh].rx_total;
2797                                 rx = dev_statistics[device_fh].rx;
2798                         }
2799                         rx_dropped = rx_total - rx;
2800
2801                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2802                                         "\nTX total:            %"PRIu64""
2803                                         "\nTX dropped:          %"PRIu64""
2804                                         "\nTX successful:               %"PRIu64""
2805                                         "\nRX total:            %"PRIu64""
2806                                         "\nRX dropped:          %"PRIu64""
2807                                         "\nRX successful:               %"PRIu64"",
2808                                         device_fh,
2809                                         tx_total,
2810                                         tx_dropped,
2811                                         tx,
2812                                         rx_total,
2813                                         rx_dropped,
2814                                         rx);
2815
2816                         dev_ll = dev_ll->next;
2817                 }
2818                 printf("\n======================================================\n");
2819         }
2820 }
2821
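/*
 * Create the mempool and companion ring used by one zero-copy queue on the
 * given NUMA socket and record them in vpool_array[index].  The ring holds
 * the mbufs that are not currently attached to guest buffers.
 */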
2822 static void
2823 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2824         char *ring_name, uint32_t nb_mbuf)
2825 {
2826         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2827         vpool_array[index].pool
2828                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2829                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2830                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2831                 rte_pktmbuf_init, NULL, socket, 0);
2832         if (vpool_array[index].pool != NULL) {
2833                 vpool_array[index].ring
2834                         = rte_ring_create(ring_name,
2835                                 rte_align32pow2(nb_mbuf + 1),
2836                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2837                 if (likely(vpool_array[index].ring != NULL)) {
2838                         LOG_DEBUG(VHOST_CONFIG,
2839                                 "in setup_mempool_tbl: mbuf count in "
2840                                 "mempool is: %d\n",
2841                                 rte_mempool_count(vpool_array[index].pool));
2842                         LOG_DEBUG(VHOST_CONFIG,
2843                                 "in setup_mempool_tbl: mbuf count in "
2844                                 "ring   is: %d\n",
2845                                 rte_ring_count(vpool_array[index].ring));
2846                 } else {
2847                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2848                                 ring_name);
2849                 }
2850
2851                 /* Exclude the mbuf headroom from the usable buffer size. */
2852                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2853         } else {
2854                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2855         }
2856 }
2857
2858
2859 /*
2860  * Main function, does initialisation and calls the per-lcore functions. The CUSE
2861  * device is also registered here to handle the IOCTLs.
2862  */
2863 int
2864 main(int argc, char *argv[])
2865 {
2866         struct rte_mempool *mbuf_pool = NULL;
2867         unsigned lcore_id, core_id = 0;
2868         unsigned nb_ports, valid_num_ports;
2869         int ret;
2870         uint8_t portid;
2871         uint16_t queue_id;
2872         static pthread_t tid;
2873
2874         /* init EAL */
2875         ret = rte_eal_init(argc, argv);
2876         if (ret < 0)
2877                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2878         argc -= ret;
2879         argv += ret;
2880
2881         /* parse app arguments */
2882         ret = us_vhost_parse_args(argc, argv);
2883         if (ret < 0)
2884                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2885
2886         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2887                 if (rte_lcore_is_enabled(lcore_id))
2888                         lcore_ids[core_id ++] = lcore_id;
2889
2890         if (rte_lcore_count() > RTE_MAX_LCORE)
2891                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
2892
2893         /* Set the number of switching cores available. */
2894         num_switching_cores = rte_lcore_count()-1;
2895
2896         /* Get the number of physical ports. */
2897         nb_ports = rte_eth_dev_count();
2898         if (nb_ports > RTE_MAX_ETHPORTS)
2899                 nb_ports = RTE_MAX_ETHPORTS;
2900
2901         /*
2902          * Update the global variable num_ports and the global array ports,
2903          * and derive valid_num_ports from the number of ports present on the system.
2904          */
2905         valid_num_ports = check_ports_num(nb_ports);
2906
2907         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
2908                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2909                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2910                 return -1;
2911         }
2912
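        /*
         * In the default copy mode a single shared mbuf pool serves every queue;
         * in zero-copy mode each RX and TX queue gets its own pool and ring,
         * created below by setup_mempool_tbl().
         */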
2913         if (zero_copy == 0) {
2914                 /* Create the mbuf pool. */
2915                 mbuf_pool = rte_mempool_create(
2916                                 "MBUF_POOL",
2917                                 NUM_MBUFS_PER_PORT
2918                                 * valid_num_ports,
2919                                 MBUF_SIZE, MBUF_CACHE_SIZE,
2920                                 sizeof(struct rte_pktmbuf_pool_private),
2921                                 rte_pktmbuf_pool_init, NULL,
2922                                 rte_pktmbuf_init, NULL,
2923                                 rte_socket_id(), 0);
2924                 if (mbuf_pool == NULL)
2925                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2926
2927                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2928                         vpool_array[queue_id].pool = mbuf_pool;
2929
2930                 if (vm2vm_mode == VM2VM_HARDWARE) {
2931                         /* Enable VT loop back to let L2 switch to do it. */
2932                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2933                         LOG_DEBUG(VHOST_CONFIG,
2934                                 "Enable loop back for L2 switch in vmdq.\n");
2935                 }
2936         } else {
2937                 uint32_t nb_mbuf;
2938                 char pool_name[RTE_MEMPOOL_NAMESIZE];
2939                 char ring_name[RTE_MEMPOOL_NAMESIZE];
2940
2941                 /*
2942                  * Zero copy defers queue RX/TX start until the guest has
2943                  * finished its startup and packet buffers from that guest are
2944                  * available.
2945                  */
2946                 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2947                 rx_conf_default.rx_drop_en = 0;
2948                 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2949                 nb_mbuf = num_rx_descriptor
2950                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2951                         + num_switching_cores * MAX_PKT_BURST;
2952
2953                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2954                         snprintf(pool_name, sizeof(pool_name),
2955                                 "rxmbuf_pool_%u", queue_id);
2956                         snprintf(ring_name, sizeof(ring_name),
2957                                 "rxmbuf_ring_%u", queue_id);
2958                         setup_mempool_tbl(rte_socket_id(), queue_id,
2959                                 pool_name, ring_name, nb_mbuf);
2960                 }
2961
2962                 nb_mbuf = num_tx_descriptor
2963                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2964                                 + num_switching_cores * MAX_PKT_BURST;
2965
2966                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2967                         snprintf(pool_name, sizeof(pool_name),
2968                                 "txmbuf_pool_%u", queue_id);
2969                         snprintf(ring_name, sizeof(ring_name),
2970                                 "txmbuf_ring_%u", queue_id);
2971                         setup_mempool_tbl(rte_socket_id(),
2972                                 (queue_id + MAX_QUEUES),
2973                                 pool_name, ring_name, nb_mbuf);
2974                 }
2975
2976                 if (vm2vm_mode == VM2VM_HARDWARE) {
2977                         /* Enable VT loop back to let L2 switch to do it. */
2978                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2979                         LOG_DEBUG(VHOST_CONFIG,
2980                                 "Enable loop back for L2 switch in vmdq.\n");
2981                 }
2982         }
2983         /* Set log level. */
2984         rte_set_log_level(LOG_LEVEL);
2985
2986         /* initialize all ports */
2987         for (portid = 0; portid < nb_ports; portid++) {
2988                 /* skip ports that are not enabled */
2989                 if ((enabled_port_mask & (1 << portid)) == 0) {
2990                         RTE_LOG(INFO, VHOST_PORT,
2991                                 "Skipping disabled port %d\n", portid);
2992                         continue;
2993                 }
2994                 if (port_init(portid) != 0)
2995                         rte_exit(EXIT_FAILURE,
2996                                 "Cannot initialize network ports\n");
2997         }
2998
2999         /* Initialise all linked lists. */
3000         if (init_data_ll() == -1)
3001                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3002
3003         /* Initialize device stats */
3004         memset(&dev_statistics, 0, sizeof(dev_statistics));
3005
3006         /* Enable stats if the user option is set. */
3007         if (enable_stats)
3008                 pthread_create(&tid, NULL, (void*)print_stats, NULL );
3009
3010         /* Launch all data cores. */
3011         if (zero_copy == 0) {
3012                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3013                         rte_eal_remote_launch(switch_worker,
3014                                 mbuf_pool, lcore_id);
3015                 }
3016         } else {
3017                 uint32_t count_in_mempool, index, i;
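                /*
                 * Indices 0..MAX_QUEUES-1 hold the RX pools and
                 * MAX_QUEUES..2*MAX_QUEUES-1 the TX pools created earlier;
                 * seed each ring with the free mbufs from its pool.
                 */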
3018                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3019                         /* For all RX and TX queues. */
3020                         count_in_mempool
3021                                 = rte_mempool_count(vpool_array[index].pool);
3022
3023                         /*
3024                          * Transfer all un-attached mbufs from vpool.pool
3025                          * to vpool.ring.
3026                          */
3027                         for (i = 0; i < count_in_mempool; i++) {
3028                                 struct rte_mbuf *mbuf
3029                                         = __rte_mbuf_raw_alloc(
3030                                                 vpool_array[index].pool);
3031                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3032                                                 (void *)mbuf);
3033                         }
3034
3035                         LOG_DEBUG(VHOST_CONFIG,
3036                                 "in main: mbuf count in mempool at initial "
3037                                 "is: %d\n", count_in_mempool);
3038                         LOG_DEBUG(VHOST_CONFIG,
3039                                 "in main: mbuf count in  ring at initial  is :"
3040                                 " %d\n",
3041                                 rte_ring_count(vpool_array[index].ring));
3042                 }
3043
3044                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3045                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3046                                 lcore_id);
3047         }
3048
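        /*
         * Unless mergeable RX buffers were requested, mask the feature out so
         * guests do not negotiate VIRTIO_NET_F_MRG_RXBUF.
         */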
3049         if (mergeable == 0)
3050                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3051
3052         /* Register CUSE device to handle IOCTLs. */
3053         ret = rte_vhost_driver_register((char *)&dev_basename);
3054         if (ret != 0)
3055                 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3056
3057         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3058
3059         /* Start CUSE session. */
3060         rte_vhost_driver_session_start();
3061         return 0;
3062
3063 }
3064