examples/vhost: add virtio offload
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 #include <rte_udp.h>
56 #include <rte_sctp.h>
57
58 #include "main.h"
59
60 #ifndef MAX_QUEUES
61 #define MAX_QUEUES 128
62 #endif
63
64 /* the maximum number of external ports supported */
65 #define MAX_SUP_PORTS 1
66
67 /*
68  * Calculate the number of buffers needed per port.
69  */
70 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +  \
71                             (num_switching_cores * MAX_PKT_BURST) +    \
72                             (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
73                             (num_switching_cores * MBUF_CACHE_SIZE))
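/*
 * For example, with MAX_QUEUES at 128, the default 1024 RX descriptors
 * and two switching cores, this comes to 128 * 1024 + 2 * (32 + 512 +
 * 128), i.e. roughly 132K mbufs for the port (using the defaults
 * defined below).
 */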
74
75 #define MBUF_CACHE_SIZE 128
76 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
77
78 /*
79  * For the zero copy implementation, no frame data buffers need to be
80  * allocated on the host: the guest allocates the frame data buffers,
81  * and vhost uses them directly.
82  */
83 #define VIRTIO_DESCRIPTOR_LEN_ZCP       RTE_MBUF_DEFAULT_DATAROOM
84 #define MBUF_DATA_SIZE_ZCP              RTE_MBUF_DEFAULT_BUF_SIZE
85 #define MBUF_CACHE_SIZE_ZCP 0
86
87 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
88 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
89
90 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
91 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
92
93 #define JUMBO_FRAME_MAX_SIZE    0x2600
94
95 /* State of virtio device. */
96 #define DEVICE_MAC_LEARNING 0
97 #define DEVICE_RX                       1
98 #define DEVICE_SAFE_REMOVE      2
99
100 /* Config_core_flag status definitions. */
101 #define REQUEST_DEV_REMOVAL 1
102 #define ACK_DEV_REMOVAL 0
103
104 /* Configurable number of RX/TX ring descriptors */
105 #define RTE_TEST_RX_DESC_DEFAULT 1024
106 #define RTE_TEST_TX_DESC_DEFAULT 512
107
108 /*
109  * These two macros need refining for the legacy and DPDK-based front
110  * ends: take the max vring avail descriptors/entries from the guest,
111  * subtract MAX_PKT_BURST, then round down to a power of 2.
112  */
113 /*
114  * The legacy front end exposes 128 descriptors:
115  * half for virtio headers, the other half for mbufs.
116  */
117 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
118 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
119
120 /* Get first 4 bytes in mbuf headroom. */
121 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
122                 + sizeof(struct rte_mbuf)))
123
124 /* true if x is a power of 2 */
125 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
126
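/*
 * Illustrative helper (a sketch, not used by the example itself):
 * round a descriptor count down to a power of 2, as the "adjust
 * power 2" note above suggests. Clearing the lowest set bit until
 * only one bit remains leaves the largest power of 2 <= x.
 * Assumes x > 0.
 */
static inline uint32_t
round_down_pow2(uint32_t x)
{
        while (!POWEROF2(x))
                x &= x - 1; /* clear the lowest set bit */
        return x;
}
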
127 #define INVALID_PORT_ID 0xFF
128
129 /* Max number of devices. Limited by vmdq. */
130 #define MAX_DEVICES 64
131
132 /* Size of buffers used for snprintfs. */
133 #define MAX_PRINT_BUFF 6072
134
135 /* Maximum character device basename size. */
136 #define MAX_BASENAME_SZ 10
137
138 /* Maximum long option length for option parsing. */
139 #define MAX_LONG_OPT_SZ 64
140
141 /* Used to compare MAC addresses. */
142 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
143
144 /* Number of descriptors per cacheline. */
145 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
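/* With a typical 64-byte cache line and a 16-byte struct vring_desc,
 * DESC_PER_CACHELINE works out to 4. */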
146
147 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
148
149 /* mask of enabled ports */
150 static uint32_t enabled_port_mask = 0;
151
152 /* Promiscuous mode */
153 static uint32_t promiscuous;
154
155 /* Number of switching cores enabled */
156 static uint32_t num_switching_cores = 0;
157
158 /* Number of devices/queues to support */
159 static uint32_t num_queues = 0;
160 static uint32_t num_devices;
161
162 /*
163  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
164  * descriptors. Disabled by default.
165  */
166 static uint32_t zero_copy;
167 static int mergeable;
168
169 /* Do VLAN strip on the host, enabled by default */
170 static uint32_t vlan_strip = 1;
171
172 /* Number of descriptors to use */
173 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
174 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
175
176 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
177 #define MAX_RING_DESC 4096
178
179 struct vpool {
180         struct rte_mempool *pool;
181         struct rte_ring *ring;
182         uint32_t buf_size;
183 } vpool_array[MAX_QUEUES+MAX_QUEUES];
184
185 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
186 typedef enum {
187         VM2VM_DISABLED = 0,
188         VM2VM_SOFTWARE = 1,
189         VM2VM_HARDWARE = 2,
190         VM2VM_LAST
191 } vm2vm_type;
192 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
193
194 /* The type of host physical address translated from guest physical address. */
195 typedef enum {
196         PHYS_ADDR_CONTINUOUS = 0,
197         PHYS_ADDR_CROSS_SUBREG = 1,
198         PHYS_ADDR_INVALID = 2,
199         PHYS_ADDR_LAST
200 } hpa_type;
201
202 /* Enable stats. */
203 static uint32_t enable_stats = 0;
204 /* Enable retries on RX. */
205 static uint32_t enable_retry = 1;
206
207 /* Enable TX checksum offload; off by default. */
208 static uint32_t enable_tx_csum;
209 
210 /* Enable TSO offload; off by default. */
211 static uint32_t enable_tso;
212
213 /* Specify the timeout (in microseconds) between retries on RX. */
214 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
215 /* Specify the number of retries on RX. */
216 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
217
218 /* Character device basename. Can be set by user. */
219 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
220
221 /* Empty VMDQ configuration structure. Filled in programmatically. */
222 static struct rte_eth_conf vmdq_conf_default = {
223         .rxmode = {
224                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
225                 .split_hdr_size = 0,
226                 .header_split   = 0, /**< Header Split disabled */
227                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
228                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
229                 /*
230                  * Enabling this is necessary for 1G NICs such as the I350;
231                  * it fixes a bug where IPv4 forwarding in the guest could
232                  * not forward packets from one virtio dev to another.
233                  */
234                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
235                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
236                 .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
237         },
238
239         .txmode = {
240                 .mq_mode = ETH_MQ_TX_NONE,
241         },
242         .rx_adv_conf = {
243                 /*
244                  * should be overridden separately in code with
245                  * appropriate values
246                  */
247                 .vmdq_rx_conf = {
248                         .nb_queue_pools = ETH_8_POOLS,
249                         .enable_default_pool = 0,
250                         .default_pool = 0,
251                         .nb_pool_maps = 0,
252                         .pool_map = {{0, 0},},
253                 },
254         },
255 };
256
257 static unsigned lcore_ids[RTE_MAX_LCORE];
258 static uint8_t ports[RTE_MAX_ETHPORTS];
259 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
260 static uint16_t num_pf_queues, num_vmdq_queues;
261 static uint16_t vmdq_pool_base, vmdq_queue_base;
262 static uint16_t queues_per_pool;
263
264 static const uint16_t external_pkt_default_vlan_tag = 2000;
265 const uint16_t vlan_tags[] = {
266         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
267         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
268         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
269         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
270         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
271         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
272         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
273         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
274 };
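/* Device N (its device_fh) is assigned VLAN tag 1000 + N from this table. */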
275
276 /* ethernet addresses of ports */
277 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
278
279 /* heads for the main used and free linked lists for the data path. */
280 static struct virtio_net_data_ll *ll_root_used = NULL;
281 static struct virtio_net_data_ll *ll_root_free = NULL;
282
283 /* Array of data core structures containing information on individual core linked lists. */
284 static struct lcore_info lcore_info[RTE_MAX_LCORE];
285
286 /* Used for queueing bursts of TX packets. */
287 struct mbuf_table {
288         unsigned len;
289         unsigned txq_id;
290         struct rte_mbuf *m_table[MAX_PKT_BURST];
291 };
292
293 /* TX queue for each data core. */
294 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
295
296 /* TX queue for each virtio device for zero copy. */
297 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
298
299 /* Vlan header struct used to insert vlan tags on TX. */
300 struct vlan_ethhdr {
301         unsigned char   h_dest[ETH_ALEN];
302         unsigned char   h_source[ETH_ALEN];
303         __be16          h_vlan_proto;
304         __be16          h_vlan_TCI;
305         __be16          h_vlan_encapsulated_proto;
306 };
307
308 /* Header lengths. */
309 #define VLAN_HLEN       4
310 #define VLAN_ETH_HLEN   18
311
312 /* Per-device statistics struct */
313 struct device_statistics {
314         uint64_t tx_total;
315         rte_atomic64_t rx_total_atomic;
316         uint64_t rx_total;
317         uint64_t tx;
318         rte_atomic64_t rx_atomic;
319         uint64_t rx;
320 } __rte_cache_aligned;
321 struct device_statistics dev_statistics[MAX_DEVICES];
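/*
 * The RX counters are atomic because any core may route packets to a
 * given device (VM2VM); the TX counters are only updated by the core
 * that owns the device, so plain uint64_t fields suffice.
 */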
322
323 /*
324  * Builds up the correct configuration for VMDQ VLAN pool map
325  * according to the pool & queue limits.
326  */
327 static inline int
328 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
329 {
330         struct rte_eth_vmdq_rx_conf conf;
331         struct rte_eth_vmdq_rx_conf *def_conf =
332                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
333         unsigned i;
334
335         memset(&conf, 0, sizeof(conf));
336         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
337         conf.nb_pool_maps = num_devices;
338         conf.enable_loop_back = def_conf->enable_loop_back;
339         conf.rx_mode = def_conf->rx_mode;
340
341         for (i = 0; i < conf.nb_pool_maps; i++) {
342                 conf.pool_map[i].vlan_id = vlan_tags[i];
343                 conf.pool_map[i].pools = (1UL << i);
344         }
345
346         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
347         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
348                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
349         return 0;
350 }
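/*
 * For example, with num_devices == 8 the map built above directs
 * traffic tagged 1000 + i to pool i only (pools bitmask 1UL << i),
 * giving each virtio device its own VMDQ pool.
 */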
351
352 /*
353  * Validate the device count against the max pool number obtained from
354  * dev_info. If the device count is invalid, log an error and
355  * return -1. Each device must have its own pool.
356  */
357 static inline int
358 validate_num_devices(uint32_t max_nb_devices)
359 {
360         if (num_devices > max_nb_devices) {
361                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
362                 return -1;
363         }
364         return 0;
365 }
366
367 /*
368  * Initialises a given port using global settings, with the RX buffers
369  * coming from the mbuf_pool passed as a parameter.
370  */
371 static inline int
372 port_init(uint8_t port)
373 {
374         struct rte_eth_dev_info dev_info;
375         struct rte_eth_conf port_conf;
376         struct rte_eth_rxconf *rxconf;
377         struct rte_eth_txconf *txconf;
378         int16_t rx_rings, tx_rings;
379         uint16_t rx_ring_size, tx_ring_size;
380         int retval;
381         uint16_t q;
382
383         /* The max pool number from dev_info is used to validate the pool number specified on the command line. */
384         rte_eth_dev_info_get(port, &dev_info);
385
386         if (dev_info.max_rx_queues > MAX_QUEUES) {
387                 rte_exit(EXIT_FAILURE,
388                         "please define MAX_QUEUES to be no less than %u in %s\n",
389                         dev_info.max_rx_queues, __FILE__);
390         }
391
392         rxconf = &dev_info.default_rxconf;
393         txconf = &dev_info.default_txconf;
394         rxconf->rx_drop_en = 1;
395
396         /* Enable vlan offload */
397         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
398
399         /*
400          * Zero copy defers queue RX/TX start to the time when guest
401          * finishes its startup and packet buffers from that guest are
402          * available.
403          */
404         if (zero_copy) {
405                 rxconf->rx_deferred_start = 1;
406                 rxconf->rx_drop_en = 0;
407                 txconf->tx_deferred_start = 1;
408         }
409
410         /* Configure the number of supported virtio devices based on VMDQ limits. */
411         num_devices = dev_info.max_vmdq_pools;
412
413         if (zero_copy) {
414                 rx_ring_size = num_rx_descriptor;
415                 tx_ring_size = num_tx_descriptor;
416                 tx_rings = dev_info.max_tx_queues;
417         } else {
418                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
419                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
420                 tx_rings = (uint16_t)rte_lcore_count();
421         }
422
423         retval = validate_num_devices(MAX_DEVICES);
424         if (retval < 0)
425                 return retval;
426
427         /* Get port configuration. */
428         retval = get_eth_conf(&port_conf, num_devices);
429         if (retval < 0)
430                 return retval;
431         /* NIC queues are divided into pf queues and vmdq queues.  */
432         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
433         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
434         num_vmdq_queues = num_devices * queues_per_pool;
435         num_queues = num_pf_queues + num_vmdq_queues;
436         vmdq_queue_base = dev_info.vmdq_queue_base;
437         vmdq_pool_base  = dev_info.vmdq_pool_base;
438         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
439                 num_pf_queues, num_devices, queues_per_pool);
440
441         if (port >= rte_eth_dev_count()) return -1;
442
443         if (enable_tx_csum == 0)
444                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
445
446         if (enable_tso == 0) {
447                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
448                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
449         }
450
451         rx_rings = (uint16_t)dev_info.max_rx_queues;
452         /* Configure ethernet device. */
453         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
454         if (retval != 0)
455                 return retval;
456
457         /* Setup the queues. */
458         for (q = 0; q < rx_rings; q ++) {
459                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
460                                                 rte_eth_dev_socket_id(port),
461                                                 rxconf,
462                                                 vpool_array[q].pool);
463                 if (retval < 0)
464                         return retval;
465         }
466         for (q = 0; q < tx_rings; q ++) {
467                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
468                                                 rte_eth_dev_socket_id(port),
469                                                 txconf);
470                 if (retval < 0)
471                         return retval;
472         }
473
474         /* Start the device. */
475         retval  = rte_eth_dev_start(port);
476         if (retval < 0) {
477                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
478                 return retval;
479         }
480
481         if (promiscuous)
482                 rte_eth_promiscuous_enable(port);
483
484         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
485         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
486         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
487                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
488                         (unsigned)port,
489                         vmdq_ports_eth_addr[port].addr_bytes[0],
490                         vmdq_ports_eth_addr[port].addr_bytes[1],
491                         vmdq_ports_eth_addr[port].addr_bytes[2],
492                         vmdq_ports_eth_addr[port].addr_bytes[3],
493                         vmdq_ports_eth_addr[port].addr_bytes[4],
494                         vmdq_ports_eth_addr[port].addr_bytes[5]);
495
496         return 0;
497 }
498
499 /*
500  * Set character device basename.
501  */
502 static int
503 us_vhost_parse_basename(const char *q_arg)
504 {
505         /* Reject a basename too long for the dev_basename buffer. */
506 
507         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
508                 return -1;
509         else
510                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
511
512         return 0;
513 }
514
515 /*
516  * Parse the portmask provided at run time.
517  */
518 static int
519 parse_portmask(const char *portmask)
520 {
521         char *end = NULL;
522         unsigned long pm;
523
524         errno = 0;
525
526         /* parse hexadecimal string */
527         pm = strtoul(portmask, &end, 16);
528         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
529                 return 0; /* the caller treats 0 as an invalid mask */
530 
531         if (pm == 0)
532                 return 0;
533
534         return pm;
535
536 }
537
538 /*
539  * Parse num options at run time.
540  */
541 static int
542 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
543 {
544         char *end = NULL;
545         unsigned long num;
546
547         errno = 0;
548
549         /* parse unsigned int string */
550         num = strtoul(q_arg, &end, 10);
551         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
552                 return -1;
553
554         if (num > max_valid_value)
555                 return -1;
556
557         return num;
558
559 }
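/*
 * For example, parse_num_opt("4", 8) returns 4, while out-of-range or
 * malformed input ("9", "4x") returns -1. Callers rely on -1 as the
 * error value, so max_valid_value must fit in an int.
 */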
560
561 /*
562  * Display usage
563  */
564 static void
565 us_vhost_usage(const char *prgname)
566 {
567         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
568         "               --vm2vm [0|1|2]\n"
569         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
570         "               --dev-basename <name>\n"
571         "               -P: enable promiscuous mode\n"
572         "               -p PORTMASK: Set mask for ports to be used by application\n"
573         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
574         "               --rx-retry [0|1]: disable/enable(default) retries on RX. Retry if the destination queue is full\n"
575         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective if RX retries are enabled\n"
576         "               --rx-retry-num [0-N]: the number of retries on RX. Only effective if RX retries are enabled\n"
577         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
578         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
579         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
580         "               --dev-basename: The basename to be used for the character device.\n"
581         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
582                         "zero copy\n"
583         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
584                         "used only when zero copy is enabled.\n"
585         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
586                         "used only when zero copy is enabled.\n"
587         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
588         "               --tso [0|1] disable/enable TCP segmentation offload.\n",
589                prgname);
590 }
591
592 /*
593  * Parse the arguments given in the command line of the application.
594  */
595 static int
596 us_vhost_parse_args(int argc, char **argv)
597 {
598         int opt, ret;
599         int option_index;
600         unsigned i;
601         const char *prgname = argv[0];
602         static struct option long_option[] = {
603                 {"vm2vm", required_argument, NULL, 0},
604                 {"rx-retry", required_argument, NULL, 0},
605                 {"rx-retry-delay", required_argument, NULL, 0},
606                 {"rx-retry-num", required_argument, NULL, 0},
607                 {"mergeable", required_argument, NULL, 0},
608                 {"vlan-strip", required_argument, NULL, 0},
609                 {"stats", required_argument, NULL, 0},
610                 {"dev-basename", required_argument, NULL, 0},
611                 {"zero-copy", required_argument, NULL, 0},
612                 {"rx-desc-num", required_argument, NULL, 0},
613                 {"tx-desc-num", required_argument, NULL, 0},
614                 {"tx-csum", required_argument, NULL, 0},
615                 {"tso", required_argument, NULL, 0},
616                 {NULL, 0, 0, 0},
617         };
618
619         /* Parse command line */
620         while ((opt = getopt_long(argc, argv, "p:P",
621                         long_option, &option_index)) != EOF) {
622                 switch (opt) {
623                 /* Portmask */
624                 case 'p':
625                         enabled_port_mask = parse_portmask(optarg);
626                         if (enabled_port_mask == 0) {
627                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
628                                 us_vhost_usage(prgname);
629                                 return -1;
630                         }
631                         break;
632
633                 case 'P':
634                         promiscuous = 1;
635                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
636                                 ETH_VMDQ_ACCEPT_BROADCAST |
637                                 ETH_VMDQ_ACCEPT_MULTICAST;
638                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
639
640                         break;
641
642                 case 0:
643                         /* Enable/disable vm2vm comms. */
644                         if (!strncmp(long_option[option_index].name, "vm2vm",
645                                 MAX_LONG_OPT_SZ)) {
646                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
647                                 if (ret == -1) {
648                                         RTE_LOG(INFO, VHOST_CONFIG,
649                                                 "Invalid argument for "
650                                                 "vm2vm [0|1|2]\n");
651                                         us_vhost_usage(prgname);
652                                         return -1;
653                                 } else {
654                                         vm2vm_mode = (vm2vm_type)ret;
655                                 }
656                         }
657
658                         /* Enable/disable retries on RX. */
659                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
660                                 ret = parse_num_opt(optarg, 1);
661                                 if (ret == -1) {
662                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
663                                         us_vhost_usage(prgname);
664                                         return -1;
665                                 } else {
666                                         enable_retry = ret;
667                                 }
668                         }
669
670                         /* Enable/disable TX checksum offload. */
671                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
672                                 ret = parse_num_opt(optarg, 1);
673                                 if (ret == -1) {
674                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
675                                         us_vhost_usage(prgname);
676                                         return -1;
677                                 } else
678                                         enable_tx_csum = ret;
679                         }
680
681                         /* Enable/disable TSO offload. */
682                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
683                                 ret = parse_num_opt(optarg, 1);
684                                 if (ret == -1) {
685                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
686                                         us_vhost_usage(prgname);
687                                         return -1;
688                                 } else
689                                         enable_tso = ret;
690                         }
691
692                         /* Specify the retry delay time (in microseconds) on RX. */
693                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
694                                 ret = parse_num_opt(optarg, INT32_MAX);
695                                 if (ret == -1) {
696                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
697                                         us_vhost_usage(prgname);
698                                         return -1;
699                                 } else {
700                                         burst_rx_delay_time = ret;
701                                 }
702                         }
703
704                         /* Specify the number of retries on RX. */
705                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
706                                 ret = parse_num_opt(optarg, INT32_MAX);
707                                 if (ret == -1) {
708                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
709                                         us_vhost_usage(prgname);
710                                         return -1;
711                                 } else {
712                                         burst_rx_retry_num = ret;
713                                 }
714                         }
715
716                         /* Enable/disable RX mergeable buffers. */
717                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
718                                 ret = parse_num_opt(optarg, 1);
719                                 if (ret == -1) {
720                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
721                                         us_vhost_usage(prgname);
722                                         return -1;
723                                 } else {
724                                         mergeable = !!ret;
725                                         if (ret) {
726                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
727                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
728                                                         = JUMBO_FRAME_MAX_SIZE;
729                                         }
730                                 }
731                         }
732
733                         /* Enable/disable RX VLAN strip on host. */
734                         if (!strncmp(long_option[option_index].name,
735                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
736                                 ret = parse_num_opt(optarg, 1);
737                                 if (ret == -1) {
738                                         RTE_LOG(INFO, VHOST_CONFIG,
739                                                 "Invalid argument for VLAN strip [0|1]\n");
740                                         us_vhost_usage(prgname);
741                                         return -1;
742                                 } else {
743                                         vlan_strip = !!ret;
744                                         vmdq_conf_default.rxmode.hw_vlan_strip =
745                                                 vlan_strip;
746                                 }
747                         }
748
749                         /* Enable/disable stats. */
750                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
751                                 ret = parse_num_opt(optarg, INT32_MAX);
752                                 if (ret == -1) {
753                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
754                                         us_vhost_usage(prgname);
755                                         return -1;
756                                 } else {
757                                         enable_stats = ret;
758                                 }
759                         }
760
761                         /* Set character device basename. */
762                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
763                                 if (us_vhost_parse_basename(optarg) == -1) {
764                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
765                                         us_vhost_usage(prgname);
766                                         return -1;
767                                 }
768                         }
769
770                         /* Enable/disable rx/tx zero copy. */
771                         if (!strncmp(long_option[option_index].name,
772                                 "zero-copy", MAX_LONG_OPT_SZ)) {
773                                 ret = parse_num_opt(optarg, 1);
774                                 if (ret == -1) {
775                                         RTE_LOG(INFO, VHOST_CONFIG,
776                                                 "Invalid argument"
777                                                 " for zero-copy [0|1]\n");
778                                         us_vhost_usage(prgname);
779                                         return -1;
780                                 } else
781                                         zero_copy = ret;
782                         }
783
784                         /* Specify the descriptor number on RX. */
785                         if (!strncmp(long_option[option_index].name,
786                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
787                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
788                                 if ((ret == -1) || (!POWEROF2(ret))) {
789                                         RTE_LOG(INFO, VHOST_CONFIG,
790                                         "Invalid argument for rx-desc-num [0-N], "
791                                         "power of 2 required.\n");
792                                         us_vhost_usage(prgname);
793                                         return -1;
794                                 } else {
795                                         num_rx_descriptor = ret;
796                                 }
797                         }
798
799                         /* Specify the descriptor number on TX. */
800                         if (!strncmp(long_option[option_index].name,
801                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
802                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
803                                 if ((ret == -1) || (!POWEROF2(ret))) {
804                                         RTE_LOG(INFO, VHOST_CONFIG,
805                                         "Invalid argument for tx-desc-num [0-N], "
806                                         "power of 2 required.\n");
807                                         us_vhost_usage(prgname);
808                                         return -1;
809                                 } else {
810                                         num_tx_descriptor = ret;
811                                 }
812                         }
813
814                         break;
815
816                         /* Invalid option - print options. */
817                 default:
818                         us_vhost_usage(prgname);
819                         return -1;
820                 }
821         }
822
823         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
824                 if (enabled_port_mask & (1 << i))
825                         ports[num_ports++] = (uint8_t)i;
826         }
827
828         if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
829                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
830                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
831                 return -1;
832         }
833
834         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
835                 RTE_LOG(INFO, VHOST_PORT,
836                         "Vhost zero copy doesn't support software vm2vm, "
837                         "please specify '--vm2vm 2' to use hardware vm2vm.\n");
838                 return -1;
839         }
840
841         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
842                 RTE_LOG(INFO, VHOST_PORT,
843                         "Vhost zero copy doesn't support jumbo frames, "
844                         "please specify '--mergeable 0' to disable the "
845                         "mergeable feature.\n");
846                 return -1;
847         }
848
849         return 0;
850 }
851
852 /*
853  * Update the global var num_ports and the ports array according to the
854  * number of system ports, and return the number of valid ports.
855  */
856 static unsigned check_ports_num(unsigned nb_ports)
857 {
858         unsigned valid_num_ports = num_ports;
859         unsigned portid;
860
861         if (num_ports > nb_ports) {
862                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
863                         num_ports, nb_ports);
864                 num_ports = nb_ports;
865         }
866
867         for (portid = 0; portid < num_ports; portid ++) {
868                 if (ports[portid] >= nb_ports) {
869                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
870                                 ports[portid], (nb_ports - 1));
871                         ports[portid] = INVALID_PORT_ID;
872                         valid_num_ports--;
873                 }
874         }
875         return valid_num_ports;
876 }
877
878 /*
879  * Macro to print out packet contents. Wrapped in debug define so that the
880  * data path is not affected when debug is disabled.
881  */
882 #ifdef DEBUG
883 #define PRINT_PACKET(device, addr, size, header) do {                  \
884         char *pkt_addr = (char *)(addr);                                \
885         unsigned int index;                                             \
886         char packet[MAX_PRINT_BUFF];                                    \
887                                                                         \
888         if ((header))                                                   \
889                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
890         else                                                            \
891                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
892         for (index = 0; index < (size); index++) {                      \
893                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
894                         "%02hhx ", pkt_addr[index]);                    \
895         }                                                               \
896         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
897                                                                         \
898         LOG_DEBUG(VHOST_DATA, "%s", packet);                            \
899 } while (0)
900 #else
901 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
902 #endif
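/*
 * Illustrative usage (a sketch, assuming dev is a struct virtio_net *
 * and buff_addr a host-mapped buffer address): dump the first rx_count
 * bytes of a received buffer when compiled with -DDEBUG:
 *
 *      PRINT_PACKET(dev, (uintptr_t)buff_addr, rx_count, 0);
 */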
903
904 /*
905  * Function to convert guest physical addresses to vhost physical addresses.
906  * This is used to convert virtio buffer addresses.
907  */
908 static inline uint64_t __attribute__((always_inline))
909 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
910         uint32_t buf_len, hpa_type *addr_type)
911 {
912         struct virtio_memory_regions_hpa *region;
913         uint32_t regionidx;
914         uint64_t vhost_pa = 0;
915
916         *addr_type = PHYS_ADDR_INVALID;
917
918         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
919                 region = &vdev->regions_hpa[regionidx];
920                 if ((guest_pa >= region->guest_phys_address) &&
921                         (guest_pa <= region->guest_phys_address_end)) {
922                         vhost_pa = region->host_phys_addr_offset + guest_pa;
923                         if (likely((guest_pa + buf_len - 1)
924                                 <= region->guest_phys_address_end))
925                                 *addr_type = PHYS_ADDR_CONTINUOUS;
926                         else
927                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
928                         break;
929                 }
930         }
931
932         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
933                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
934                 (void *)(uintptr_t)vhost_pa);
935
936         return vhost_pa;
937 }
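/*
 * Illustrative usage (a sketch): callers must check addr_type, since a
 * buffer that crosses sub-regions is not physically continuous and
 * cannot be used as a single DMA target:
 *
 *      hpa_type addr_type;
 *      uint64_t hpa = gpa_to_hpa(vdev, guest_pa, len, &addr_type);
 *
 * If addr_type comes back as anything other than PHYS_ADDR_CONTINUOUS,
 * the caller must fall back to copying or splitting the buffer.
 */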
938
939 /*
940  * Compares a packet destination MAC address to a device MAC address.
941  */
942 static inline int __attribute__((always_inline))
943 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
944 {
945         return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
946 }
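/*
 * Note: this loads 8 bytes from each struct ether_addr and masks with
 * MAC_ADDR_CMP, so only the low 6 bytes (the 48-bit MAC address on a
 * little-endian CPU) take part in the compare; it relies on the two
 * bytes past the address being safely readable.
 */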
947
948 /*
949  * This function learns the MAC address of the device and registers it,
950  * along with a VLAN tag, with a VMDQ pool.
951  */
952 static int
953 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
954 {
955         struct ether_hdr *pkt_hdr;
956         struct virtio_net_data_ll *dev_ll;
957         struct virtio_net *dev = vdev->dev;
958         int i, ret;
959
960         /* Learn MAC address of guest device from packet */
961         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
962
963         dev_ll = ll_root_used;
964
965         while (dev_ll != NULL) {
966                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
967                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
968                         return -1;
969                 }
970                 dev_ll = dev_ll->next;
971         }
972
973         for (i = 0; i < ETHER_ADDR_LEN; i++)
974                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
975
976         /* vlan_tag currently uses the device_id. */
977         vdev->vlan_tag = vlan_tags[dev->device_fh];
978
979         /* Print out VMDQ registration info. */
980         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
981                 dev->device_fh,
982                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
983                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
984                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
985                 vdev->vlan_tag);
986
987         /* Register the MAC address. */
988         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
989                                 (uint32_t)dev->device_fh + vmdq_pool_base);
990         if (ret)
991                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
992                                         dev->device_fh);
993
994         /* Enable stripping of the vlan tag as we handle routing. */
995         if (vlan_strip)
996                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
997                         (uint16_t)vdev->vmdq_rx_q, 1);
998
999         /* Set device as ready for RX. */
1000         vdev->ready = DEVICE_RX;
1001
1002         return 0;
1003 }
1004
1005 /*
1006  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1007  * queue before disabling RX on the device.
1008  */
1009 static inline void
1010 unlink_vmdq(struct vhost_dev *vdev)
1011 {
1012         unsigned i = 0;
1013         unsigned rx_count;
1014         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1015
1016         if (vdev->ready == DEVICE_RX) {
1017                 /*clear MAC and VLAN settings*/
1018                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1019                 for (i = 0; i < 6; i++)
1020                         vdev->mac_address.addr_bytes[i] = 0;
1021
1022                 vdev->vlan_tag = 0;
1023
1024                 /*Clear out the receive buffers*/
1025                 rx_count = rte_eth_rx_burst(ports[0],
1026                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1027
1028                 while (rx_count) {
1029                         for (i = 0; i < rx_count; i++)
1030                                 rte_pktmbuf_free(pkts_burst[i]);
1031
1032                         rx_count = rte_eth_rx_burst(ports[0],
1033                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1034                 }
1035
1036                 vdev->ready = DEVICE_MAC_LEARNING;
1037         }
1038 }
1039
1040 /*
1041  * Check if the packet destination MAC address is for a local device. If so, put
1042  * the packet on that device's RX queue. If not, return.
1043  */
1044 static inline int __attribute__((always_inline))
1045 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1046 {
1047         struct virtio_net_data_ll *dev_ll;
1048         struct ether_hdr *pkt_hdr;
1049         uint64_t ret = 0;
1050         struct virtio_net *dev = vdev->dev;
1051         struct virtio_net *tdev; /* destination virtio device */
1052
1053         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1054
1055         /*get the used devices list*/
1056         dev_ll = ll_root_used;
1057
1058         while (dev_ll != NULL) {
1059                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1060                                           &dev_ll->vdev->mac_address)) {
1061
1062                         /* Drop the packet if the TX packet is destined for the TX device. */
1063                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1064                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1065                                                         dev->device_fh);
1066                                 return 0;
1067                         }
1068                         tdev = dev_ll->vdev->dev;
1069
1070
1071                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1072
1073                         if (unlikely(dev_ll->vdev->remove)) {
1074                                 /*drop the packet if the device is marked for removal*/
1075                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1076                         } else {
1077                                 /*send the packet to the local virtio device*/
1078                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1079                                 if (enable_stats) {
1080                                         rte_atomic64_add(
1081                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1082                                         1);
1083                                         rte_atomic64_add(
1084                                         &dev_statistics[tdev->device_fh].rx_atomic,
1085                                         ret);
1086                                         dev_statistics[dev->device_fh].tx_total++;
1087                                         dev_statistics[dev->device_fh].tx += ret;
1088                                 }
1089                         }
1090
1091                         return 0;
1092                 }
1093                 dev_ll = dev_ll->next;
1094         }
1095
1096         return -1;
1097 }
1098
1099 /*
1100  * Check if the destination MAC of a packet belongs to a local VM,
1101  * and if it does, get its VLAN tag and the length offset.
1102  */
1103 static inline int __attribute__((always_inline))
1104 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1105         uint32_t *offset, uint16_t *vlan_tag)
1106 {
1107         struct virtio_net_data_ll *dev_ll = ll_root_used;
1108         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1109
1110         while (dev_ll != NULL) {
1111                 if ((dev_ll->vdev->ready == DEVICE_RX)
1112                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1113                 &dev_ll->vdev->mac_address)) {
1114                         /*
1115                          * Drop the packet if the TX packet is
1116                          * destined for the TX device.
1117                          */
1118                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1119                                 LOG_DEBUG(VHOST_DATA,
1120                                 "(%"PRIu64") TX: Source and destination"
1121                                 " MAC addresses are the same. Dropping "
1122                                 "packet.\n",
1123                                 dev_ll->vdev->dev->device_fh);
1124                                 return -1;
1125                         }
1126
1127                         /*
1128                          * HW VLAN strip reduces the packet length by the
1129                          * length of the VLAN tag, so the packet length
1130                          * needs to be restored by adding it back.
1131                          */
1132                         *offset = VLAN_HLEN;
1133                         *vlan_tag =
1134                         (uint16_t)
1135                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1136
1137                         LOG_DEBUG(VHOST_DATA,
1138                         "(%"PRIu64") TX: pkt to local VM device id:"
1139                         "(%"PRIu64") vlan tag: %d.\n",
1140                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1141                         (int)*vlan_tag);
1142
1143                         break;
1144                 }
1145                 dev_ll = dev_ll->next;
1146         }
1147         return 0;
1148 }
1149
1150 static uint16_t
1151 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1152 {
1153         if (ol_flags & PKT_TX_IPV4)
1154                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1155         else /* assume ethertype == ETHER_TYPE_IPv6 */
1156                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1157 }
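/*
 * The pseudo-header checksum computed above is what NICs performing
 * full L4 checksum offload expect to find in the checksum field before
 * they fill in the final value; see rte_ipv4_phdr_cksum() and
 * rte_ipv6_phdr_cksum() in rte_ip.h.
 */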
1158
1159 static void virtio_tx_offload(struct rte_mbuf *m)
1160 {
1161         void *l3_hdr;
1162         struct ipv4_hdr *ipv4_hdr = NULL;
1163         struct tcp_hdr *tcp_hdr = NULL;
1164         struct udp_hdr *udp_hdr = NULL;
1165         struct sctp_hdr *sctp_hdr = NULL;
1166         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1167
1168         l3_hdr = (char *)eth_hdr + m->l2_len;
1169
1170         if (m->tso_segsz != 0) {
1171                 ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
1172                 tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
1173                 m->ol_flags |= PKT_TX_IP_CKSUM;
1174                 ipv4_hdr->hdr_checksum = 0;
1175                 tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1176                 return;
1177         }
1178
1179         if (m->ol_flags & PKT_TX_L4_MASK) {
1180                 switch (m->ol_flags & PKT_TX_L4_MASK) {
1181                 case PKT_TX_TCP_CKSUM:
1182                         tcp_hdr = (struct tcp_hdr *)
1183                                         ((char *)l3_hdr + m->l3_len);
1184                         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1185                         break;
1186                 case PKT_TX_UDP_CKSUM:
1187                         udp_hdr = (struct udp_hdr *)
1188                                         ((char *)l3_hdr + m->l3_len);
1189                         udp_hdr->dgram_cksum = get_psd_sum(l3_hdr, m->ol_flags);
1190                         break;
1191                 case PKT_TX_SCTP_CKSUM:
1192                         sctp_hdr = (struct sctp_hdr *)
1193                                         ((char *)l3_hdr + m->l3_len);
1194                         sctp_hdr->cksum = 0;
1195                         break;
1196                 default:
1197                         break;
1198                 }
1199         }
1200 }
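/*
 * Illustrative precondition (a sketch): virtio_tx_offload() relies on
 * the vhost dequeue path having already filled in the offload fields
 * from the virtio-net header, along the lines of:
 *
 *      m->l2_len = sizeof(struct ether_hdr);
 *      m->l3_len = sizeof(struct ipv4_hdr);
 *      m->ol_flags |= PKT_TX_IPV4 | PKT_TX_TCP_CKSUM;
 */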
1201
1202 /*
1203  * This function routes the TX packet to the correct interface. This may be a local device
1204  * or the physical port.
1205  */
1206 static inline void __attribute__((always_inline))
1207 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1208 {
1209         struct mbuf_table *tx_q;
1210         struct rte_mbuf **m_table;
1211         unsigned len, ret, offset = 0;
1212         const uint16_t lcore_id = rte_lcore_id();
1213         struct virtio_net *dev = vdev->dev;
1214         struct ether_hdr *nh;
1215
1216         /*check if destination is local VM*/
1217         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1218                 rte_pktmbuf_free(m);
1219                 return;
1220         }
1221
1222         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1223                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1224                         rte_pktmbuf_free(m);
1225                         return;
1226                 }
1227         }
1228
1229         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1230
1231         /*Add packet to the port tx queue*/
1232         tx_q = &lcore_tx_queue[lcore_id];
1233         len = tx_q->len;
1234
1235         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1236         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1237                 /* Guest has inserted the vlan tag. */
1238                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1239                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1240                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1241                         (vh->vlan_tci != vlan_tag_be))
1242                         vh->vlan_tci = vlan_tag_be;
1243         } else {
1244                 m->ol_flags |= PKT_TX_VLAN_PKT;
1245
1246                 /*
1247                  * Find the right seg to adjust the data len when offset is
1248                  * bigger than tail room size.
1249                  */
1250                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1251                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1252                                 m->data_len += offset;
1253                         else {
1254                                 struct rte_mbuf *seg = m;
1255
1256                                 while ((seg->next != NULL) &&
1257                                         (offset > rte_pktmbuf_tailroom(seg)))
1258                                         seg = seg->next;
1259
1260                                 seg->data_len += offset;
1261                         }
1262                         m->pkt_len += offset;
1263                 }
1264
1265                 m->vlan_tci = vlan_tag;
1266         }
1267
1268         if ((m->ol_flags & PKT_TX_L4_MASK) || (m->ol_flags & PKT_TX_TCP_SEG))
1269                 virtio_tx_offload(m);
1270
1271         tx_q->m_table[len] = m;
1272         len++;
1273         if (enable_stats) {
1274                 dev_statistics[dev->device_fh].tx_total++;
1275                 dev_statistics[dev->device_fh].tx++;
1276         }
1277
1278         if (unlikely(len == MAX_PKT_BURST)) {
1279                 m_table = (struct rte_mbuf **)tx_q->m_table;
1280                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1281                 /* Free any buffers not handled by TX and update the port stats. */
1282                 if (unlikely(ret < len)) {
1283                         do {
1284                                 rte_pktmbuf_free(m_table[ret]);
1285                         } while (++ret < len);
1286                 }
1287
1288                 len = 0;
1289         }
1290
1291         tx_q->len = len;
1292         return;
1293 }
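
/*
 * Note that virtio_tx_route() only queues the packet: rte_eth_tx_burst() is
 * called either when the per-lcore queue reaches MAX_PKT_BURST (above) or
 * when the drain timer in switch_worker() fires, so latency stays bounded by
 * BURST_TX_DRAIN_US even at low packet rates.
 */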
1294 /*
1295  * This function is called by each data core. It handles all RX/TX registered with the
1296  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1297  * with all devices in the main linked list.
1298  */
1299 static int
1300 switch_worker(void *arg)
1301 {
1302         struct rte_mempool *mbuf_pool = arg;
1303         struct virtio_net *dev = NULL;
1304         struct vhost_dev *vdev = NULL;
1305         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1306         struct virtio_net_data_ll *dev_ll;
1307         struct mbuf_table *tx_q;
1308         volatile struct lcore_ll_info *lcore_ll;
1309         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1310         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1311         unsigned ret, i;
1312         const uint16_t lcore_id = rte_lcore_id();
1313         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1314         uint16_t rx_count = 0;
1315         uint16_t tx_count;
1316         uint32_t retry = 0;
1317
1318         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1319         lcore_ll = lcore_info[lcore_id].lcore_ll;
1320         prev_tsc = 0;
1321
1322         tx_q = &lcore_tx_queue[lcore_id];
1323         for (i = 0; i < num_cores; i++) {
1324                 if (lcore_ids[i] == lcore_id) {
1325                         tx_q->txq_id = i;
1326                         break;
1327                 }
1328         }
1329
1330         while (1) {
1331                 cur_tsc = rte_rdtsc();
1332                 /*
1333                  * TX burst queue drain
1334                  */
1335                 diff_tsc = cur_tsc - prev_tsc;
1336                 if (unlikely(diff_tsc > drain_tsc)) {
1337
1338                         if (tx_q->len) {
1339                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);
1340
1341                                 /*Tx any packets in the queue*/
1342                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1343                                                                            (struct rte_mbuf **)tx_q->m_table,
1344                                                                            (uint16_t)tx_q->len);
1345                                 if (unlikely(ret < tx_q->len)) {
1346                                         do {
1347                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1348                                         } while (++ret < tx_q->len);
1349                                 }
1350
1351                                 tx_q->len = 0;
1352                         }
1353
1354                         prev_tsc = cur_tsc;
1355
1356                 }
1357
1358                 rte_prefetch0(lcore_ll->ll_root_used);
1359                 /*
1360                  * Inform the configuration core that we have exited the linked list and that no devices are
1361                  * in use if requested.
1362                  */
1363                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1364                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1365
1366                 /*
1367                  * Process devices
1368                  */
1369                 dev_ll = lcore_ll->ll_root_used;
1370
1371                 while (dev_ll != NULL) {
1372                         /*get virtio device ID*/
1373                         vdev = dev_ll->vdev;
1374                         dev = vdev->dev;
1375
1376                         if (unlikely(vdev->remove)) {
1377                                 dev_ll = dev_ll->next;
1378                                 unlink_vmdq(vdev);
1379                                 vdev->ready = DEVICE_SAFE_REMOVE;
1380                                 continue;
1381                         }
1382                         if (likely(vdev->ready == DEVICE_RX)) {
1383                                 /*Handle guest RX*/
1384                                 rx_count = rte_eth_rx_burst(ports[0],
1385                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1386
1387                                 if (rx_count) {
1388                                         /*
1389                                          * If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1390                                          * Note that MAX_PKT_BURST must be less than the virtio queue size.
1391                                          */
1392                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1393                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1394                                                         rte_delay_us(burst_rx_delay_time);
1395                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1396                                                                 break;
1397                                                 }
1398                                         }
1399                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1400                                         if (enable_stats) {
1401                                                 rte_atomic64_add(
1402                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1403                                                 rx_count);
1404                                                 rte_atomic64_add(
1405                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1406                                         }
1407                                         while (likely(rx_count)) {
1408                                                 rx_count--;
1409                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1410                                         }
1411
1412                                 }
1413                         }
1414
1415                         if (likely(!vdev->remove)) {
1416                                 /* Handle guest TX*/
1417                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1418                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1419                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1420                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1421                                                 while (tx_count)
1422                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1423                                         }
1424                                 }
1425                                 while (tx_count)
1426                                         virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1427                         }
1428
1429                         /*move to the next device in the list*/
1430                         dev_ll = dev_ll->next;
1431                 }
1432         }
1433
1434         return 0;
1435 }
1436
1437 /*
1438  * This function gets the number of available ring entries for zero copy rx.
1439  * Only one thread will call this function for a particular virtio device,
1440  * so it is designed as a non-thread-safe function.
1441  */
1442 static inline uint32_t __attribute__((always_inline))
1443 get_available_ring_num_zcp(struct virtio_net *dev)
1444 {
1445         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1446         uint16_t avail_idx;
1447
1448         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1449         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1450 }
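
/*
 * avail->idx and last_used_idx_res are free-running uint16_t counters, so the
 * subtraction above remains correct across 0xffff -> 0 wrap-around; e.g.
 * (uint16_t)(2 - 0xfffe) == 4 available entries.
 */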
1451
1452 /*
1453  * This function gets an available ring index for zero copy rx; it will
1454  * retry 'burst_rx_retry_num' times until it gets enough ring entries.
1455  * Only one thread will call this function for a particular virtio device,
1456  * so it is designed as a non-thread-safe function.
1457  */
1458 static inline uint32_t __attribute__((always_inline))
1459 get_available_ring_index_zcp(struct virtio_net *dev,
1460         uint16_t *res_base_idx, uint32_t count)
1461 {
1462         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1463         uint16_t avail_idx;
1464         uint32_t retry = 0;
1465         uint16_t free_entries;
1466
1467         *res_base_idx = vq->last_used_idx_res;
1468         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1469         free_entries = (avail_idx - *res_base_idx);
1470
1471         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1472                         "avail idx: %d, "
1473                         "res base idx:%d, free entries:%d\n",
1474                         dev->device_fh, avail_idx, *res_base_idx,
1475                         free_entries);
1476
1477         /*
1478          * If retry is enabled and the queue is full then we wait
1479          * and retry to avoid packet loss.
1480          */
1481         if (enable_retry && unlikely(count > free_entries)) {
1482                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1483                         rte_delay_us(burst_rx_delay_time);
1484                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1485                         free_entries = (avail_idx - *res_base_idx);
1486                         if (count <= free_entries)
1487                                 break;
1488                 }
1489         }
1490
1491         /*check that we have enough buffers*/
1492         if (unlikely(count > free_entries))
1493                 count = free_entries;
1494
1495         if (unlikely(count == 0)) {
1496                 LOG_DEBUG(VHOST_DATA,
1497                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1498                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1499                         dev->device_fh, avail_idx,
1500                         *res_base_idx, free_entries);
1501                 return 0;
1502         }
1503
1504         vq->last_used_idx_res = *res_base_idx + count;
1505
1506         return count;
1507 }
1508
1509 /*
1510  * This function puts a descriptor back on to the used list.
1511  */
1512 static inline void __attribute__((always_inline))
1513 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1514 {
1515         uint16_t res_cur_idx = vq->last_used_idx;
1516         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1517         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1518         rte_compiler_barrier();
1519         *(volatile uint16_t *)&vq->used->idx += 1;
1520         vq->last_used_idx += 1;
1521
1522         /* Kick the guest if necessary. */
1523         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1524                 eventfd_write(vq->callfd, (eventfd_t)1);
1525 }
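
/*
 * The compiler barrier above enforces virtio ordering: the used ring entry
 * must be fully written before the guest can observe the incremented
 * used->idx, because the guest reads idx first and then the entries it
 * covers.
 */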
1526
1527 /*
1528  * This function gets an available descriptor from the virtio vring and an
1529  * unattached mbuf from vpool->ring, then attaches them together. It must adjust
1530  * the offset for buff_addr and phys_addr according to the PMD implementation,
1531  * otherwise the frame data may be put in the wrong location in the mbuf.
1532  */
1533 static inline void __attribute__((always_inline))
1534 attach_rxmbuf_zcp(struct virtio_net *dev)
1535 {
1536         uint16_t res_base_idx, desc_idx;
1537         uint64_t buff_addr, phys_addr;
1538         struct vhost_virtqueue *vq;
1539         struct vring_desc *desc;
1540         void *obj = NULL;
1541         struct rte_mbuf *mbuf;
1542         struct vpool *vpool;
1543         hpa_type addr_type;
1544         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1545
1546         vpool = &vpool_array[vdev->vmdq_rx_q];
1547         vq = dev->virtqueue[VIRTIO_RXQ];
1548
1549         do {
1550                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1551                                 1) != 1))
1552                         return;
1553                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1554
1555                 desc = &vq->desc[desc_idx];
1556                 if (desc->flags & VRING_DESC_F_NEXT) {
1557                         desc = &vq->desc[desc->next];
1558                         buff_addr = gpa_to_vva(dev, desc->addr);
1559                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1560                                         &addr_type);
1561                 } else {
1562                         buff_addr = gpa_to_vva(dev,
1563                                         desc->addr + vq->vhost_hlen);
1564                         phys_addr = gpa_to_hpa(vdev,
1565                                         desc->addr + vq->vhost_hlen,
1566                                         desc->len, &addr_type);
1567                 }
1568
1569                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1570                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1571                                 " address found when attaching RX frame buffer"
1572                                 " address!\n", dev->device_fh);
1573                         put_desc_to_used_list_zcp(vq, desc_idx);
1574                         continue;
1575                 }
1576
1577                 /*
1578                  * Check if the frame buffer address from guest crosses
1579                  * sub-region or not.
1580                  */
1581                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1582                         RTE_LOG(ERR, VHOST_DATA,
1583                                 "(%"PRIu64") Frame buffer address cross "
1584                                 "sub-region found when attaching RX frame "
1585                                 "buffer address!\n",
1586                                 dev->device_fh);
1587                         put_desc_to_used_list_zcp(vq, desc_idx);
1588                         continue;
1589                 }
1590         } while (unlikely(phys_addr == 0));
1591
1592         rte_ring_sc_dequeue(vpool->ring, &obj);
1593         mbuf = obj;
1594         if (unlikely(mbuf == NULL)) {
1595                 LOG_DEBUG(VHOST_DATA,
1596                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1597                         "ring_sc_dequeue fail.\n",
1598                         dev->device_fh);
1599                 put_desc_to_used_list_zcp(vq, desc_idx);
1600                 return;
1601         }
1602
1603         if (unlikely(vpool->buf_size > desc->len)) {
1604                 LOG_DEBUG(VHOST_DATA,
1605                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1606                         "length(%d) of descriptor idx: %d less than room "
1607                         "size required: %d\n",
1608                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1609                 put_desc_to_used_list_zcp(vq, desc_idx);
1610                 rte_ring_sp_enqueue(vpool->ring, obj);
1611                 return;
1612         }
1613
1614         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1615         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1616         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1617         mbuf->data_len = desc->len;
1618         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1619
1620         LOG_DEBUG(VHOST_DATA,
1621                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1622                 "descriptor idx:%d\n",
1623                 dev->device_fh, res_base_idx, desc_idx);
1624
1625         __rte_mbuf_raw_free(mbuf);
1626
1627         return;
1628 }
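
/*
 * After attach_rxmbuf_zcp() the mbuf's data buffer aliases guest memory and
 * the guest descriptor index is stashed in the headroom via
 * MBUF_HEADROOM_UINT32(); virtio_dev_rx_zcp() later uses that value to map a
 * received mbuf back to its vring descriptor. The final __rte_mbuf_raw_free()
 * returns the re-targeted mbuf to the mempool the PMD allocates RX buffers
 * from.
 */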
1629
1630 /*
1631  * Detach an attached packet mbuf -
1632  *  - restore original mbuf address and length values.
1633  *  - reset pktmbuf data and data_len to their default values.
1634  *  All other fields of the given packet mbuf will be left intact.
1635  *
1636  * @param m
1637  *   The attached packet mbuf.
1638  */
1639 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1640 {
1641         const struct rte_mempool *mp = m->pool;
1642         void *buf = rte_mbuf_to_baddr(m);
1643         uint32_t buf_ofs;
1644         uint32_t buf_len = mp->elt_size - sizeof(*m);
1645         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1646
1647         m->buf_addr = buf;
1648         m->buf_len = (uint16_t)buf_len;
1649
1650         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1651                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1652         m->data_off = buf_ofs;
1653
1654         m->data_len = 0;
1655 }
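
/*
 * pktmbuf_detach_zcp() is the inverse of attach_rxmbuf_zcp(): it points
 * buf_addr and buf_physaddr back at the mbuf's own embedded buffer (derived
 * from its position in the mempool element) so the mbuf can be recycled once
 * the guest buffer it referenced has been returned.
 */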
1656
1657 /*
1658  * This function is called after packets have been transmitted. It fetches
1659  * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
1660  * also updates the used index and kicks the guest if necessary.
1661  */
1662 static inline uint32_t __attribute__((always_inline))
1663 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1664 {
1665         struct rte_mbuf *mbuf;
1666         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1667         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1668         uint32_t index = 0;
1669         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1670
1671         LOG_DEBUG(VHOST_DATA,
1672                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1673                 "clean is: %d\n",
1674                 dev->device_fh, mbuf_count);
1675         LOG_DEBUG(VHOST_DATA,
1676                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1677                 "clean is: %d\n",
1678                 dev->device_fh, rte_ring_count(vpool->ring));
1679
1680         for (index = 0; index < mbuf_count; index++) {
1681                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1682                 if (likely(MBUF_EXT_MEM(mbuf)))
1683                         pktmbuf_detach_zcp(mbuf);
1684                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1685
1686                 /* Update used index buffer information. */
1687                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1688                 vq->used->ring[used_idx].len = 0;
1689
1690                 used_idx = (used_idx + 1) & (vq->size - 1);
1691         }
1692
1693         LOG_DEBUG(VHOST_DATA,
1694                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1695                 "clean is: %d\n",
1696                 dev->device_fh, rte_mempool_count(vpool->pool));
1697         LOG_DEBUG(VHOST_DATA,
1698                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1699                 "clean is: %d\n",
1700                 dev->device_fh, rte_ring_count(vpool->ring));
1701         LOG_DEBUG(VHOST_DATA,
1702                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1703                 "vq->last_used_idx:%d\n",
1704                 dev->device_fh, vq->last_used_idx);
1705
1706         vq->last_used_idx += mbuf_count;
1707
1708         LOG_DEBUG(VHOST_DATA,
1709                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1710                 "vq->last_used_idx:%d\n",
1711                 dev->device_fh, vq->last_used_idx);
1712
1713         rte_compiler_barrier();
1714
1715         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1716
1717         /* Kick guest if required. */
1718         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1719                 eventfd_write(vq->callfd, (eventfd_t)1);
1720
1721         return 0;
1722 }
1723
1724 /*
1725  * This function is called when a virtio device is destroyed. It fetches mbufs
1726  * from vpool->pool, detaches them and puts them into vpool->ring.
1727  */
1728 static void mbuf_destroy_zcp(struct vpool *vpool)
1729 {
1730         struct rte_mbuf *mbuf = NULL;
1731         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1732
1733         LOG_DEBUG(VHOST_CONFIG,
1734                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1735                 "mbuf_destroy_zcp is: %d\n",
1736                 mbuf_count);
1737         LOG_DEBUG(VHOST_CONFIG,
1738                 "in mbuf_destroy_zcp: mbuf count in ring before "
1739                 "mbuf_destroy_zcp is: %d\n",
1740                 rte_ring_count(vpool->ring));
1741
1742         for (index = 0; index < mbuf_count; index++) {
1743                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1744                 if (likely(mbuf != NULL)) {
1745                         if (likely(MBUF_EXT_MEM(mbuf)))
1746                                 pktmbuf_detach_zcp(mbuf);
1747                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1748                 }
1749         }
1750
1751         LOG_DEBUG(VHOST_CONFIG,
1752                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1753                 "mbuf_destroy_zcp is: %d\n",
1754                 rte_mempool_count(vpool->pool));
1755         LOG_DEBUG(VHOST_CONFIG,
1756                 "in mbuf_destroy_zcp: mbuf count in ring after "
1757                 "mbuf_destroy_zcp is : %d\n",
1758                 rte_ring_count(vpool->ring));
1759 }
1760
1761 /*
1762  * This function enqueues received packets into the guest RX virtqueue and updates the used ring (zero copy path).
1763  */
1764 static inline uint32_t __attribute__((always_inline))
1765 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1766         uint32_t count)
1767 {
1768         struct vhost_virtqueue *vq;
1769         struct vring_desc *desc;
1770         struct rte_mbuf *buff;
1771         /* The virtio_hdr is initialised to 0. */
1772         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1773                 = {{0, 0, 0, 0, 0, 0}, 0};
1774         uint64_t buff_hdr_addr = 0;
1775         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1776         uint32_t head_idx, packet_success = 0;
1777         uint16_t res_cur_idx;
1778
1779         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1780
1781         if (count == 0)
1782                 return 0;
1783
1784         vq = dev->virtqueue[VIRTIO_RXQ];
1785         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1786
1787         res_cur_idx = vq->last_used_idx;
1788         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1789                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1790
1791         /* Retrieve all of the head indexes first to avoid caching issues. */
1792         for (head_idx = 0; head_idx < count; head_idx++)
1793                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1794
1795         /*Prefetch descriptor index. */
1796         rte_prefetch0(&vq->desc[head[packet_success]]);
1797
1798         while (packet_success != count) {
1799                 /* Get descriptor from available ring */
1800                 desc = &vq->desc[head[packet_success]];
1801
1802                 buff = pkts[packet_success];
1803                 LOG_DEBUG(VHOST_DATA,
1804                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1805                         "pkt[%d] descriptor idx: %d\n",
1806                         dev->device_fh, packet_success,
1807                         MBUF_HEADROOM_UINT32(buff));
1808
1809                 PRINT_PACKET(dev,
1810                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1811                         + RTE_PKTMBUF_HEADROOM),
1812                         rte_pktmbuf_data_len(buff), 0);
1813
1814                 /* Buffer address translation for virtio header. */
1815                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1816                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1817
1818                 /*
1819                  * If the descriptors are chained the header and data are
1820                  * placed in separate buffers.
1821                  */
1822                 if (desc->flags & VRING_DESC_F_NEXT) {
1823                         desc->len = vq->vhost_hlen;
1824                         desc = &vq->desc[desc->next];
1825                         desc->len = rte_pktmbuf_data_len(buff);
1826                 } else {
1827                         desc->len = packet_len;
1828                 }
1829
1830                 /* Update used ring with desc information */
1831                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1832                         = head[packet_success];
1833                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1834                         = packet_len;
1835                 res_cur_idx++;
1836                 packet_success++;
1837
1838                 /* A header is required per buffer. */
1839                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1840                         (const void *)&virtio_hdr, vq->vhost_hlen);
1841
1842                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1843
1844                 if (likely(packet_success < count)) {
1845                         /* Prefetch descriptor index. */
1846                         rte_prefetch0(&vq->desc[head[packet_success]]);
1847                 }
1848         }
1849
1850         rte_compiler_barrier();
1851
1852         LOG_DEBUG(VHOST_DATA,
1853                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1854                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1855                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1856
1857         *(volatile uint16_t *)&vq->used->idx += count;
1858         vq->last_used_idx += count;
1859
1860         LOG_DEBUG(VHOST_DATA,
1861                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1862                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1863                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1864
1865         /* Kick the guest if necessary. */
1866         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1867                 eventfd_write(vq->callfd, (eventfd_t)1);
1868
1869         return count;
1870 }
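
/*
 * On this zero copy RX path only the virtio header (vq->vhost_hlen bytes) is
 * written into guest memory; the frame payload already landed in the guest
 * buffer when the NIC DMA'd into the attached mbuf, so no per-packet payload
 * copy is required.
 */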
1871
1872 /*
1873  * This function routes the TX packet to the correct interface.
1874  * This may be a local device or the physical port.
1875  */
1876 static inline void __attribute__((always_inline))
1877 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1878         uint32_t desc_idx, uint8_t need_copy)
1879 {
1880         struct mbuf_table *tx_q;
1881         struct rte_mbuf **m_table;
1882         void *obj = NULL;
1883         struct rte_mbuf *mbuf;
1884         unsigned len, ret, offset = 0;
1885         struct vpool *vpool;
1886         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1887         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1888
1889         /*Add packet to the port tx queue*/
1890         tx_q = &tx_queue_zcp[vmdq_rx_q];
1891         len = tx_q->len;
1892
1893         /* Allocate an mbuf and populate the structure. */
1894         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1895         rte_ring_sc_dequeue(vpool->ring, &obj);
1896         mbuf = obj;
1897         if (unlikely(mbuf == NULL)) {
1898                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1899                 RTE_LOG(ERR, VHOST_DATA,
1900                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1901                         dev->device_fh);
1902                 put_desc_to_used_list_zcp(vq, desc_idx);
1903                 return;
1904         }
1905
1906         if (vm2vm_mode == VM2VM_HARDWARE) {
1907                 /* Avoid using a vlan tag from any vm for an external packet,
1908                  * such as vlan_tags[dev->device_fh]; otherwise it conflicts
1909                  * during pool selection: the MAC address marks it as an external
1910                  * packet that should go to the network, while the vlan tag marks
1911                  * it as a vm2vm packet that should be forwarded to another vm.
1912                  * Hardware cannot resolve such an ambiguity, so the packet is lost.
1913                  */
1914                 vlan_tag = external_pkt_default_vlan_tag;
1915                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1916                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1917                         __rte_mbuf_raw_free(mbuf);
1918                         return;
1919                 }
1920         }
1921
1922         mbuf->nb_segs = m->nb_segs;
1923         mbuf->next = m->next;
1924         mbuf->data_len = m->data_len + offset;
1925         mbuf->pkt_len = mbuf->data_len;
1926         if (unlikely(need_copy)) {
1927                 /* Copy the packet contents to the mbuf. */
1928                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1929                         rte_pktmbuf_mtod(m, void *),
1930                         m->data_len);
1931         } else {
1932                 mbuf->data_off = m->data_off;
1933                 mbuf->buf_physaddr = m->buf_physaddr;
1934                 mbuf->buf_addr = m->buf_addr;
1935         }
1936         mbuf->ol_flags |= PKT_TX_VLAN_PKT;
1937         mbuf->vlan_tci = vlan_tag;
1938         mbuf->l2_len = sizeof(struct ether_hdr);
1939         mbuf->l3_len = sizeof(struct ipv4_hdr);
1940         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1941
1942         tx_q->m_table[len] = mbuf;
1943         len++;
1944
1945         LOG_DEBUG(VHOST_DATA,
1946                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1947                 dev->device_fh,
1948                 mbuf->nb_segs,
1949                 (mbuf->next == NULL) ? "null" : "non-null");
1950
1951         if (enable_stats) {
1952                 dev_statistics[dev->device_fh].tx_total++;
1953                 dev_statistics[dev->device_fh].tx++;
1954         }
1955
1956         if (unlikely(len == MAX_PKT_BURST)) {
1957                 m_table = (struct rte_mbuf **)tx_q->m_table;
1958                 ret = rte_eth_tx_burst(ports[0],
1959                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1960
1961                 /*
1962                  * Free any buffers not handled by TX and update
1963                  * the port stats.
1964                  */
1965                 if (unlikely(ret < len)) {
1966                         do {
1967                                 rte_pktmbuf_free(m_table[ret]);
1968                         } while (++ret < len);
1969                 }
1970
1971                 len = 0;
1972                 txmbuf_clean_zcp(dev, vpool);
1973         }
1974
1975         tx_q->len = len;
1976
1977         return;
1978 }
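
/*
 * The mbuf dequeued from vpool->ring above normally just aliases the guest
 * buffer (data_off, buf_physaddr and buf_addr are copied); rte_memcpy() is
 * only paid when need_copy is set, i.e. when the guest buffer is not usable
 * directly by the NIC because it crosses a physical sub-region.
 */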
1979
1980 /*
1981  * This function TXes all available packets in the virtio TX queue for one
1982  * virtio-net device. If it is the first packet, it learns the MAC address
1983  * and sets up VMDQ.
1984  */
1985 static inline void __attribute__((always_inline))
1986 virtio_dev_tx_zcp(struct virtio_net *dev)
1987 {
1988         struct rte_mbuf m;
1989         struct vhost_virtqueue *vq;
1990         struct vring_desc *desc;
1991         uint64_t buff_addr = 0, phys_addr;
1992         uint32_t head[MAX_PKT_BURST];
1993         uint32_t i;
1994         uint16_t free_entries, packet_success = 0;
1995         uint16_t avail_idx;
1996         uint8_t need_copy = 0;
1997         hpa_type addr_type;
1998         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1999
2000         vq = dev->virtqueue[VIRTIO_TXQ];
2001         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
2002
2003         /* If there are no available buffers then return. */
2004         if (vq->last_used_idx_res == avail_idx)
2005                 return;
2006
2007         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
2008
2009         /* Prefetch available ring to retrieve head indexes. */
2010         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
2011
2012         /* Get the number of free entries in the ring */
2013         free_entries = (avail_idx - vq->last_used_idx_res);
2014
2015         /* Limit to MAX_PKT_BURST. */
2016         free_entries
2017                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2018
2019         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
2020                 dev->device_fh, free_entries);
2021
2022         /* Retrieve all of the head indexes first to avoid caching issues. */
2023         for (i = 0; i < free_entries; i++)
2024                 head[i]
2025                         = vq->avail->ring[(vq->last_used_idx_res + i)
2026                         & (vq->size - 1)];
2027
2028         vq->last_used_idx_res += free_entries;
2029
2030         /* Prefetch descriptor index. */
2031         rte_prefetch0(&vq->desc[head[packet_success]]);
2032         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2033
2034         while (packet_success < free_entries) {
2035                 desc = &vq->desc[head[packet_success]];
2036
2037                 /* Discard first buffer as it is the virtio header */
2038                 desc = &vq->desc[desc->next];
2039
2040                 /* Buffer address translation. */
2041                 buff_addr = gpa_to_vva(dev, desc->addr);
2042                 /* Need to check an extra VLAN_HLEN bytes for inserting the VLAN tag */
2043                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
2044                         &addr_type);
2045
2046                 if (likely(packet_success < (free_entries - 1)))
2047                         /* Prefetch descriptor index. */
2048                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2049
2050                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2051                         RTE_LOG(ERR, VHOST_DATA,
2052                                 "(%"PRIu64") Invalid frame buffer address found"
2053                                 "when TX packets!\n",
2054                                 dev->device_fh);
2055                         packet_success++;
2056                         continue;
2057                 }
2058
2059                 /* Prefetch buffer address. */
2060                 rte_prefetch0((void *)(uintptr_t)buff_addr);
2061
2062                 /*
2063                  * Setup dummy mbuf. This is copied to a real mbuf if
2064                  * transmitted out the physical port.
2065                  */
2066                 m.data_len = desc->len;
2067                 m.nb_segs = 1;
2068                 m.next = NULL;
2069                 m.data_off = 0;
2070                 m.buf_addr = (void *)(uintptr_t)buff_addr;
2071                 m.buf_physaddr = phys_addr;
2072
2073                 /*
2074                  * Check if the frame buffer address from guest crosses
2075                  * sub-region or not.
2076                  */
2077                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2078                         RTE_LOG(ERR, VHOST_DATA,
2079                                 "(%"PRIu64") Frame buffer address cross "
2080                                 "sub-region found when attaching TX frame "
2081                                 "buffer address!\n",
2082                                 dev->device_fh);
2083                         need_copy = 1;
2084                 } else
2085                         need_copy = 0;
2086
2087                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2088
2089                 /*
2090                  * If this is the first received packet we need to learn
2091                  * the MAC and setup VMDQ
2092                  */
2093                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2094                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2095                                 /*
2096                                  * Discard frame if device is scheduled for
2097                                  * removal or a duplicate MAC address is found.
2098                                  */
2099                                 packet_success += free_entries;
2100                                 vq->last_used_idx += packet_success;
2101                                 break;
2102                         }
2103                 }
2104
2105                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2106                 packet_success++;
2107         }
2108 }
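
/*
 * The stack-allocated dummy mbuf above relies on the split-header layout: the
 * first descriptor of each chain carries only the virtio header and is
 * skipped, and the next descriptor is assumed to hold the whole frame. The
 * extra VLAN_HLEN passed to gpa_to_hpa() keeps the lookup valid even after a
 * VLAN tag is inserted.
 */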
2109
2110 /*
2111  * This function is called by each data core. It handles all RX/TX registered
2112  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2113  * addresses are compared with all devices in the main linked list.
2114  */
2115 static int
2116 switch_worker_zcp(__attribute__((unused)) void *arg)
2117 {
2118         struct virtio_net *dev = NULL;
2119         struct vhost_dev  *vdev = NULL;
2120         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2121         struct virtio_net_data_ll *dev_ll;
2122         struct mbuf_table *tx_q;
2123         volatile struct lcore_ll_info *lcore_ll;
2124         const uint64_t drain_tsc
2125                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2126                 * BURST_TX_DRAIN_US;
2127         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2128         unsigned ret;
2129         const uint16_t lcore_id = rte_lcore_id();
2130         uint16_t count_in_ring, rx_count = 0;
2131
2132         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2133
2134         lcore_ll = lcore_info[lcore_id].lcore_ll;
2135         prev_tsc = 0;
2136
2137         while (1) {
2138                 cur_tsc = rte_rdtsc();
2139
2140                 /* TX burst queue drain */
2141                 diff_tsc = cur_tsc - prev_tsc;
2142                 if (unlikely(diff_tsc > drain_tsc)) {
2143                         /*
2144                          * Get mbuf from vpool.pool and detach mbuf and
2145                          * put back into vpool.ring.
2146                          */
2147                         dev_ll = lcore_ll->ll_root_used;
2148                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2149                                 /* Get virtio device ID */
2150                                 vdev = dev_ll->vdev;
2151                                 dev = vdev->dev;
2152
2153                                 if (likely(!vdev->remove)) {
2154                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2155                                         if (tx_q->len) {
2156                                                 LOG_DEBUG(VHOST_DATA,
2157                                                 "TX queue drained after timeout"
2158                                                 " with burst size %u\n",
2159                                                 tx_q->len);
2160
2161                                                 /*
2162                                                  * Tx any packets in the queue
2163                                                  */
2164                                                 ret = rte_eth_tx_burst(
2165                                                         ports[0],
2166                                                         (uint16_t)tx_q->txq_id,
2167                                                         (struct rte_mbuf **)
2168                                                         tx_q->m_table,
2169                                                         (uint16_t)tx_q->len);
2170                                                 if (unlikely(ret < tx_q->len)) {
2171                                                         do {
2172                                                                 rte_pktmbuf_free(
2173                                                                         tx_q->m_table[ret]);
2174                                                         } while (++ret < tx_q->len);
2175                                                 }
2176                                                 tx_q->len = 0;
2177
2178                                                 txmbuf_clean_zcp(dev,
2179                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2180                                         }
2181                                 }
2182                                 dev_ll = dev_ll->next;
2183                         }
2184                         prev_tsc = cur_tsc;
2185                 }
2186
2187                 rte_prefetch0(lcore_ll->ll_root_used);
2188
2189                 /*
2190                  * Inform the configuration core that we have exited the linked
2191                  * list and that no devices are in use if requested.
2192                  */
2193                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2194                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2195
2196                 /* Process devices */
2197                 dev_ll = lcore_ll->ll_root_used;
2198
2199                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2200                         vdev = dev_ll->vdev;
2201                         dev  = vdev->dev;
2202                         if (unlikely(vdev->remove)) {
2203                                 dev_ll = dev_ll->next;
2204                                 unlink_vmdq(vdev);
2205                                 vdev->ready = DEVICE_SAFE_REMOVE;
2206                                 continue;
2207                         }
2208
2209                         if (likely(vdev->ready == DEVICE_RX)) {
2210                                 uint32_t index = vdev->vmdq_rx_q;
2211                                 uint16_t i;
2212                                 count_in_ring =
2213                                         rte_ring_count(vpool_array[index].ring);
2214                                 uint16_t free_entries =
2215                                         (uint16_t)get_available_ring_num_zcp(dev);
2216
2217                                 /*
2218                                  * Attach all mbufs in vpool.ring and put back
2219                                  * into vpool.pool.
2220                                  */
2221                                 for (i = 0;
2222                                      i < RTE_MIN(free_entries,
2223                                              RTE_MIN(count_in_ring, MAX_PKT_BURST));
2224                                      i++)
2225                                         attach_rxmbuf_zcp(dev);
2226
2227                                 /* Handle guest RX */
2228                                 rx_count = rte_eth_rx_burst(ports[0],
2229                                         vdev->vmdq_rx_q, pkts_burst,
2230                                         MAX_PKT_BURST);
2231
2232                                 if (rx_count) {
2233                                         ret_count = virtio_dev_rx_zcp(dev,
2234                                                         pkts_burst, rx_count);
2235                                         if (enable_stats) {
2236                                                 dev_statistics[dev->device_fh].rx_total
2237                                                         += rx_count;
2238                                                 dev_statistics[dev->device_fh].rx
2239                                                         += ret_count;
2240                                         }
2241                                         while (likely(rx_count)) {
2242                                                 rx_count--;
2243                                                 pktmbuf_detach_zcp(
2244                                                         pkts_burst[rx_count]);
2245                                                 rte_ring_sp_enqueue(
2246                                                         vpool_array[index].ring,
2247                                                         (void *)pkts_burst[rx_count]);
2248                                         }
2249                                 }
2250                         }
2251
2252                         if (likely(!vdev->remove))
2253                                 /* Handle guest TX */
2254                                 virtio_dev_tx_zcp(dev);
2255
2256                         /* Move to the next device in the list */
2257                         dev_ll = dev_ll->next;
2258                 }
2259         }
2260
2261         return 0;
2262 }
2263
2264
2265 /*
2266  * Add an entry to a used linked list. A free entry must first be found
2267  * in the free linked list using get_data_ll_free_entry();
2268  */
2269 static void
2270 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2271         struct virtio_net_data_ll *ll_dev)
2272 {
2273         struct virtio_net_data_ll *ll = *ll_root_addr;
2274
2275         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2276         ll_dev->next = NULL;
2277         rte_compiler_barrier();
2278
2279         /* If ll == NULL then this is the first device. */
2280         if (ll) {
2281                 /* Increment to the tail of the linked list. */
2282                 while (ll->next != NULL)
2283                         ll = ll->next;
2284
2285                 ll->next = ll_dev;
2286         } else {
2287                 *ll_root_addr = ll_dev;
2288         }
2289 }
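
/*
 * The barrier in add_data_ll_entry() is what makes these lists safe for
 * lock-free traversal by the data cores: a node's next pointer is guaranteed
 * to read as NULL before the node becomes reachable from the list, so a
 * concurrent reader never follows an uninitialised pointer.
 */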
2290
2291 /*
2292  * Remove an entry from a used linked list. The entry must then be added to
2293  * the free linked list using put_data_ll_free_entry().
2294  */
2295 static void
2296 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2297         struct virtio_net_data_ll *ll_dev,
2298         struct virtio_net_data_ll *ll_dev_last)
2299 {
2300         struct virtio_net_data_ll *ll = *ll_root_addr;
2301
2302         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2303                 return;
2304
2305         if (ll_dev == ll)
2306                 *ll_root_addr = ll_dev->next;
2307         else
2308                 if (likely(ll_dev_last != NULL))
2309                         ll_dev_last->next = ll_dev->next;
2310                 else
2311                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2312 }
2313
2314 /*
2315  * Find and return an entry from the free linked list.
2316  */
2317 static struct virtio_net_data_ll *
2318 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2319 {
2320         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2321         struct virtio_net_data_ll *ll_dev;
2322
2323         if (ll_free == NULL)
2324                 return NULL;
2325
2326         ll_dev = ll_free;
2327         *ll_root_addr = ll_free->next;
2328
2329         return ll_dev;
2330 }
2331
2332 /*
2333  * Place an entry back on to the free linked list.
2334  */
2335 static void
2336 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2337         struct virtio_net_data_ll *ll_dev)
2338 {
2339         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2340
2341         if (ll_dev == NULL)
2342                 return;
2343
2344         ll_dev->next = ll_free;
2345         *ll_root_addr = ll_dev;
2346 }
2347
2348 /*
2349  * Creates a linked list of a given size.
2350  */
2351 static struct virtio_net_data_ll *
2352 alloc_data_ll(uint32_t size)
2353 {
2354         struct virtio_net_data_ll *ll_new;
2355         uint32_t i;
2356
2357         /* Malloc and then chain the linked list. */
2358         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2359         if (ll_new == NULL) {
2360                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2361                 return NULL;
2362         }
2363
2364         for (i = 0; i < size - 1; i++) {
2365                 ll_new[i].vdev = NULL;
2366                 ll_new[i].next = &ll_new[i+1];
2367         }
2368         ll_new[i].next = NULL;
2369
2370         return ll_new;
2371 }
2372
2373 /*
2374  * Create the main linked list along with each individual cores linked list. A used and a free list
2375  * are created to manage entries.
2376  */
2377 static int
2378 init_data_ll(void)
2379 {
2380         int lcore;
2381
2382         RTE_LCORE_FOREACH_SLAVE(lcore) {
2383                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2384                 if (lcore_info[lcore].lcore_ll == NULL) {
2385                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2386                         return -1;
2387                 }
2388
2389                 lcore_info[lcore].lcore_ll->device_num = 0;
2390                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2391                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2392                 if (num_devices % num_switching_cores)
2393                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2394                 else
2395                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2396         }
2397
2398         /* Allocate devices up to a maximum of MAX_DEVICES. */
2399         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2400
2401         return 0;
2402 }
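
/*
 * Each worker core thus receives a free list of
 * ceil(num_devices / num_switching_cores) entries; e.g. with 5 devices and
 * 2 switching cores every core can host up to 3 devices.
 */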
2403
2404 /*
2405  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2406  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2407  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2408  */
2409 static void
2410 destroy_device(volatile struct virtio_net *dev)
2411 {
2412         struct virtio_net_data_ll *ll_lcore_dev_cur;
2413         struct virtio_net_data_ll *ll_main_dev_cur;
2414         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2415         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2416         struct vhost_dev *vdev;
2417         int lcore;
2418
2419         dev->flags &= ~VIRTIO_DEV_RUNNING;
2420
2421         vdev = (struct vhost_dev *)dev->priv;
2422         /* Set the remove flag. */
2423         vdev->remove = 1;
2424         while (vdev->ready != DEVICE_SAFE_REMOVE) {
2425                 rte_pause();
2426         }
2427
2428         /* Search for entry to be removed from lcore ll */
2429         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2430         while (ll_lcore_dev_cur != NULL) {
2431                 if (ll_lcore_dev_cur->vdev == vdev) {
2432                         break;
2433                 } else {
2434                         ll_lcore_dev_last = ll_lcore_dev_cur;
2435                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2436                 }
2437         }
2438
2439         if (ll_lcore_dev_cur == NULL) {
2440                 RTE_LOG(ERR, VHOST_CONFIG,
2441                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2442                         dev->device_fh);
2443                 return;
2444         }
2445
2446         /* Search for entry to be removed from main ll */
2447         ll_main_dev_cur = ll_root_used;
2448         ll_main_dev_last = NULL;
2449         while (ll_main_dev_cur != NULL) {
2450                 if (ll_main_dev_cur->vdev == vdev) {
2451                         break;
2452                 } else {
2453                         ll_main_dev_last = ll_main_dev_cur;
2454                         ll_main_dev_cur = ll_main_dev_cur->next;
2455                 }
2456         }
2457
2458         /* Remove entries from the lcore and main ll. */
2459         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2460         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2461
2462         /* Set the dev_removal_flag on each lcore. */
2463         RTE_LCORE_FOREACH_SLAVE(lcore) {
2464                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2465         }
2466
2467         /*
2468          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2469          * they can no longer access the device removed from the linked lists and that the devices
2470          * are no longer in use.
2471          */
2472         RTE_LCORE_FOREACH_SLAVE(lcore) {
2473                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2474                         rte_pause();
2475                 }
2476         }
2477
2478         /* Add the entries back to the lcore and main free ll.*/
2479         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2480         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2481
2482         /* Decrement number of device on the lcore. */
2483         lcore_info[vdev->coreid].lcore_ll->device_num--;
2484
2485         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2486
2487         if (zero_copy) {
2488                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2489
2490                 /* Stop the RX queue. */
2491                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2492                         LOG_DEBUG(VHOST_CONFIG,
2493                                 "(%"PRIu64") In destroy_device: Failed to stop "
2494                                 "rx queue:%d\n",
2495                                 dev->device_fh,
2496                                 vdev->vmdq_rx_q);
2497                 }
2498
2499                 LOG_DEBUG(VHOST_CONFIG,
2500                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2501                         "mempool back to ring for RX queue: %d\n",
2502                         dev->device_fh, vdev->vmdq_rx_q);
2503
2504                 mbuf_destroy_zcp(vpool);
2505
2506                 /* Stop the TX queue. */
2507                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2508                         LOG_DEBUG(VHOST_CONFIG,
2509                                 "(%"PRIu64") In destroy_device: Failed to "
2510                                 "stop tx queue:%d\n",
2511                                 dev->device_fh, vdev->vmdq_rx_q);
2512                 }
2513
2514                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2515
2516                 LOG_DEBUG(VHOST_CONFIG,
2517                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2518                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2519                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2520                         dev->device_fh);
2521
2522                 mbuf_destroy_zcp(vpool);
2523                 rte_free(vdev->regions_hpa);
2524         }
2525         rte_free(vdev);
2526
2527 }
2528
2529 /*
2530  * Calculate the number of physical-address discontinuities (i.e. the extra
2531  * sub-regions needed) within one region whose vhost virtual address range is
2532  * contiguous. The region starts at vva_start and spans 'size' bytes.
2533  */
2534 static uint32_t
2535 check_hpa_regions(uint64_t vva_start, uint64_t size)
2536 {
2537         uint32_t i, nregions = 0, page_size = getpagesize();
2538         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2539         if (vva_start % page_size) {
2540                 LOG_DEBUG(VHOST_CONFIG,
2541                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2542                         "has remainder\n",
2543                         (void *)(uintptr_t)vva_start, page_size);
2544                 return 0;
2545         }
2546         if (size % page_size) {
2547                 LOG_DEBUG(VHOST_CONFIG,
2548                         "in check_hpa_regions: "
2549                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2550                         size, page_size);
2551                 return 0;
2552         }
2553         for (i = 0; i < size - page_size; i = i + page_size) {
2554                 cur_phys_addr
2555                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2556                 next_phys_addr = rte_mem_virt2phy(
2557                         (void *)(uintptr_t)(vva_start + i + page_size));
2558                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2559                         ++nregions;
2560                         LOG_DEBUG(VHOST_CONFIG,
2561                                 "in check_hpa_regions: hpa behind hva addr:(%p) is "
2562                                 "not contiguous with hpa behind hva addr:(%p), diff:%d\n",
2563                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2564                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2565                                 + page_size), page_size);
2566                         LOG_DEBUG(VHOST_CONFIG,
2567                                 "in check_hpa_regions: hpa addr:(%p) is not "
2568                                 "contiguous with hpa addr:(%p), "
2569                                 "diff:(%"PRIu64")\n",
2570                                 (void *)(uintptr_t)cur_phys_addr,
2571                                 (void *)(uintptr_t)next_phys_addr,
2572                                 (next_phys_addr-cur_phys_addr));
2573                 }
2574         }
2575         return nregions;
2576 }
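/*
 * Worked example (hypothetical addresses, 4KB pages): a three-page region
 * whose pages map to host physical addresses P, P + 4K and Q, with
 * Q != P + 8K, has one HPA discontinuity, so check_hpa_regions() returns 1
 * and the region splits into two sub-regions: [P, P + 8K) and [Q, Q + 4K).
 */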
2577
2578 /*
2579  * Divide each region whose vhost virtual address range is contiguous into
2580  * sub-regions such that the physical addresses within each sub-region are
2581  * contiguous, and fill the offset (to GPA), size and other information of
2582  * each sub-region into regions_hpa.
2583  */
2584 static uint32_t
2585 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2586 {
2587         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2588         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2589
2590         if (mem_region_hpa == NULL)
2591                 return 0;
2592
2593         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2594                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2595                         virtio_memory->regions[regionidx].address_offset;
2596                 mem_region_hpa[regionidx_hpa].guest_phys_address
2597                         = virtio_memory->regions[regionidx].guest_phys_address;
2598                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2599                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2600                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2601                 LOG_DEBUG(VHOST_CONFIG,
2602                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2603                         regionidx_hpa,
2604                         (void *)(uintptr_t)
2605                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2606                 LOG_DEBUG(VHOST_CONFIG,
2607                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2608                         regionidx_hpa,
2609                         (void *)(uintptr_t)
2610                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2611                 for (i = 0, k = 0;
2612                         i < virtio_memory->regions[regionidx].memory_size -
2613                                 page_size;
2614                         i += page_size) {
2615                         cur_phys_addr = rte_mem_virt2phy(
2616                                         (void *)(uintptr_t)(vva_start + i));
2617                         next_phys_addr = rte_mem_virt2phy(
2618                                         (void *)(uintptr_t)(vva_start +
2619                                         i + page_size));
2620                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2621                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2622                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2623                                         k + page_size;
2624                                 mem_region_hpa[regionidx_hpa].memory_size
2625                                         = k + page_size;
2626                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2627                                         "phys addr end  [%d]:(%p)\n",
2628                                         regionidx_hpa,
2629                                         (void *)(uintptr_t)
2630                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2631                                 LOG_DEBUG(VHOST_CONFIG,
2632                                         "in fill_hpa_regions: guest phys addr "
2633                                         "size [%d]:(%p)\n",
2634                                         regionidx_hpa,
2635                                         (void *)(uintptr_t)
2636                                         (mem_region_hpa[regionidx_hpa].memory_size));
2637                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2638                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2639                                 ++regionidx_hpa;
2640                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2641                                         next_phys_addr -
2642                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2643                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2644                                         " phys addr start[%d]:(%p)\n",
2645                                         regionidx_hpa,
2646                                         (void *)(uintptr_t)
2647                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2648                                 LOG_DEBUG(VHOST_CONFIG,
2649                                         "in fill_hpa_regions: host  phys addr "
2650                                         "start[%d]:(%p)\n",
2651                                         regionidx_hpa,
2652                                         (void *)(uintptr_t)
2653                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2654                                 k = 0;
2655                         } else {
2656                                 k += page_size;
2657                         }
2658                 }
2659                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2660                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2661                         + k + page_size;
2662                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2663                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2664                         "[%d]:(%p)\n", regionidx_hpa,
2665                         (void *)(uintptr_t)
2666                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2667                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2668                         "[%d]:(%p)\n", regionidx_hpa,
2669                         (void *)(uintptr_t)
2670                         (mem_region_hpa[regionidx_hpa].memory_size));
2671                 ++regionidx_hpa;
2672         }
2673         return regionidx_hpa;
2674 }
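/*
 * The table built above reduces GPA -> HPA translation to a linear scan plus
 * an addition. A minimal lookup sketch (illustrative only, not part of this
 * example; 'regions' and 'nregions' stand for the filled array and the value
 * returned by fill_hpa_memory_regions()):
 *
 *     static uint64_t
 *     gpa_to_hpa_sketch(struct virtio_memory_regions_hpa *regions,
 *                     uint32_t nregions, uint64_t gpa)
 *     {
 *             uint32_t i;
 *
 *             for (i = 0; i < nregions; i++)
 *                     if (gpa >= regions[i].guest_phys_address &&
 *                         gpa < regions[i].guest_phys_address_end)
 *                             return gpa + regions[i].host_phys_addr_offset;
 *             return 0;
 *     }
 */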
2675
2676 /*
2677  * A new device is added to a data core. First the device is added to the main
2678  * linked list and then allocated to a specific data core.
2679  */
2680 static int
2681 new_device (struct virtio_net *dev)
2682 {
2683         struct virtio_net_data_ll *ll_dev;
2684         int lcore, core_add = 0;
2685         uint32_t device_num_min = num_devices;
2686         struct vhost_dev *vdev;
2687         uint32_t regionidx;
2688
2689         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2690         if (vdev == NULL) {
2691                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2692                         dev->device_fh);
2693                 return -1;
2694         }
2695         vdev->dev = dev;
2696         dev->priv = vdev;
2697
2698         if (zero_copy) {
2699                 vdev->nregions_hpa = dev->mem->nregions;
2700                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2701                         vdev->nregions_hpa
2702                                 += check_hpa_regions(
2703                                         dev->mem->regions[regionidx].guest_phys_address
2704                                         + dev->mem->regions[regionidx].address_offset,
2705                                         dev->mem->regions[regionidx].memory_size);
2706
2707                 }
2708
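                /*
                 * nregions_hpa now holds one entry per base region plus one
                 * extra entry per HPA discontinuity counted above, i.e. the
                 * number of physically contiguous sub-regions needed.
                 */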
2709                 vdev->regions_hpa = rte_calloc("vhost hpa region",
2710                                                vdev->nregions_hpa,
2711                                                sizeof(struct virtio_memory_regions_hpa),
2712                                                RTE_CACHE_LINE_SIZE);
2713                 if (vdev->regions_hpa == NULL) {
2714                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2715                         rte_free(vdev);
2716                         return -1;
2717                 }
2718
2719
2720                 if (fill_hpa_memory_regions(
2721                         vdev->regions_hpa, dev->mem
2722                         ) != vdev->nregions_hpa) {
2723
2724                         RTE_LOG(ERR, VHOST_CONFIG,
2725                                 "hpa memory regions number mismatch: "
2726                                 "[%d]\n", vdev->nregions_hpa);
2727                         rte_free(vdev->regions_hpa);
2728                         rte_free(vdev);
2729                         return -1;
2730                 }
2731         }
2732
2733
2734         /* Add device to main ll */
2735         ll_dev = get_data_ll_free_entry(&ll_root_free);
2736         if (ll_dev == NULL) {
2737                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2738                         "of %d devices per core has been reached\n",
2739                         dev->device_fh, num_devices);
2740                 if (vdev->regions_hpa)
2741                         rte_free(vdev->regions_hpa);
2742                 rte_free(vdev);
2743                 return -1;
2744         }
2745         ll_dev->vdev = vdev;
2746         add_data_ll_entry(&ll_root_used, ll_dev);
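        /*
         * Queue mapping (note inferred from the VMDq setup): each device owns
         * one VMDq pool, and its RX traffic arrives on the first queue of
         * that pool, offset by vmdq_queue_base into the port's queue space.
         */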
2747         vdev->vmdq_rx_q = dev->device_fh * queues_per_pool + vmdq_queue_base;
2749
2750         if (zero_copy) {
2751                 uint32_t index = vdev->vmdq_rx_q;
2752                 uint32_t count_in_ring, i;
2753                 struct mbuf_table *tx_q;
2754
2755                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2756
2757                 LOG_DEBUG(VHOST_CONFIG,
2758                         "(%"PRIu64") in new_device: mbuf count in mempool "
2759                         "before attach is: %d\n",
2760                         dev->device_fh,
2761                         rte_mempool_count(vpool_array[index].pool));
2762                 LOG_DEBUG(VHOST_CONFIG,
2763                         "(%"PRIu64") in new_device: mbuf count in ring "
2764                         "before attach is: %d\n",
2765                         dev->device_fh, count_in_ring);
2766
2767                 /*
2768                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2769                  */
2770                 for (i = 0; i < count_in_ring; i++)
2771                         attach_rxmbuf_zcp(dev);
2772
2773                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2774                         "mempool after attach is: %d\n",
2775                         dev->device_fh,
2776                         rte_mempool_count(vpool_array[index].pool));
2777                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2778                         "ring after attach is: %d\n",
2779                         dev->device_fh,
2780                         rte_ring_count(vpool_array[index].ring));
2781
2782                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2783                 tx_q->txq_id = vdev->vmdq_rx_q;
2784
2785                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2786                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2787
2788                         LOG_DEBUG(VHOST_CONFIG,
2789                                 "(%"PRIu64") In new_device: Failed to start "
2790                                 "tx queue:%d\n",
2791                                 dev->device_fh, vdev->vmdq_rx_q);
2792
2793                         mbuf_destroy_zcp(vpool);
2794                         rte_free(vdev->regions_hpa);
2795                         rte_free(vdev);
2796                         return -1;
2797                 }
2798
2799                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2800                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2801
2802                         LOG_DEBUG(VHOST_CONFIG,
2803                                 "(%"PRIu64") In new_device: Failed to start "
2804                                 "rx queue:%d\n",
2805                                 dev->device_fh, vdev->vmdq_rx_q);
2806
2807                         /* Stop the TX queue. */
2808                         if (rte_eth_dev_tx_queue_stop(ports[0],
2809                                 vdev->vmdq_rx_q) != 0) {
2810                                 LOG_DEBUG(VHOST_CONFIG,
2811                                         "(%"PRIu64") In new_device: Failed to "
2812                                         "stop tx queue:%d\n",
2813                                         dev->device_fh, vdev->vmdq_rx_q);
2814                         }
2815
2816                         mbuf_destroy_zcp(vpool);
2817                         rte_free(vdev->regions_hpa);
2818                         rte_free(vdev);
2819                         return -1;
2820                 }
2821
2822         }
2823
2824         /*reset ready flag*/
2825         vdev->ready = DEVICE_MAC_LEARNING;
2826         vdev->remove = 0;
2827
2828         /* Find a suitable lcore to add the device. */
2829         RTE_LCORE_FOREACH_SLAVE(lcore) {
2830                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2831                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2832                         core_add = lcore;
2833                 }
2834         }
2835         /* Add device to lcore ll */
2836         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2837         if (ll_dev == NULL) {
2838                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2839                 vdev->ready = DEVICE_SAFE_REMOVE;
2840                 /* destroy_device() frees vdev and, in zero-copy mode,
2841                  * vdev->regions_hpa, so no further cleanup is needed here. */
2842                 destroy_device(dev);
2843                 return -1;
2844         }
2845         ll_dev->vdev = vdev;
2846         vdev->coreid = core_add;
2847
2848         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2849
2850         /* Initialize device stats */
2851         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2852
2853         /* Disable notifications. */
2854         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2855         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2856         lcore_info[vdev->coreid].lcore_ll->device_num++;
2857         dev->flags |= VIRTIO_DEV_RUNNING;
2858
2859         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2860
2861         return 0;
2862 }
2863
2864 /*
2865  * These callbacks allow devices to be added to the data core when
2866  * configuration has been fully completed.
2867  */
2868 static const struct virtio_net_device_ops virtio_net_device_ops =
2869 {
2870         .new_device =  new_device,
2871         .destroy_device = destroy_device,
2872 };
2873
2874 /*
2875  * This thread wakes up periodically to print statistics if the user has
2876  * enabled them.
2877  */
2878 static void
2879 print_stats(void)
2880 {
2881         struct virtio_net_data_ll *dev_ll;
2882         uint64_t tx_dropped, rx_dropped;
2883         uint64_t tx, tx_total, rx, rx_total;
2884         uint32_t device_fh;
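        /* ANSI escapes: clr is ESC[2J (clear screen); top_left is ESC[1;1H
         * (move the cursor to row 1, column 1). */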
2885         const char clr[] = { 27, '[', '2', 'J', '\0' };
2886         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2887
2888         while(1) {
2889                 sleep(enable_stats);
2890
2891                 /* Clear screen and move to top left */
2892                 printf("%s%s", clr, top_left);
2893
2894                 printf("\nDevice statistics ====================================");
2895
2896                 dev_ll = ll_root_used;
2897                 while (dev_ll != NULL) {
2898                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2899                         tx_total = dev_statistics[device_fh].tx_total;
2900                         tx = dev_statistics[device_fh].tx;
2901                         tx_dropped = tx_total - tx;
2902                         if (zero_copy == 0) {
2903                                 rx_total = rte_atomic64_read(
2904                                         &dev_statistics[device_fh].rx_total_atomic);
2905                                 rx = rte_atomic64_read(
2906                                         &dev_statistics[device_fh].rx_atomic);
2907                         } else {
2908                                 rx_total = dev_statistics[device_fh].rx_total;
2909                                 rx = dev_statistics[device_fh].rx;
2910                         }
2911                         rx_dropped = rx_total - rx;
2912
2913                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2914                                         "\nTX total:            %"PRIu64""
2915                                         "\nTX dropped:          %"PRIu64""
2916                                         "\nTX successful:               %"PRIu64""
2917                                         "\nRX total:            %"PRIu64""
2918                                         "\nRX dropped:          %"PRIu64""
2919                                         "\nRX successful:               %"PRIu64"",
2920                                         device_fh,
2921                                         tx_total,
2922                                         tx_dropped,
2923                                         tx,
2924                                         rx_total,
2925                                         rx_dropped,
2926                                         rx);
2927
2928                         dev_ll = dev_ll->next;
2929                 }
2930                 printf("\n======================================================\n");
2931         }
2932 }
2933
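/*
 * Create the mempool/ring pair backing one zero-copy queue. As used from
 * main() below, indices 0..MAX_QUEUES-1 back the RX queues and indices
 * MAX_QUEUES..2*MAX_QUEUES-1 back the TX queues.
 */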
2934 static void
2935 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2936         char *ring_name, uint32_t nb_mbuf)
2937 {
2938         vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2939                 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2940         if (vpool_array[index].pool != NULL) {
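                /*
                 * An rte_ring holds at most count-1 entries and count must be
                 * a power of two, hence nb_mbuf + 1 rounded up with
                 * rte_align32pow2() so that all nb_mbuf mbufs fit in the ring.
                 */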
2941                 vpool_array[index].ring
2942                         = rte_ring_create(ring_name,
2943                                 rte_align32pow2(nb_mbuf + 1),
2944                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2945                 if (likely(vpool_array[index].ring != NULL)) {
2946                         LOG_DEBUG(VHOST_CONFIG,
2947                                 "in setup_mempool_tbl: mbuf count in "
2948                                 "mempool is: %d\n",
2949                                 rte_mempool_count(vpool_array[index].pool));
2950                         LOG_DEBUG(VHOST_CONFIG,
2951                                 "in setup_mempool_tbl: mbuf count in "
2952                                 "ring is: %d\n",
2953                                 rte_ring_count(vpool_array[index].ring));
2954                 } else {
2955                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2956                                 ring_name);
2957                 }
2958
2959                 /* Need to account for mbuf headroom. */
2960                 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2961         } else {
2962                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2963         }
2964 }
2965
2966 /* When we receive an INT signal, unregister the vhost driver. */
2967 static void
2968 sigint_handler(__rte_unused int signum)
2969 {
2970         /* Unregister vhost driver. */
2971         int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2972         if (ret != 0)
2973                 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2974         exit(0);
2975 }
2976
2977 /*
2978  * Main function: does initialisation and calls the per-lcore functions. The
2979  * vhost (CUSE or user) driver is also registered here to handle its messages.
2980  */
2981 int
2982 main(int argc, char *argv[])
2983 {
2984         struct rte_mempool *mbuf_pool = NULL;
2985         unsigned lcore_id, core_id = 0;
2986         unsigned nb_ports, valid_num_ports;
2987         int ret;
2988         uint8_t portid;
2989         uint16_t queue_id;
2990         static pthread_t tid;
2991         char thread_name[RTE_MAX_THREAD_NAME_LEN];
2992
2993         signal(SIGINT, sigint_handler);
2994
2995         /* init EAL */
2996         ret = rte_eal_init(argc, argv);
2997         if (ret < 0)
2998                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2999         argc -= ret;
3000         argv += ret;
3001
3002         /* parse app arguments */
3003         ret = us_vhost_parse_args(argc, argv);
3004         if (ret < 0)
3005                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
3006
3007         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
3008                 if (rte_lcore_is_enabled(lcore_id))
3009                         lcore_ids[core_id++] = lcore_id;
3010
3011         if (rte_lcore_count() > RTE_MAX_LCORE)
3012                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
3013
3014         /* Set the number of switching cores available. */
3015         num_switching_cores = rte_lcore_count() - 1;
3016
3017         /* Get the number of physical ports. */
3018         nb_ports = rte_eth_dev_count();
3019         if (nb_ports > RTE_MAX_ETHPORTS)
3020                 nb_ports = RTE_MAX_ETHPORTS;
3021
3022         /*
3023          * Update the global variable num_ports and the global array ports,
3024          * and derive valid_num_ports from the number of ports in the system.
3025          */
3026         valid_num_ports = check_ports_num(nb_ports);
3027
3028         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3029                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
3030                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
3031                 return -1;
3032         }
3033
3034         if (zero_copy == 0) {
3035                 /* Create the mbuf pool. */
3036                 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
3037                         NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
3038                         0, MBUF_DATA_SIZE, rte_socket_id());
3039                 if (mbuf_pool == NULL)
3040                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3041
3042                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3043                         vpool_array[queue_id].pool = mbuf_pool;
3044
3045                 if (vm2vm_mode == VM2VM_HARDWARE) {
3046                         /* Enable VT loop back to let L2 switch to do it. */
3047                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3048                         LOG_DEBUG(VHOST_CONFIG,
3049                                 "Enable loop back for L2 switch in vmdq.\n");
3050                 }
3051         } else {
3052                 uint32_t nb_mbuf;
3053                 char pool_name[RTE_MEMPOOL_NAMESIZE];
3054                 char ring_name[RTE_MEMPOOL_NAMESIZE];
3055
3056                 nb_mbuf = num_rx_descriptor
3057                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3058                         + num_switching_cores * MAX_PKT_BURST;
3059
3060                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3061                         snprintf(pool_name, sizeof(pool_name),
3062                                 "rxmbuf_pool_%u", queue_id);
3063                         snprintf(ring_name, sizeof(ring_name),
3064                                 "rxmbuf_ring_%u", queue_id);
3065                         setup_mempool_tbl(rte_socket_id(), queue_id,
3066                                 pool_name, ring_name, nb_mbuf);
3067                 }
3068
3069                 nb_mbuf = num_tx_descriptor
3070                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3071                                 + num_switching_cores * MAX_PKT_BURST;
3072
3073                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3074                         snprintf(pool_name, sizeof(pool_name),
3075                                 "txmbuf_pool_%u", queue_id);
3076                         snprintf(ring_name, sizeof(ring_name),
3077                                 "txmbuf_ring_%u", queue_id);
3078                         setup_mempool_tbl(rte_socket_id(),
3079                                 (queue_id + MAX_QUEUES),
3080                                 pool_name, ring_name, nb_mbuf);
3081                 }
3082
3083                 if (vm2vm_mode == VM2VM_HARDWARE) {
3084                         /* Enable VT loop back to let L2 switch to do it. */
3085                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3086                         LOG_DEBUG(VHOST_CONFIG,
3087                                 "Enable loop back for L2 switch in vmdq.\n");
3088                 }
3089         }
3090         /* Set log level. */
3091         rte_set_log_level(LOG_LEVEL);
3092
3093         /* initialize all ports */
3094         for (portid = 0; portid < nb_ports; portid++) {
3095                 /* skip ports that are not enabled */
3096                 if ((enabled_port_mask & (1 << portid)) == 0) {
3097                         RTE_LOG(INFO, VHOST_PORT,
3098                                 "Skipping disabled port %d\n", portid);
3099                         continue;
3100                 }
3101                 if (port_init(portid) != 0)
3102                         rte_exit(EXIT_FAILURE,
3103                                 "Cannot initialize network ports\n");
3104         }
3105
3106         /* Initialise all linked lists. */
3107         if (init_data_ll() == -1)
3108                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3109
3110         /* Initialize device stats */
3111         memset(&dev_statistics, 0, sizeof(dev_statistics));
3112
3113         /* Enable stats if the user option is set. */
3114         if (enable_stats) {
3115                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3116                 if (ret != 0)
3117                         rte_exit(EXIT_FAILURE,
3118                                 "Cannot create print-stats thread\n");
3119
3120                 /* Set thread_name for aid in debugging.  */
3121                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3122                 ret = rte_thread_setname(tid, thread_name);
3123                 if (ret != 0)
3124                         RTE_LOG(ERR, VHOST_CONFIG,
3125                                 "Cannot set print-stats name\n");
3126         }
3127
3128         /* Launch all data cores. */
3129         if (zero_copy == 0) {
3130                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3131                         rte_eal_remote_launch(switch_worker,
3132                                 mbuf_pool, lcore_id);
3133                 }
3134         } else {
3135                 uint32_t count_in_mempool, index, i;
3136                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3137                         /* For all RX and TX queues. */
3138                         count_in_mempool
3139                                 = rte_mempool_count(vpool_array[index].pool);
3140
3141                         /*
3142                          * Transfer all unattached mbufs from vpool.pool
3143                          * to vpool.ring.
3144                          */
3145                         for (i = 0; i < count_in_mempool; i++) {
3146                                 struct rte_mbuf *mbuf
3147                                         = __rte_mbuf_raw_alloc(
3148                                                 vpool_array[index].pool);
3149                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3150                                                 (void *)mbuf);
3151                         }
3152
3153                         LOG_DEBUG(VHOST_CONFIG,
3154                                 "in main: initial mbuf count in mempool "
3155                                 "is: %d\n", count_in_mempool);
3156                         LOG_DEBUG(VHOST_CONFIG,
3157                                 "in main: initial mbuf count in ring is:"
3158                                 " %d\n",
3159                                 rte_ring_count(vpool_array[index].ring));
3160                 }
3161
3162                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3163                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3164                                 lcore_id);
3165         }
3166
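        /*
         * Without VIRTIO_NET_F_MRG_RXBUF the guest cannot spread one packet
         * across several RX descriptors, so every descriptor must be large
         * enough for a full frame; disabling the feature keeps the datapath
         * to one buffer per packet.
         */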
3167         if (mergeable == 0)
3168                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3169
3170         /* Register vhost(cuse or user) driver to handle vhost messages. */
3171         ret = rte_vhost_driver_register((char *)&dev_basename);
3172         if (ret != 0)
3173                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3174
3175         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3176
3177         /* Start CUSE session. */
3178         rte_vhost_driver_session_start();
3179         return 0;
3180
3181 }