examples/vhost: fix offload settings
[dpdk.git] / examples/vhost/main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55
56 #include "main.h"
57
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64
65 /*
66  * Calculate the number of buffers needed per port
67  */
68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +             \
69                                                         (num_switching_cores*MAX_PKT_BURST) +                   \
70                                                         (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
71                                                         ((num_switching_cores+1)*MBUF_CACHE_SIZE))
72
73 #define MBUF_CACHE_SIZE 128
74 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
75
76 /*
77  * No frame data buffers allocated from the host are required for the zero
78  * copy implementation; the guest allocates the frame data buffers and
79  * vhost uses them directly.
80  */
81 #define VIRTIO_DESCRIPTOR_LEN_ZCP       RTE_MBUF_DEFAULT_DATAROOM
82 #define MBUF_DATA_SIZE_ZCP              RTE_MBUF_DEFAULT_BUF_SIZE
83 #define MBUF_CACHE_SIZE_ZCP 0
84
85 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
86 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
87
88 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
89 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
90
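/* Max RX packet length used when mergeable buffers enable jumbo frames (0x2600 = 9728 bytes). */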
91 #define JUMBO_FRAME_MAX_SIZE    0x2600
92
93 /* State of virtio device. */
94 #define DEVICE_MAC_LEARNING 0
95 #define DEVICE_RX                       1
96 #define DEVICE_SAFE_REMOVE      2
97
98 /* Config_core_flag status definitions. */
99 #define REQUEST_DEV_REMOVAL 1
100 #define ACK_DEV_REMOVAL 0
101
102 /* Configurable number of RX/TX ring descriptors */
103 #define RTE_TEST_RX_DESC_DEFAULT 1024
104 #define RTE_TEST_TX_DESC_DEFAULT 512
105
106 /*
107  * These 2 macros need refining for the legacy and DPDK based front ends:
108  * max vring avail descriptors/entries from guest - MAX_PKT_BURST,
109  * then adjusted to a power of 2.
110  */
111 /*
112  * For legacy front end, 128 descriptors,
113  * half for virtio header, another half for mbuf.
114  */
115 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
116 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
117
118 /* Get first 4 bytes in mbuf headroom. */
119 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
120                 + sizeof(struct rte_mbuf)))
121
122 /* true if x is a power of 2 */
123 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
124
125 #define INVALID_PORT_ID 0xFF
126
127 /* Max number of devices. Limited by vmdq. */
128 #define MAX_DEVICES 64
129
130 /* Size of buffers used for snprintfs. */
131 #define MAX_PRINT_BUFF 6072
132
133 /* Maximum character device basename size. */
134 #define MAX_BASENAME_SZ 10
135
136 /* Maximum long option length for option parsing. */
137 #define MAX_LONG_OPT_SZ 64
138
139 /* Used to compare MAC addresses. */
140 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
141
142 /* Number of descriptors per cacheline. */
143 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
144
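/* True when the mbuf's data buffer lives outside the mbuf itself, i.e. the mbuf is indirect (attached). */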
145 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
146
147 /* mask of enabled ports */
148 static uint32_t enabled_port_mask = 0;
149
150 /* Promiscuous mode */
151 static uint32_t promiscuous;
152
153 /* Number of switching cores enabled */
154 static uint32_t num_switching_cores = 0;
155
156 /* number of devices/queues to support */
157 static uint32_t num_queues = 0;
158 static uint32_t num_devices;
159
160 /*
161  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
162  * descriptors. Disabled by default.
163  */
164 static uint32_t zero_copy;
165 static int mergeable;
166
167 /* Do vlan strip on host, enabled by default */
168 static uint32_t vlan_strip = 1;
169
170 /* number of descriptors to apply */
171 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
172 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
173
174 /* Max number of ring descriptors; ixgbe, i40e and e1000 all support 4096. */
175 #define MAX_RING_DESC 4096
176
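/* Per-queue mempool and mbuf ring used by the zero copy path. */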
177 struct vpool {
178         struct rte_mempool *pool;
179         struct rte_ring *ring;
180         uint32_t buf_size;
181 } vpool_array[MAX_QUEUES+MAX_QUEUES];
182
183 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
184 typedef enum {
185         VM2VM_DISABLED = 0,
186         VM2VM_SOFTWARE = 1,
187         VM2VM_HARDWARE = 2,
188         VM2VM_LAST
189 } vm2vm_type;
190 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
191
192 /* The type of host physical address translated from guest physical address. */
193 typedef enum {
194         PHYS_ADDR_CONTINUOUS = 0,
195         PHYS_ADDR_CROSS_SUBREG = 1,
196         PHYS_ADDR_INVALID = 2,
197         PHYS_ADDR_LAST
198 } hpa_type;
199
200 /* Enable stats. */
201 static uint32_t enable_stats = 0;
202 /* Enable retries on RX. */
203 static uint32_t enable_retry = 1;
204
205 /* Disable TX checksum offload */
206 static uint32_t enable_tx_csum;
207
208 /* Disable TSO offload */
209 static uint32_t enable_tso;
210
211 /* Specify timeout (in useconds) between retries on RX. */
212 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
213 /* Specify the number of retries on RX. */
214 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
215
216 /* Character device basename. Can be set by user. */
217 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
218
219 /* Empty vmdq configuration structure. Filled in programmatically. */
220 static struct rte_eth_conf vmdq_conf_default = {
221         .rxmode = {
222                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
223                 .split_hdr_size = 0,
224                 .header_split   = 0, /**< Header Split disabled */
225                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
226                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
227                 /*
228                  * VLAN strip is necessary for 1G NICs such as the I350;
229                  * it fixes a bug where IPv4 forwarding in the guest could
230                  * not forward packets from one virtio dev to another.
231                  */
232                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
233                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
234                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
235         },
236
237         .txmode = {
238                 .mq_mode = ETH_MQ_TX_NONE,
239         },
240         .rx_adv_conf = {
241                 /*
242                  * should be overridden separately in code with
243                  * appropriate values
244                  */
245                 .vmdq_rx_conf = {
246                         .nb_queue_pools = ETH_8_POOLS,
247                         .enable_default_pool = 0,
248                         .default_pool = 0,
249                         .nb_pool_maps = 0,
250                         .pool_map = {{0, 0},},
251                 },
252         },
253 };
254
255 static unsigned lcore_ids[RTE_MAX_LCORE];
256 static uint8_t ports[RTE_MAX_ETHPORTS];
257 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
258 static uint16_t num_pf_queues, num_vmdq_queues;
259 static uint16_t vmdq_pool_base, vmdq_queue_base;
260 static uint16_t queues_per_pool;
261
262 static const uint16_t external_pkt_default_vlan_tag = 2000;
263 const uint16_t vlan_tags[] = {
264         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
265         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
266         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
267         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
268         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
269         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
270         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
271         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
272 };
273
274 /* ethernet addresses of ports */
275 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
276
277 /* heads for the main used and free linked lists for the data path. */
278 static struct virtio_net_data_ll *ll_root_used = NULL;
279 static struct virtio_net_data_ll *ll_root_free = NULL;
280
281 /* Array of data core structures containing information on individual core linked lists. */
282 static struct lcore_info lcore_info[RTE_MAX_LCORE];
283
284 /* Used for queueing bursts of TX packets. */
285 struct mbuf_table {
286         unsigned len;
287         unsigned txq_id;
288         struct rte_mbuf *m_table[MAX_PKT_BURST];
289 };
290
291 /* TX queue for each data core. */
292 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
293
294 /* TX queue for each virtio device for zero copy. */
295 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
296
297 /* Vlan header struct used to insert vlan tags on TX. */
298 struct vlan_ethhdr {
299         unsigned char   h_dest[ETH_ALEN];
300         unsigned char   h_source[ETH_ALEN];
301         __be16          h_vlan_proto;
302         __be16          h_vlan_TCI;
303         __be16          h_vlan_encapsulated_proto;
304 };
305
306 /* Header lengths. */
307 #define VLAN_HLEN       4
308 #define VLAN_ETH_HLEN   18
309
310 /* Per-device statistics struct */
311 struct device_statistics {
312         uint64_t tx_total;
313         rte_atomic64_t rx_total_atomic;
314         uint64_t rx_total;
315         uint64_t tx;
316         rte_atomic64_t rx_atomic;
317         uint64_t rx;
318 } __rte_cache_aligned;
319 struct device_statistics dev_statistics[MAX_DEVICES];
320
321 /*
322  * Builds up the correct configuration for VMDQ VLAN pool map
323  * according to the pool & queue limits.
324  */
325 static inline int
326 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
327 {
328         struct rte_eth_vmdq_rx_conf conf;
329         struct rte_eth_vmdq_rx_conf *def_conf =
330                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
331         unsigned i;
332
333         memset(&conf, 0, sizeof(conf));
334         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
335         conf.nb_pool_maps = num_devices;
336         conf.enable_loop_back = def_conf->enable_loop_back;
337         conf.rx_mode = def_conf->rx_mode;
338
339         for (i = 0; i < conf.nb_pool_maps; i++) {
340                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
341                 conf.pool_map[i].pools = (1UL << i);
342         }
343
344         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
345         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
346                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
347         return 0;
348 }
349
350 /*
351  * Validate the device number against the max pool number obtained from
352  * dev_info. If the device number is invalid, print an error message and
353  * return -1. Each device must have its own pool.
354  */
355 static inline int
356 validate_num_devices(uint32_t max_nb_devices)
357 {
358         if (num_devices > max_nb_devices) {
359                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
360                 return -1;
361         }
362         return 0;
363 }
364
365 /*
366  * Initialises a given port using global settings and with the rx buffers
367  * coming from the mbuf_pool passed as a parameter.
368  */
369 static inline int
370 port_init(uint8_t port)
371 {
372         struct rte_eth_dev_info dev_info;
373         struct rte_eth_conf port_conf;
374         struct rte_eth_rxconf *rxconf;
375         struct rte_eth_txconf *txconf;
376         int16_t rx_rings, tx_rings;
377         uint16_t rx_ring_size, tx_ring_size;
378         int retval;
379         uint16_t q;
380
381         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
382         rte_eth_dev_info_get (port, &dev_info);
383
384         if (dev_info.max_rx_queues > MAX_QUEUES) {
385                 rte_exit(EXIT_FAILURE,
386                         "please define MAX_QUEUES no less than %u in %s\n",
387                         dev_info.max_rx_queues, __FILE__);
388         }
389
390         rxconf = &dev_info.default_rxconf;
391         txconf = &dev_info.default_txconf;
392         rxconf->rx_drop_en = 1;
393
394         /* Enable vlan offload */
395         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
396
397         /*
398          * Zero copy defers queue RX/TX start to the time when guest
399          * finishes its startup and packet buffers from that guest are
400          * available.
401          */
402         if (zero_copy) {
403                 rxconf->rx_deferred_start = 1;
404                 rxconf->rx_drop_en = 0;
405                 txconf->tx_deferred_start = 1;
406         }
407
408         /* Configure the number of supported virtio devices based on VMDQ limits. */
409         num_devices = dev_info.max_vmdq_pools;
410
411         if (zero_copy) {
412                 rx_ring_size = num_rx_descriptor;
413                 tx_ring_size = num_tx_descriptor;
414                 tx_rings = dev_info.max_tx_queues;
415         } else {
416                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
417                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
418                 tx_rings = (uint16_t)rte_lcore_count();
419         }
420
421         retval = validate_num_devices(MAX_DEVICES);
422         if (retval < 0)
423                 return retval;
424
425         /* Get port configuration. */
426         retval = get_eth_conf(&port_conf, num_devices);
427         if (retval < 0)
428                 return retval;
429         /* NIC queues are divided into pf queues and vmdq queues.  */
430         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
431         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
432         num_vmdq_queues = num_devices * queues_per_pool;
433         num_queues = num_pf_queues + num_vmdq_queues;
434         vmdq_queue_base = dev_info.vmdq_queue_base;
435         vmdq_pool_base  = dev_info.vmdq_pool_base;
436         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
437                 num_pf_queues, num_devices, queues_per_pool);
438
439         if (port >= rte_eth_dev_count()) return -1;
440
441         if (enable_tx_csum == 0)
442                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
443
444         if (enable_tso == 0) {
445                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
446                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
447         }
448
449         rx_rings = (uint16_t)dev_info.max_rx_queues;
450         /* Configure ethernet device. */
451         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452         if (retval != 0)
453                 return retval;
454
455         /* Setup the queues. */
456         for (q = 0; q < rx_rings; q ++) {
457                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458                                                 rte_eth_dev_socket_id(port),
459                                                 rxconf,
460                                                 vpool_array[q].pool);
461                 if (retval < 0)
462                         return retval;
463         }
464         for (q = 0; q < tx_rings; q ++) {
465                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
466                                                 rte_eth_dev_socket_id(port),
467                                                 txconf);
468                 if (retval < 0)
469                         return retval;
470         }
471
472         /* Start the device. */
473         retval  = rte_eth_dev_start(port);
474         if (retval < 0) {
475                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
476                 return retval;
477         }
478
479         if (promiscuous)
480                 rte_eth_promiscuous_enable(port);
481
482         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
483         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
484         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
485                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
486                         (unsigned)port,
487                         vmdq_ports_eth_addr[port].addr_bytes[0],
488                         vmdq_ports_eth_addr[port].addr_bytes[1],
489                         vmdq_ports_eth_addr[port].addr_bytes[2],
490                         vmdq_ports_eth_addr[port].addr_bytes[3],
491                         vmdq_ports_eth_addr[port].addr_bytes[4],
492                         vmdq_ports_eth_addr[port].addr_bytes[5]);
493
494         return 0;
495 }
496
497 /*
498  * Set character device basename.
499  */
500 static int
501 us_vhost_parse_basename(const char *q_arg)
502 {
503         /* Validate and copy the basename string */
504
505         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
506                 return -1;
507         else
508                 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
509
510         return 0;
511 }
512
513 /*
514  * Parse the portmask provided at run time.
515  */
516 static int
517 parse_portmask(const char *portmask)
518 {
519         char *end = NULL;
520         unsigned long pm;
521
522         errno = 0;
523
524         /* parse hexadecimal string */
525         pm = strtoul(portmask, &end, 16);
526         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
527                 return -1;
528
529         if (pm == 0)
530                 return -1;
531
532         return pm;
533
534 }
535
536 /*
537  * Parse num options at run time.
538  */
539 static int
540 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
541 {
542         char *end = NULL;
543         unsigned long num;
544
545         errno = 0;
546
547         /* parse unsigned int string */
548         num = strtoul(q_arg, &end, 10);
549         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
550                 return -1;
551
552         if (num > max_valid_value)
553                 return -1;
554
555         return num;
556
557 }
558
559 /*
560  * Display usage
561  */
562 static void
563 us_vhost_usage(const char *prgname)
564 {
565         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
566         "               --vm2vm [0|1|2]\n"
567         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
568         "               --dev-basename <name>\n"
569         "               --nb-devices ND\n"
570         "               -p PORTMASK: Set mask for ports to be used by application\n"
571         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
572         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
573         "               --rx-retry-delay [0-N]: timeout(in useconds) between retries on RX. This takes effect only if retries on rx are enabled\n"
574         "               --rx-retry-num [0-N]: the number of retries on rx. This takes effect only if retries on rx are enabled\n"
575         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
576         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
577         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
578         "               --dev-basename: The basename to be used for the character device.\n"
579         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
580                         "zero copy\n"
581         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
582                         "used only when zero copy is enabled.\n"
583         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
584                         "used only when zero copy is enabled.\n"
585         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
586         "               --tso [0|1] disable/enable TCP segment offload.\n",
587                prgname);
588 }
589
590 /*
591  * Parse the arguments given in the command line of the application.
592  */
593 static int
594 us_vhost_parse_args(int argc, char **argv)
595 {
596         int opt, ret;
597         int option_index;
598         unsigned i;
599         const char *prgname = argv[0];
600         static struct option long_option[] = {
601                 {"vm2vm", required_argument, NULL, 0},
602                 {"rx-retry", required_argument, NULL, 0},
603                 {"rx-retry-delay", required_argument, NULL, 0},
604                 {"rx-retry-num", required_argument, NULL, 0},
605                 {"mergeable", required_argument, NULL, 0},
606                 {"vlan-strip", required_argument, NULL, 0},
607                 {"stats", required_argument, NULL, 0},
608                 {"dev-basename", required_argument, NULL, 0},
609                 {"zero-copy", required_argument, NULL, 0},
610                 {"rx-desc-num", required_argument, NULL, 0},
611                 {"tx-desc-num", required_argument, NULL, 0},
612                 {"tx-csum", required_argument, NULL, 0},
613                 {"tso", required_argument, NULL, 0},
614                 {NULL, 0, 0, 0},
615         };
616
617         /* Parse command line */
618         while ((opt = getopt_long(argc, argv, "p:P",
619                         long_option, &option_index)) != EOF) {
620                 switch (opt) {
621                 /* Portmask */
622                 case 'p':
623                         enabled_port_mask = parse_portmask(optarg);
624                         if (enabled_port_mask == 0) {
625                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
626                                 us_vhost_usage(prgname);
627                                 return -1;
628                         }
629                         break;
630
631                 case 'P':
632                         promiscuous = 1;
633                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
634                                 ETH_VMDQ_ACCEPT_BROADCAST |
635                                 ETH_VMDQ_ACCEPT_MULTICAST;
636                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
637
638                         break;
639
640                 case 0:
641                         /* Enable/disable vm2vm comms. */
642                         if (!strncmp(long_option[option_index].name, "vm2vm",
643                                 MAX_LONG_OPT_SZ)) {
644                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
645                                 if (ret == -1) {
646                                         RTE_LOG(INFO, VHOST_CONFIG,
647                                                 "Invalid argument for "
648                                                 "vm2vm [0|1|2]\n");
649                                         us_vhost_usage(prgname);
650                                         return -1;
651                                 } else {
652                                         vm2vm_mode = (vm2vm_type)ret;
653                                 }
654                         }
655
656                         /* Enable/disable retries on RX. */
657                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
658                                 ret = parse_num_opt(optarg, 1);
659                                 if (ret == -1) {
660                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
661                                         us_vhost_usage(prgname);
662                                         return -1;
663                                 } else {
664                                         enable_retry = ret;
665                                 }
666                         }
667
668                         /* Enable/disable TX checksum offload. */
669                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
670                                 ret = parse_num_opt(optarg, 1);
671                                 if (ret == -1) {
672                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
673                                         us_vhost_usage(prgname);
674                                         return -1;
675                                 } else
676                                         enable_tx_csum = ret;
677                         }
678
679                         /* Enable/disable TSO offload. */
680                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
681                                 ret = parse_num_opt(optarg, 1);
682                                 if (ret == -1) {
683                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
684                                         us_vhost_usage(prgname);
685                                         return -1;
686                                 } else
687                                         enable_tso = ret;
688                         }
689
690                         /* Specify the retries delay time (in useconds) on RX. */
691                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
692                                 ret = parse_num_opt(optarg, INT32_MAX);
693                                 if (ret == -1) {
694                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
695                                         us_vhost_usage(prgname);
696                                         return -1;
697                                 } else {
698                                         burst_rx_delay_time = ret;
699                                 }
700                         }
701
702                         /* Specify the retries number on RX. */
703                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
704                                 ret = parse_num_opt(optarg, INT32_MAX);
705                                 if (ret == -1) {
706                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
707                                         us_vhost_usage(prgname);
708                                         return -1;
709                                 } else {
710                                         burst_rx_retry_num = ret;
711                                 }
712                         }
713
714                         /* Enable/disable RX mergeable buffers. */
715                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
716                                 ret = parse_num_opt(optarg, 1);
717                                 if (ret == -1) {
718                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
719                                         us_vhost_usage(prgname);
720                                         return -1;
721                                 } else {
722                                         mergeable = !!ret;
723                                         if (ret) {
724                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
725                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
726                                                         = JUMBO_FRAME_MAX_SIZE;
727                                         }
728                                 }
729                         }
730
731                         /* Enable/disable RX VLAN strip on host. */
732                         if (!strncmp(long_option[option_index].name,
733                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
734                                 ret = parse_num_opt(optarg, 1);
735                                 if (ret == -1) {
736                                         RTE_LOG(INFO, VHOST_CONFIG,
737                                                 "Invalid argument for VLAN strip [0|1]\n");
738                                         us_vhost_usage(prgname);
739                                         return -1;
740                                 } else {
741                                         vlan_strip = !!ret;
742                                         vmdq_conf_default.rxmode.hw_vlan_strip =
743                                                 vlan_strip;
744                                 }
745                         }
746
747                         /* Enable/disable stats. */
748                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
749                                 ret = parse_num_opt(optarg, INT32_MAX);
750                                 if (ret == -1) {
751                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
752                                         us_vhost_usage(prgname);
753                                         return -1;
754                                 } else {
755                                         enable_stats = ret;
756                                 }
757                         }
758
759                         /* Set character device basename. */
760                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
761                                 if (us_vhost_parse_basename(optarg) == -1) {
762                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
763                                         us_vhost_usage(prgname);
764                                         return -1;
765                                 }
766                         }
767
768                         /* Enable/disable rx/tx zero copy. */
769                         if (!strncmp(long_option[option_index].name,
770                                 "zero-copy", MAX_LONG_OPT_SZ)) {
771                                 ret = parse_num_opt(optarg, 1);
772                                 if (ret == -1) {
773                                         RTE_LOG(INFO, VHOST_CONFIG,
774                                                 "Invalid argument"
775                                                 " for zero-copy [0|1]\n");
776                                         us_vhost_usage(prgname);
777                                         return -1;
778                                 } else
779                                         zero_copy = ret;
780                         }
781
782                         /* Specify the descriptor number on RX. */
783                         if (!strncmp(long_option[option_index].name,
784                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
785                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
786                                 if ((ret == -1) || (!POWEROF2(ret))) {
787                                         RTE_LOG(INFO, VHOST_CONFIG,
788                                         "Invalid argument for rx-desc-num [0-N], "
789                                         "power of 2 required.\n");
790                                         us_vhost_usage(prgname);
791                                         return -1;
792                                 } else {
793                                         num_rx_descriptor = ret;
794                                 }
795                         }
796
797                         /* Specify the descriptor number on TX. */
798                         if (!strncmp(long_option[option_index].name,
799                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
800                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
801                                 if ((ret == -1) || (!POWEROF2(ret))) {
802                                         RTE_LOG(INFO, VHOST_CONFIG,
803                                         "Invalid argument for tx-desc-num [0-N], "
804                                         "power of 2 required.\n");
805                                         us_vhost_usage(prgname);
806                                         return -1;
807                                 } else {
808                                         num_tx_descriptor = ret;
809                                 }
810                         }
811
812                         break;
813
814                         /* Invalid option - print options. */
815                 default:
816                         us_vhost_usage(prgname);
817                         return -1;
818                 }
819         }
820
821         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
822                 if (enabled_port_mask & (1 << i))
823                         ports[num_ports++] = (uint8_t)i;
824         }
825
826         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
827                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
828                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
829                 return -1;
830         }
831
832         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
833                 RTE_LOG(INFO, VHOST_PORT,
834                         "Vhost zero copy doesn't support software vm2vm, "
835                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
836                 return -1;
837         }
838
839         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
840                 RTE_LOG(INFO, VHOST_PORT,
841                         "Vhost zero copy doesn't support jumbo frames, "
842                         "please specify '--mergeable 0' to disable the "
843                         "mergeable feature.\n");
844                 return -1;
845         }
846
847         return 0;
848 }
849
850 /*
851  * Update the global variable num_ports and the array ports according to the
852  * number of system ports, and return the number of valid ports.
853  */
854 static unsigned check_ports_num(unsigned nb_ports)
855 {
856         unsigned valid_num_ports = num_ports;
857         unsigned portid;
858
859         if (num_ports > nb_ports) {
860                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
861                         num_ports, nb_ports);
862                 num_ports = nb_ports;
863         }
864
865         for (portid = 0; portid < num_ports; portid ++) {
866                 if (ports[portid] >= nb_ports) {
867                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
868                                 ports[portid], (nb_ports - 1));
869                         ports[portid] = INVALID_PORT_ID;
870                         valid_num_ports--;
871                 }
872         }
873         return valid_num_ports;
874 }
875
876 /*
877  * Macro to print out packet contents. Wrapped in debug define so that the
878  * data path is not affected when debug is disabled.
879  */
880 #ifdef DEBUG
881 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
882         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
883         unsigned int index;                                                                                                                                                                                             \
884         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
885                                                                                                                                                                                                                                         \
886         if ((header))                                                                                                                                                                                                   \
887                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
888         else                                                                                                                                                                                                                    \
889                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
890         for (index = 0; index < (size); index++) {                                                                                                                                              \
891                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
892                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
893         }                                                                                                                                                                                                                               \
894         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
895                                                                                                                                                                                                                                         \
896         LOG_DEBUG(VHOST_DATA, "%s", packet);                                                                                                                                                                    \
897 } while(0)
898 #else
899 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
900 #endif
901
902 /*
903  * Function to convert guest physical addresses to vhost physical addresses.
904  * This is used to convert virtio buffer addresses.
905  */
906 static inline uint64_t __attribute__((always_inline))
907 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
908         uint32_t buf_len, hpa_type *addr_type)
909 {
910         struct virtio_memory_regions_hpa *region;
911         uint32_t regionidx;
912         uint64_t vhost_pa = 0;
913
914         *addr_type = PHYS_ADDR_INVALID;
915
916         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
917                 region = &vdev->regions_hpa[regionidx];
918                 if ((guest_pa >= region->guest_phys_address) &&
919                         (guest_pa <= region->guest_phys_address_end)) {
920                         vhost_pa = region->host_phys_addr_offset + guest_pa;
921                         if (likely((guest_pa + buf_len - 1)
922                                 <= region->guest_phys_address_end))
923                                 *addr_type = PHYS_ADDR_CONTINUOUS;
924                         else
925                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
926                         break;
927                 }
928         }
929
930         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
931                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
932                 (void *)(uintptr_t)vhost_pa);
933
934         return vhost_pa;
935 }
936
937 /*
938  * Compares a packet destination MAC address to a device MAC address.
939  */
940 static inline int __attribute__((always_inline))
941 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
942 {
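        /* XOR the two 64-bit loads and mask to the low 48 bits (the 6 MAC bytes). */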
943         return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
944 }
945
946 /*
947  * This function learns the MAC address of the device and registers this along with a
948  * vlan tag to a VMDQ.
949  */
950 static int
951 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
952 {
953         struct ether_hdr *pkt_hdr;
954         struct virtio_net_data_ll *dev_ll;
955         struct virtio_net *dev = vdev->dev;
956         int i, ret;
957
958         /* Learn MAC address of guest device from packet */
959         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
960
961         dev_ll = ll_root_used;
962
963         while (dev_ll != NULL) {
964                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
965                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
966                         return -1;
967                 }
968                 dev_ll = dev_ll->next;
969         }
970
971         for (i = 0; i < ETHER_ADDR_LEN; i++)
972                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
973
974         /* vlan_tag currently uses the device_id. */
975         vdev->vlan_tag = vlan_tags[dev->device_fh];
976
977         /* Print out VMDQ registration info. */
978         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
979                 dev->device_fh,
980                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
981                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
982                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
983                 vdev->vlan_tag);
984
985         /* Register the MAC address. */
986         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
987                                 (uint32_t)dev->device_fh + vmdq_pool_base);
988         if (ret)
989                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
990                                         dev->device_fh);
991
992         /* Enable stripping of the vlan tag as we handle routing. */
993         if (vlan_strip)
994                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
995                         (uint16_t)vdev->vmdq_rx_q, 1);
996
997         /* Set device as ready for RX. */
998         vdev->ready = DEVICE_RX;
999
1000         return 0;
1001 }
1002
1003 /*
1004  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1005  * queue before disabling RX on the device.
1006  */
1007 static inline void
1008 unlink_vmdq(struct vhost_dev *vdev)
1009 {
1010         unsigned i = 0;
1011         unsigned rx_count;
1012         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1013
1014         if (vdev->ready == DEVICE_RX) {
1015                 /*clear MAC and VLAN settings*/
1016                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1017                 for (i = 0; i < 6; i++)
1018                         vdev->mac_address.addr_bytes[i] = 0;
1019
1020                 vdev->vlan_tag = 0;
1021
1022                 /*Clear out the receive buffers*/
1023                 rx_count = rte_eth_rx_burst(ports[0],
1024                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1025
1026                 while (rx_count) {
1027                         for (i = 0; i < rx_count; i++)
1028                                 rte_pktmbuf_free(pkts_burst[i]);
1029
1030                         rx_count = rte_eth_rx_burst(ports[0],
1031                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1032                 }
1033
1034                 vdev->ready = DEVICE_MAC_LEARNING;
1035         }
1036 }
1037
1038 /*
1039  * Check if the packet destination MAC address is for a local device. If so then put
1040  * the packet on that device's RX queue. If not then return.
1041  */
1042 static inline int __attribute__((always_inline))
1043 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1044 {
1045         struct virtio_net_data_ll *dev_ll;
1046         struct ether_hdr *pkt_hdr;
1047         uint64_t ret = 0;
1048         struct virtio_net *dev = vdev->dev;
1049         struct virtio_net *tdev; /* destination virtio device */
1050
1051         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1052
1053         /*get the used devices list*/
1054         dev_ll = ll_root_used;
1055
1056         while (dev_ll != NULL) {
1057                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1058                                           &dev_ll->vdev->mac_address)) {
1059
1060                         /* Drop the packet if the TX packet is destined for the TX device. */
1061                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1062                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1063                                                         dev->device_fh);
1064                                 return 0;
1065                         }
1066                         tdev = dev_ll->vdev->dev;
1067
1068
1069                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1070
1071                         if (unlikely(dev_ll->vdev->remove)) {
1072                                 /*drop the packet if the device is marked for removal*/
1073                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1074                         } else {
1075                                 /*send the packet to the local virtio device*/
1076                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1077                                 if (enable_stats) {
1078                                         rte_atomic64_add(
1079                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1080                                         1);
1081                                         rte_atomic64_add(
1082                                         &dev_statistics[tdev->device_fh].rx_atomic,
1083                                         ret);
1084                                         dev_statistics[dev->device_fh].tx_total++;
1085                                         dev_statistics[dev->device_fh].tx += ret;
1086                                 }
1087                         }
1088
1089                         return 0;
1090                 }
1091                 dev_ll = dev_ll->next;
1092         }
1093
1094         return -1;
1095 }
1096
1097 /*
1098  * Check if the destination MAC of a packet belongs to a local VM,
1099  * and if so get its vlan tag and offset.
1100  */
1101 static inline int __attribute__((always_inline))
1102 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1103         uint32_t *offset, uint16_t *vlan_tag)
1104 {
1105         struct virtio_net_data_ll *dev_ll = ll_root_used;
1106         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1107
1108         while (dev_ll != NULL) {
1109                 if ((dev_ll->vdev->ready == DEVICE_RX)
1110                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1111                 &dev_ll->vdev->mac_address)) {
1112                         /*
1113                          * Drop the packet if the TX packet is
1114                          * destined for the TX device.
1115                          */
1116                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1117                                 LOG_DEBUG(VHOST_DATA,
1118                                 "(%"PRIu64") TX: Source and destination"
1119                                 " MAC addresses are the same. Dropping "
1120                                 "packet.\n",
1121                                 dev_ll->vdev->dev->device_fh);
1122                                 return -1;
1123                         }
1124
1125                         /*
1126                          * HW vlan strip will reduce the packet length
1127                          * by the length of the vlan tag, so we need to
1128                          * restore the packet length by adding it back.
1129                          */
1130                         *offset = VLAN_HLEN;
1131                         *vlan_tag =
1132                         (uint16_t)
1133                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1134
1135                         LOG_DEBUG(VHOST_DATA,
1136                         "(%"PRIu64") TX: pkt to local VM device id:"
1137                         "(%"PRIu64") vlan tag: %d.\n",
1138                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1139                         (int)*vlan_tag);
1140
1141                         break;
1142                 }
1143                 dev_ll = dev_ll->next;
1144         }
1145         return 0;
1146 }
1147
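/*
 * Return the pseudo-header checksum (IPv4 or IPv6) used to seed the TCP
 * checksum when the checksum/TSO work is done by the NIC.
 */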
1148 static uint16_t
1149 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1150 {
1151         if (ol_flags & PKT_TX_IPV4)
1152                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1153         else /* assume ethertype == ETHER_TYPE_IPv6 */
1154                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1155 }
1156
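/*
 * Prepare a TSO packet before handing it to the NIC: request IP checksum
 * offload for IPv4 and seed the TCP checksum with the pseudo-header checksum.
 */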
1157 static void virtio_tx_offload(struct rte_mbuf *m)
1158 {
1159         void *l3_hdr;
1160         struct ipv4_hdr *ipv4_hdr = NULL;
1161         struct tcp_hdr *tcp_hdr = NULL;
1162         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1163
1164         l3_hdr = (char *)eth_hdr + m->l2_len;
1165
1166         if (m->ol_flags & PKT_TX_IPV4) {
1167                 ipv4_hdr = l3_hdr;
1168                 ipv4_hdr->hdr_checksum = 0;
1169                 m->ol_flags |= PKT_TX_IP_CKSUM;
1170         }
1171
1172         tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
1173         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1174 }
1175
1176 /*
1177  * This function routes the TX packet to the correct interface. This may be a local device
1178  * or the physical port.
1179  */
1180 static inline void __attribute__((always_inline))
1181 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1182 {
1183         struct mbuf_table *tx_q;
1184         struct rte_mbuf **m_table;
1185         unsigned len, ret, offset = 0;
1186         const uint16_t lcore_id = rte_lcore_id();
1187         struct virtio_net *dev = vdev->dev;
1188         struct ether_hdr *nh;
1189
1190         /*check if destination is local VM*/
1191         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1192                 rte_pktmbuf_free(m);
1193                 return;
1194         }
1195
1196         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1197                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1198                         rte_pktmbuf_free(m);
1199                         return;
1200                 }
1201         }
1202
1203         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1204
1205         /*Add packet to the port tx queue*/
1206         tx_q = &lcore_tx_queue[lcore_id];
1207         len = tx_q->len;
1208
1209         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1210         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1211                 /* Guest has inserted the vlan tag. */
1212                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1213                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1214                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1215                         (vh->vlan_tci != vlan_tag_be))
1216                         vh->vlan_tci = vlan_tag_be;
1217         } else {
1218                 m->ol_flags |= PKT_TX_VLAN_PKT;
1219
1220                 /*
1221                  * Find the right seg to adjust the data len when offset is
1222                  * bigger than tail room size.
1223                  */
1224                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1225                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1226                                 m->data_len += offset;
1227                         else {
1228                                 struct rte_mbuf *seg = m;
1229
1230                                 while ((seg->next != NULL) &&
1231                                         (offset > rte_pktmbuf_tailroom(seg)))
1232                                         seg = seg->next;
1233
1234                                 seg->data_len += offset;
1235                         }
1236                         m->pkt_len += offset;
1237                 }
1238
1239                 m->vlan_tci = vlan_tag;
1240         }
1241
1242         if (m->ol_flags & PKT_TX_TCP_SEG)
1243                 virtio_tx_offload(m);
1244
1245         tx_q->m_table[len] = m;
1246         len++;
1247         if (enable_stats) {
1248                 dev_statistics[dev->device_fh].tx_total++;
1249                 dev_statistics[dev->device_fh].tx++;
1250         }
1251
1252         if (unlikely(len == MAX_PKT_BURST)) {
1253                 m_table = (struct rte_mbuf **)tx_q->m_table;
1254                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1255                 /* Free any buffers not handled by TX and update the port stats. */
1256                 if (unlikely(ret < len)) {
1257                         do {
1258                                 rte_pktmbuf_free(m_table[ret]);
1259                         } while (++ret < len);
1260                 }
1261
1262                 len = 0;
1263         }
1264
1265         tx_q->len = len;
1266         return;
1267 }
1268 /*
1269  * This function is called by each data core. It handles all RX/TX registered with the
1270  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1271  * with all devices in the main linked list.
1272  */
1273 static int
1274 switch_worker(void *arg)
1275 {
1276         struct rte_mempool *mbuf_pool = arg;
1277         struct virtio_net *dev = NULL;
1278         struct vhost_dev *vdev = NULL;
1279         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1280         struct virtio_net_data_ll *dev_ll;
1281         struct mbuf_table *tx_q;
1282         volatile struct lcore_ll_info *lcore_ll;
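             /* TX drain period: TSC cycles per microsecond (rounded up) times BURST_TX_DRAIN_US. */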
1283         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1284         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1285         unsigned ret, i;
1286         const uint16_t lcore_id = rte_lcore_id();
1287         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1288         uint16_t rx_count = 0;
1289         uint16_t tx_count;
1290         uint32_t retry = 0;
1291
1292         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1293         lcore_ll = lcore_info[lcore_id].lcore_ll;
1294         prev_tsc = 0;
1295
1296         tx_q = &lcore_tx_queue[lcore_id];
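             /* Each worker core transmits on its own queue; use the core's index in lcore_ids as the queue id. */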
1297         for (i = 0; i < num_cores; i ++) {
1298                 if (lcore_ids[i] == lcore_id) {
1299                         tx_q->txq_id = i;
1300                         break;
1301                 }
1302         }
1303
1304         while(1) {
1305                 cur_tsc = rte_rdtsc();
1306                 /*
1307                  * TX burst queue drain
1308                  */
1309                 diff_tsc = cur_tsc - prev_tsc;
1310                 if (unlikely(diff_tsc > drain_tsc)) {
1311
1312                         if (tx_q->len) {
1313                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1314
1315                                 /*Tx any packets in the queue*/
1316                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1317                                                                            (struct rte_mbuf **)tx_q->m_table,
1318                                                                            (uint16_t)tx_q->len);
1319                                 if (unlikely(ret < tx_q->len)) {
1320                                         do {
1321                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1322                                         } while (++ret < tx_q->len);
1323                                 }
1324
1325                                 tx_q->len = 0;
1326                         }
1327
1328                         prev_tsc = cur_tsc;
1329
1330                 }
1331
1332                 rte_prefetch0(lcore_ll->ll_root_used);
1333                 /*
1334                  * Inform the configuration core that we have exited the linked list and that no devices are
1335                  * in use if requested.
1336                  */
1337                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1338                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1339
1340                 /*
1341                  * Process devices
1342                  */
1343                 dev_ll = lcore_ll->ll_root_used;
1344
1345                 while (dev_ll != NULL) {
1346                         /*get virtio device ID*/
1347                         vdev = dev_ll->vdev;
1348                         dev = vdev->dev;
1349
1350                         if (unlikely(vdev->remove)) {
1351                                 dev_ll = dev_ll->next;
1352                                 unlink_vmdq(vdev);
1353                                 vdev->ready = DEVICE_SAFE_REMOVE;
1354                                 continue;
1355                         }
1356                         if (likely(vdev->ready == DEVICE_RX)) {
1357                                 /*Handle guest RX*/
1358                                 rx_count = rte_eth_rx_burst(ports[0],
1359                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1360
1361                                 if (rx_count) {
1362                                         /*
1363                                         * If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1364                                         * Here MAX_PKT_BURST must be less than the virtio queue size.
1365                                         */
1366                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1367                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1368                                                         rte_delay_us(burst_rx_delay_time);
1369                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1370                                                                 break;
1371                                                 }
1372                                         }
1373                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1374                                         if (enable_stats) {
1375                                                 rte_atomic64_add(
1376                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1377                                                 rx_count);
1378                                                 rte_atomic64_add(
1379                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1380                                         }
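                                             /* The enqueue copied the packet data into guest buffers, so all host mbufs can be freed. */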
1381                                         while (likely(rx_count)) {
1382                                                 rx_count--;
1383                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1384                                         }
1385
1386                                 }
1387                         }
1388
1389                         if (likely(!vdev->remove)) {
1390                                 /* Handle guest TX*/
1391                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1392                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1393                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1394                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1395                                                 while (tx_count)
1396                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1397                                         }
1398                                 }
1399                                 for (i = 0; i < tx_count; ++i)
1400                                         virtio_tx_route(vdev, pkts_burst[i], (uint16_t)dev->device_fh);
1401                         }
1402
1403                         /*move to the next device in the list*/
1404                         dev_ll = dev_ll->next;
1405                 }
1406         }
1407
1408         return 0;
1409 }
1410
1411 /*
1412  * This function gets available ring number for zero copy rx.
1413  * Only one thread will call this function for a particular virtio device,
1414  * so it is designed as a non-thread-safe function.
1415  */
1416 static inline uint32_t __attribute__((always_inline))
1417 get_available_ring_num_zcp(struct virtio_net *dev)
1418 {
1419         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1420         uint16_t avail_idx;
1421
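             /* Number of entries the guest has made available but not yet reserved by the host. */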
1422         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1423         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1424 }
1425
1426 /*
1427  * This function gets available ring index for zero copy rx,
1428  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1429  * Only one thread will call this function for a particular virtio device,
1430  * so it is designed as a non-thread-safe function.
1431  */
1432 static inline uint32_t __attribute__((always_inline))
1433 get_available_ring_index_zcp(struct virtio_net *dev,
1434         uint16_t *res_base_idx, uint32_t count)
1435 {
1436         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1437         uint16_t avail_idx;
1438         uint32_t retry = 0;
1439         uint16_t free_entries;
1440
1441         *res_base_idx = vq->last_used_idx_res;
1442         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1443         free_entries = (avail_idx - *res_base_idx);
1444
1445         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1446                         "avail idx: %d, "
1447                         "res base idx:%d, free entries:%d\n",
1448                         dev->device_fh, avail_idx, *res_base_idx,
1449                         free_entries);
1450
1451         /*
1452          * If retry is enabled and the queue is full then we wait
1453          * and retry to avoid packet loss.
1454          */
1455         if (enable_retry && unlikely(count > free_entries)) {
1456                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1457                         rte_delay_us(burst_rx_delay_time);
1458                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1459                         free_entries = (avail_idx - *res_base_idx);
1460                         if (count <= free_entries)
1461                                 break;
1462                 }
1463         }
1464
1465         /*check that we have enough buffers*/
1466         if (unlikely(count > free_entries))
1467                 count = free_entries;
1468
1469         if (unlikely(count == 0)) {
1470                 LOG_DEBUG(VHOST_DATA,
1471                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1472                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1473                         dev->device_fh, avail_idx,
1474                         *res_base_idx, free_entries);
1475                 return 0;
1476         }
1477
1478         vq->last_used_idx_res = *res_base_idx + count;
1479
1480         return count;
1481 }
1482
1483 /*
1484  * This function puts a descriptor back on the used list.
1485  */
1486 static inline void __attribute__((always_inline))
1487 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1488 {
1489         uint16_t res_cur_idx = vq->last_used_idx;
1490         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1491         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1492         rte_compiler_barrier();
1493         *(volatile uint16_t *)&vq->used->idx += 1;
1494         vq->last_used_idx += 1;
1495
1496         /* Kick the guest if necessary. */
1497         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1498                 eventfd_write(vq->callfd, (eventfd_t)1);
1499 }
1500
1501 /*
1502  * This function gets an available descriptor from the virtio vring and an
1503  * unattached mbuf from vpool->ring, and then attaches them together. It needs
1504  * to adjust the offsets of buff_addr and phys_addr according to the PMD
1505  * implementation, otherwise the frame data may be put in the wrong mbuf location.
1506  */
1507 static inline void __attribute__((always_inline))
1508 attach_rxmbuf_zcp(struct virtio_net *dev)
1509 {
1510         uint16_t res_base_idx, desc_idx;
1511         uint64_t buff_addr, phys_addr;
1512         struct vhost_virtqueue *vq;
1513         struct vring_desc *desc;
1514         void *obj = NULL;
1515         struct rte_mbuf *mbuf;
1516         struct vpool *vpool;
1517         hpa_type addr_type;
1518         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1519
1520         vpool = &vpool_array[vdev->vmdq_rx_q];
1521         vq = dev->virtqueue[VIRTIO_RXQ];
1522
1523         do {
1524                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1525                                 1) != 1))
1526                         return;
1527                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1528
1529                 desc = &vq->desc[desc_idx];
1530                 if (desc->flags & VRING_DESC_F_NEXT) {
1531                         desc = &vq->desc[desc->next];
1532                         buff_addr = gpa_to_vva(dev, desc->addr);
1533                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1534                                         &addr_type);
1535                 } else {
1536                         buff_addr = gpa_to_vva(dev,
1537                                         desc->addr + vq->vhost_hlen);
1538                         phys_addr = gpa_to_hpa(vdev,
1539                                         desc->addr + vq->vhost_hlen,
1540                                         desc->len, &addr_type);
1541                 }
1542
1543                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1544                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1545                                 " address found when attaching RX frame buffer"
1546                                 " address!\n", dev->device_fh);
1547                         put_desc_to_used_list_zcp(vq, desc_idx);
1548                         continue;
1549                 }
1550
1551                 /*
1552                  * Check if the frame buffer address from guest crosses
1553                  * sub-region or not.
1554                  */
1555                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1556                         RTE_LOG(ERR, VHOST_DATA,
1557                                 "(%"PRIu64") Frame buffer address crossing "
1558                                 "sub-region found when attaching RX frame "
1559                                 "buffer address!\n",
1560                                 dev->device_fh);
1561                         put_desc_to_used_list_zcp(vq, desc_idx);
1562                         continue;
1563                 }
1564         } while (unlikely(phys_addr == 0));
1565
1566         rte_ring_sc_dequeue(vpool->ring, &obj);
1567         mbuf = obj;
1568         if (unlikely(mbuf == NULL)) {
1569                 LOG_DEBUG(VHOST_DATA,
1570                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1571                         "ring_sc_dequeue fail.\n",
1572                         dev->device_fh);
1573                 put_desc_to_used_list_zcp(vq, desc_idx);
1574                 return;
1575         }
1576
1577         if (unlikely(vpool->buf_size > desc->len)) {
1578                 LOG_DEBUG(VHOST_DATA,
1579                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1580                         "length(%d) of descriptor idx: %d less than room "
1581                         "size required: %d\n",
1582                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1583                 put_desc_to_used_list_zcp(vq, desc_idx);
1584                 rte_ring_sp_enqueue(vpool->ring, obj);
1585                 return;
1586         }
1587
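             /*
              * Point the mbuf at the guest frame buffer: rewind by the headroom so that
              * data_off lands on the descriptor address, and stash the descriptor index
              * in the headroom for the later used-ring update.
              */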
1588         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1589         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1590         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1591         mbuf->data_len = desc->len;
1592         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1593
1594         LOG_DEBUG(VHOST_DATA,
1595                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1596                 "descriptor idx:%d\n",
1597                 dev->device_fh, res_base_idx, desc_idx);
1598
1599         __rte_mbuf_raw_free(mbuf);
1600
1601         return;
1602 }
1603
1604 /*
1605  * Detach an attached packet mbuf -
1606  *  - restore original mbuf address and length values.
1607  *  - reset pktmbuf data and data_len to their default values.
1608  *  All other fields of the given packet mbuf will be left intact.
1609  *
1610  * @param m
1611  *   The attached packet mbuf.
1612  */
1613 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1614 {
1615         const struct rte_mempool *mp = m->pool;
1616         void *buf = rte_mbuf_to_baddr(m);
1617         uint32_t buf_ofs;
1618         uint32_t buf_len = mp->elt_size - sizeof(*m);
1619         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1620
1621         m->buf_addr = buf;
1622         m->buf_len = (uint16_t)buf_len;
1623
1624         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1625                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1626         m->data_off = buf_ofs;
1627
1628         m->data_len = 0;
1629 }
1630
1631 /*
1632  * This function is called after packets have been transmitted. It fetches mbufs
1633  * from vpool->pool, detaches them and puts them back into vpool->ring. It also
1634  * updates the used index and kicks the guest if necessary.
1635  */
1636 static inline uint32_t __attribute__((always_inline))
1637 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1638 {
1639         struct rte_mbuf *mbuf;
1640         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1641         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1642         uint32_t index = 0;
1643         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1644
1645         LOG_DEBUG(VHOST_DATA,
1646                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1647                 "clean is: %d\n",
1648                 dev->device_fh, mbuf_count);
1649         LOG_DEBUG(VHOST_DATA,
1650                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1651                 "clean  is : %d\n",
1652                 dev->device_fh, rte_ring_count(vpool->ring));
1653
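             /*
              * Pull every free mbuf out of the mempool, detach it from its guest buffer,
              * return it to vpool->ring and record its stashed descriptor index in the
              * used ring.
              */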
1654         for (index = 0; index < mbuf_count; index++) {
1655                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1656                 if (likely(MBUF_EXT_MEM(mbuf)))
1657                         pktmbuf_detach_zcp(mbuf);
1658                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1659
1660                 /* Update used index buffer information. */
1661                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1662                 vq->used->ring[used_idx].len = 0;
1663
1664                 used_idx = (used_idx + 1) & (vq->size - 1);
1665         }
1666
1667         LOG_DEBUG(VHOST_DATA,
1668                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1669                 "clean is: %d\n",
1670                 dev->device_fh, rte_mempool_count(vpool->pool));
1671         LOG_DEBUG(VHOST_DATA,
1672                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1673                 "clean  is : %d\n",
1674                 dev->device_fh, rte_ring_count(vpool->ring));
1675         LOG_DEBUG(VHOST_DATA,
1676                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1677                 "vq->last_used_idx:%d\n",
1678                 dev->device_fh, vq->last_used_idx);
1679
1680         vq->last_used_idx += mbuf_count;
1681
1682         LOG_DEBUG(VHOST_DATA,
1683                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1684                 "vq->last_used_idx:%d\n",
1685                 dev->device_fh, vq->last_used_idx);
1686
1687         rte_compiler_barrier();
1688
1689         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1690
1691         /* Kick guest if required. */
1692         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1693                 eventfd_write(vq->callfd, (eventfd_t)1);
1694
1695         return 0;
1696 }
1697
1698 /*
1699  * This function is called when a virtio device is destroyed.
1700  * It fetches mbufs from vpool->pool, detaches them and puts them back into vpool->ring.
1701  */
1702 static void mbuf_destroy_zcp(struct vpool *vpool)
1703 {
1704         struct rte_mbuf *mbuf = NULL;
1705         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1706
1707         LOG_DEBUG(VHOST_CONFIG,
1708                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1709                 "mbuf_destroy_zcp is: %d\n",
1710                 mbuf_count);
1711         LOG_DEBUG(VHOST_CONFIG,
1712                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1713                 "mbuf_destroy_zcp  is : %d\n",
1714                 rte_ring_count(vpool->ring));
1715
1716         for (index = 0; index < mbuf_count; index++) {
1717                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1718                 if (likely(mbuf != NULL)) {
1719                         if (likely(MBUF_EXT_MEM(mbuf)))
1720                                 pktmbuf_detach_zcp(mbuf);
1721                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1722                 }
1723         }
1724
1725         LOG_DEBUG(VHOST_CONFIG,
1726                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1727                 "mbuf_destroy_zcp is: %d\n",
1728                 rte_mempool_count(vpool->pool));
1729         LOG_DEBUG(VHOST_CONFIG,
1730                 "in mbuf_destroy_zcp: mbuf count in ring after "
1731                 "mbuf_destroy_zcp is : %d\n",
1732                 rte_ring_count(vpool->ring));
1733 }
1734
1735 /*
1736  * This function updates the used ring and vring counters for zero copy RX.
1737  */
1738 static inline uint32_t __attribute__((always_inline))
1739 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1740         uint32_t count)
1741 {
1742         struct vhost_virtqueue *vq;
1743         struct vring_desc *desc;
1744         struct rte_mbuf *buff;
1745         /* The virtio_hdr is initialised to 0. */
1746         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1747                 = {{0, 0, 0, 0, 0, 0}, 0};
1748         uint64_t buff_hdr_addr = 0;
1749         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1750         uint32_t head_idx, packet_success = 0;
1751         uint16_t res_cur_idx;
1752
1753         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
1754
1755         if (count == 0)
1756                 return 0;
1757
1758         vq = dev->virtqueue[VIRTIO_RXQ];
1759         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1760
1761         res_cur_idx = vq->last_used_idx;
1762         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1763                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1764
1765         /* Retrieve all of the head indexes first to avoid caching issues. */
1766         for (head_idx = 0; head_idx < count; head_idx++)
1767                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1768
1769         /*Prefetch descriptor index. */
1770         rte_prefetch0(&vq->desc[head[packet_success]]);
1771
1772         while (packet_success != count) {
1773                 /* Get descriptor from available ring */
1774                 desc = &vq->desc[head[packet_success]];
1775
1776                 buff = pkts[packet_success];
1777                 LOG_DEBUG(VHOST_DATA,
1778                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1779                         "pkt[%d] descriptor idx: %d\n",
1780                         dev->device_fh, packet_success,
1781                         MBUF_HEADROOM_UINT32(buff));
1782
1783                 PRINT_PACKET(dev,
1784                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1785                         + RTE_PKTMBUF_HEADROOM),
1786                         rte_pktmbuf_data_len(buff), 0);
1787
1788                 /* Buffer address translation for virtio header. */
1789                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1790                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1791
1792                 /*
1793                  * If the descriptors are chained the header and data are
1794                  * placed in separate buffers.
1795                  */
1796                 if (desc->flags & VRING_DESC_F_NEXT) {
1797                         desc->len = vq->vhost_hlen;
1798                         desc = &vq->desc[desc->next];
1799                         desc->len = rte_pktmbuf_data_len(buff);
1800                 } else {
1801                         desc->len = packet_len;
1802                 }
1803
1804                 /* Update used ring with desc information */
1805                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1806                         = head[packet_success];
1807                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1808                         = packet_len;
1809                 res_cur_idx++;
1810                 packet_success++;
1811
1812                 /* A header is required per buffer. */
1813                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1814                         (const void *)&virtio_hdr, vq->vhost_hlen);
1815
1816                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1817
1818                 if (likely(packet_success < count)) {
1819                         /* Prefetch descriptor index. */
1820                         rte_prefetch0(&vq->desc[head[packet_success]]);
1821                 }
1822         }
1823
1824         rte_compiler_barrier();
1825
1826         LOG_DEBUG(VHOST_DATA,
1827                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1828                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1829                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1830
1831         *(volatile uint16_t *)&vq->used->idx += count;
1832         vq->last_used_idx += count;
1833
1834         LOG_DEBUG(VHOST_DATA,
1835                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1836                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1837                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1838
1839         /* Kick the guest if necessary. */
1840         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1841                 eventfd_write(vq->callfd, (eventfd_t)1);
1842
1843         return count;
1844 }
1845
1846 /*
1847  * This function routes the TX packet to the correct interface.
1848  * This may be a local device or the physical port.
1849  */
1850 static inline void __attribute__((always_inline))
1851 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1852         uint32_t desc_idx, uint8_t need_copy)
1853 {
1854         struct mbuf_table *tx_q;
1855         struct rte_mbuf **m_table;
1856         void *obj = NULL;
1857         struct rte_mbuf *mbuf;
1858         unsigned len, ret, offset = 0;
1859         struct vpool *vpool;
1860         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1861         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1862
1863         /*Add packet to the port tx queue*/
1864         tx_q = &tx_queue_zcp[vmdq_rx_q];
1865         len = tx_q->len;
1866
1867         /* Allocate an mbuf and populate the structure. */
1868         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1869         rte_ring_sc_dequeue(vpool->ring, &obj);
1870         mbuf = obj;
1871         if (unlikely(mbuf == NULL)) {
1872                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1873                 RTE_LOG(ERR, VHOST_DATA,
1874                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1875                         dev->device_fh);
1876                 put_desc_to_used_list_zcp(vq, desc_idx);
1877                 return;
1878         }
1879
1880         if (vm2vm_mode == VM2VM_HARDWARE) {
1881                 /* Avoid using a vlan tag from any vm for an external packet, such as
1882                  * vlan_tags[dev->device_fh]; otherwise it conflicts with pool
1883                  * selection: the MAC address marks the packet as external and bound
1884                  * for the network, while the vlan tag marks it as a vm2vm packet to
1885                  * be forwarded to another vm. The hardware cannot resolve such an
1886                  * ambiguous situation, so the packet would be lost.
1887                  */
1888                 vlan_tag = external_pkt_default_vlan_tag;
1889                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1890                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1891                         __rte_mbuf_raw_free(mbuf);
1892                         return;
1893                 }
1894         }
1895
1896         mbuf->nb_segs = m->nb_segs;
1897         mbuf->next = m->next;
1898         mbuf->data_len = m->data_len + offset;
1899         mbuf->pkt_len = mbuf->data_len;
1900         if (unlikely(need_copy)) {
1901                 /* Copy the packet contents to the mbuf. */
1902                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1903                         rte_pktmbuf_mtod(m, void *),
1904                         m->data_len);
1905         } else {
1906                 mbuf->data_off = m->data_off;
1907                 mbuf->buf_physaddr = m->buf_physaddr;
1908                 mbuf->buf_addr = m->buf_addr;
1909         }
1910         mbuf->ol_flags |= PKT_TX_VLAN_PKT;
1911         mbuf->vlan_tci = vlan_tag;
1912         mbuf->l2_len = sizeof(struct ether_hdr);
1913         mbuf->l3_len = sizeof(struct ipv4_hdr);
1914         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1915
1916         tx_q->m_table[len] = mbuf;
1917         len++;
1918
1919         LOG_DEBUG(VHOST_DATA,
1920                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1921                 dev->device_fh,
1922                 mbuf->nb_segs,
1923                 (mbuf->next == NULL) ? "null" : "non-null");
1924
1925         if (enable_stats) {
1926                 dev_statistics[dev->device_fh].tx_total++;
1927                 dev_statistics[dev->device_fh].tx++;
1928         }
1929
1930         if (unlikely(len == MAX_PKT_BURST)) {
1931                 m_table = (struct rte_mbuf **)tx_q->m_table;
1932                 ret = rte_eth_tx_burst(ports[0],
1933                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1934
1935                 /*
1936                  * Free any buffers not handled by TX and update
1937                  * the port stats.
1938                  */
1939                 if (unlikely(ret < len)) {
1940                         do {
1941                                 rte_pktmbuf_free(m_table[ret]);
1942                         } while (++ret < len);
1943                 }
1944
1945                 len = 0;
1946                 txmbuf_clean_zcp(dev, vpool);
1947         }
1948
1949         tx_q->len = len;
1950
1951         return;
1952 }
1953
1954 /*
1955  * This function transmits all available packets in the virtio TX queue for one
1956  * virtio-net device. If it is the first packet, it learns the MAC address and
1957  * sets up VMDQ.
1958  */
1959 static inline void __attribute__((always_inline))
1960 virtio_dev_tx_zcp(struct virtio_net *dev)
1961 {
1962         struct rte_mbuf m;
1963         struct vhost_virtqueue *vq;
1964         struct vring_desc *desc;
1965         uint64_t buff_addr = 0, phys_addr;
1966         uint32_t head[MAX_PKT_BURST];
1967         uint32_t i;
1968         uint16_t free_entries, packet_success = 0;
1969         uint16_t avail_idx;
1970         uint8_t need_copy = 0;
1971         hpa_type addr_type;
1972         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1973
1974         vq = dev->virtqueue[VIRTIO_TXQ];
1975         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1976
1977         /* If there are no available buffers then return. */
1978         if (vq->last_used_idx_res == avail_idx)
1979                 return;
1980
1981         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1982
1983         /* Prefetch available ring to retrieve head indexes. */
1984         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1985
1986         /* Get the number of free entries in the ring */
1987         free_entries = (avail_idx - vq->last_used_idx_res);
1988
1989         /* Limit to MAX_PKT_BURST. */
1990         free_entries
1991                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1992
1993         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1994                 dev->device_fh, free_entries);
1995
1996         /* Retrieve all of the head indexes first to avoid caching issues. */
1997         for (i = 0; i < free_entries; i++)
1998                 head[i]
1999                         = vq->avail->ring[(vq->last_used_idx_res + i)
2000                         & (vq->size - 1)];
2001
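             /* Reserve the entries; the corresponding used-ring updates are done later, after the packets are consumed. */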
2002         vq->last_used_idx_res += free_entries;
2003
2004         /* Prefetch descriptor index. */
2005         rte_prefetch0(&vq->desc[head[packet_success]]);
2006         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2007
2008         while (packet_success < free_entries) {
2009                 desc = &vq->desc[head[packet_success]];
2010
2011                 /* Discard first buffer as it is the virtio header */
2012                 desc = &vq->desc[desc->next];
2013
2014                 /* Buffer address translation. */
2015                 buff_addr = gpa_to_vva(dev, desc->addr);
2016                 /* Check an extra VLAN_HLEN bytes to allow room for VLAN tag insertion */
2017                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
2018                         &addr_type);
2019
2020                 if (likely(packet_success < (free_entries - 1)))
2021                         /* Prefetch descriptor index. */
2022                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2023
2024                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2025                         RTE_LOG(ERR, VHOST_DATA,
2026                                 "(%"PRIu64") Invalid frame buffer address found "
2027                                 "when transmitting packets!\n",
2028                                 dev->device_fh);
2029                         packet_success++;
2030                         continue;
2031                 }
2032
2033                 /* Prefetch buffer address. */
2034                 rte_prefetch0((void *)(uintptr_t)buff_addr);
2035
2036                 /*
2037                  * Setup dummy mbuf. This is copied to a real mbuf if
2038                  * transmitted out the physical port.
2039                  */
2040                 m.data_len = desc->len;
2041                 m.nb_segs = 1;
2042                 m.next = NULL;
2043                 m.data_off = 0;
2044                 m.buf_addr = (void *)(uintptr_t)buff_addr;
2045                 m.buf_physaddr = phys_addr;
2046
2047                 /*
2048                  * Check if the frame buffer address from guest crosses
2049                  * sub-region or not.
2050                  */
2051                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2052                         RTE_LOG(ERR, VHOST_DATA,
2053                                 "(%"PRIu64") Frame buffer address crossing "
2054                                 "sub-region found when attaching TX frame "
2055                                 "buffer address!\n",
2056                                 dev->device_fh);
2057                         need_copy = 1;
2058                 } else
2059                         need_copy = 0;
2060
2061                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2062
2063                 /*
2064                  * If this is the first received packet we need to learn
2065                  * the MAC and setup VMDQ
2066                  */
2067                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2068                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2069                                 /*
2070                                  * Discard frame if device is scheduled for
2071                                  * removal or a duplicate MAC address is found.
2072                                  */
2073                                 packet_success += free_entries;
2074                                 vq->last_used_idx += packet_success;
2075                                 break;
2076                         }
2077                 }
2078
2079                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2080                 packet_success++;
2081         }
2082 }
2083
2084 /*
2085  * This function is called by each data core. It handles all RX/TX registered
2086  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2087  * addresses are compared with all devices in the main linked list.
2088  */
2089 static int
2090 switch_worker_zcp(__attribute__((unused)) void *arg)
2091 {
2092         struct virtio_net *dev = NULL;
2093         struct vhost_dev  *vdev = NULL;
2094         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2095         struct virtio_net_data_ll *dev_ll;
2096         struct mbuf_table *tx_q;
2097         volatile struct lcore_ll_info *lcore_ll;
2098         const uint64_t drain_tsc
2099                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2100                 * BURST_TX_DRAIN_US;
2101         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2102         unsigned ret;
2103         const uint16_t lcore_id = rte_lcore_id();
2104         uint16_t count_in_ring, rx_count = 0;
2105
2106         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2107
2108         lcore_ll = lcore_info[lcore_id].lcore_ll;
2109         prev_tsc = 0;
2110
2111         while (1) {
2112                 cur_tsc = rte_rdtsc();
2113
2114                 /* TX burst queue drain */
2115                 diff_tsc = cur_tsc - prev_tsc;
2116                 if (unlikely(diff_tsc > drain_tsc)) {
2117                         /*
2118                          * Get mbufs from vpool.pool, detach them and
2119                          * put them back into vpool.ring.
2120                          */
2121                         dev_ll = lcore_ll->ll_root_used;
2122                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2123                                 /* Get virtio device ID */
2124                                 vdev = dev_ll->vdev;
2125                                 dev = vdev->dev;
2126
2127                                 if (likely(!vdev->remove)) {
2128                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2129                                         if (tx_q->len) {
2130                                                 LOG_DEBUG(VHOST_DATA,
2131                                                 "TX queue drained after timeout"
2132                                                 " with burst size %u\n",
2133                                                 tx_q->len);
2134
2135                                                 /*
2136                                                  * Tx any packets in the queue
2137                                                  */
2138                                                 ret = rte_eth_tx_burst(
2139                                                         ports[0],
2140                                                         (uint16_t)tx_q->txq_id,
2141                                                         (struct rte_mbuf **)
2142                                                         tx_q->m_table,
2143                                                         (uint16_t)tx_q->len);
2144                                                 if (unlikely(ret < tx_q->len)) {
2145                                                         do {
2146                                                                 rte_pktmbuf_free(
2147                                                                         tx_q->m_table[ret]);
2148                                                         } while (++ret < tx_q->len);
2149                                                 }
2150                                                 tx_q->len = 0;
2151
2152                                                 txmbuf_clean_zcp(dev,
2153                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2154                                         }
2155                                 }
2156                                 dev_ll = dev_ll->next;
2157                         }
2158                         prev_tsc = cur_tsc;
2159                 }
2160
2161                 rte_prefetch0(lcore_ll->ll_root_used);
2162
2163                 /*
2164                  * Inform the configuration core that we have exited the linked
2165                  * list and that no devices are in use if requested.
2166                  */
2167                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2168                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2169
2170                 /* Process devices */
2171                 dev_ll = lcore_ll->ll_root_used;
2172
2173                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2174                         vdev = dev_ll->vdev;
2175                         dev  = vdev->dev;
2176                         if (unlikely(vdev->remove)) {
2177                                 dev_ll = dev_ll->next;
2178                                 unlink_vmdq(vdev);
2179                                 vdev->ready = DEVICE_SAFE_REMOVE;
2180                                 continue;
2181                         }
2182
2183                         if (likely(vdev->ready == DEVICE_RX)) {
2184                                 uint32_t index = vdev->vmdq_rx_q;
2185                                 uint16_t i;
2186                                 count_in_ring
2187                                 = rte_ring_count(vpool_array[index].ring);
2188                                 uint16_t free_entries
2189                                 = (uint16_t)get_available_ring_num_zcp(dev);
2190
2191                                 /*
2192                                  * Attach all mbufs in vpool.ring and put back
2193                                  * into vpool.pool.
2194                                  */
2195                                 for (i = 0;
2196                                 i < RTE_MIN(free_entries,
2197                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2198                                 i++)
2199                                         attach_rxmbuf_zcp(dev);
2200
2201                                 /* Handle guest RX */
2202                                 rx_count = rte_eth_rx_burst(ports[0],
2203                                         vdev->vmdq_rx_q, pkts_burst,
2204                                         MAX_PKT_BURST);
2205
2206                                 if (rx_count) {
2207                                         ret_count = virtio_dev_rx_zcp(dev,
2208                                                         pkts_burst, rx_count);
2209                                         if (enable_stats) {
2210                                                 dev_statistics[dev->device_fh].rx_total
2211                                                         += rx_count;
2212                                                 dev_statistics[dev->device_fh].rx
2213                                                         += ret_count;
2214                                         }
2215                                         while (likely(rx_count)) {
2216                                                 rx_count--;
2217                                                 pktmbuf_detach_zcp(
2218                                                         pkts_burst[rx_count]);
2219                                                 rte_ring_sp_enqueue(
2220                                                         vpool_array[index].ring,
2221                                                         (void *)pkts_burst[rx_count]);
2222                                         }
2223                                 }
2224                         }
2225
2226                         if (likely(!vdev->remove))
2227                                 /* Handle guest TX */
2228                                 virtio_dev_tx_zcp(dev);
2229
2230                         /* Move to the next device in the list */
2231                         dev_ll = dev_ll->next;
2232                 }
2233         }
2234
2235         return 0;
2236 }
2237
2238
2239 /*
2240  * Add an entry to a used linked list. A free entry must first be found
2241  * in the free linked list using get_data_ll_free_entry();
2242  */
2243 static void
2244 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2245         struct virtio_net_data_ll *ll_dev)
2246 {
2247         struct virtio_net_data_ll *ll = *ll_root_addr;
2248
2249         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2250         ll_dev->next = NULL;
2251         rte_compiler_barrier();
2252
2253         /* If ll == NULL then this is the first device. */
2254         if (ll) {
2255                 /* Increment to the tail of the linked list. */
2256                 while ((ll->next != NULL) )
2257                         ll = ll->next;
2258
2259                 ll->next = ll_dev;
2260         } else {
2261                 *ll_root_addr = ll_dev;
2262         }
2263 }
2264
2265 /*
2266  * Remove an entry from a used linked list. The entry must then be added to
2267  * the free linked list using put_data_ll_free_entry().
2268  */
2269 static void
2270 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2271         struct virtio_net_data_ll *ll_dev,
2272         struct virtio_net_data_ll *ll_dev_last)
2273 {
2274         struct virtio_net_data_ll *ll = *ll_root_addr;
2275
2276         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2277                 return;
2278
2279         if (ll_dev == ll)
2280                 *ll_root_addr = ll_dev->next;
2281         else
2282                 if (likely(ll_dev_last != NULL))
2283                         ll_dev_last->next = ll_dev->next;
2284                 else
2285                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2286 }
2287
2288 /*
2289  * Find and return an entry from the free linked list.
2290  */
2291 static struct virtio_net_data_ll *
2292 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2293 {
2294         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2295         struct virtio_net_data_ll *ll_dev;
2296
2297         if (ll_free == NULL)
2298                 return NULL;
2299
2300         ll_dev = ll_free;
2301         *ll_root_addr = ll_free->next;
2302
2303         return ll_dev;
2304 }
2305
2306 /*
2307  * Place an entry back on to the free linked list.
2308  */
2309 static void
2310 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2311         struct virtio_net_data_ll *ll_dev)
2312 {
2313         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2314
2315         if (ll_dev == NULL)
2316                 return;
2317
2318         ll_dev->next = ll_free;
2319         *ll_root_addr = ll_dev;
2320 }
2321
2322 /*
2323  * Creates a linked list of a given size.
2324  */
2325 static struct virtio_net_data_ll *
2326 alloc_data_ll(uint32_t size)
2327 {
2328         struct virtio_net_data_ll *ll_new;
2329         uint32_t i;
2330
2331         /* Malloc and then chain the linked list. */
2332         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2333         if (ll_new == NULL) {
2334                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2335                 return NULL;
2336         }
2337
2338         for (i = 0; i < size - 1; i++) {
2339                 ll_new[i].vdev = NULL;
2340                 ll_new[i].next = &ll_new[i+1];
2341         }
2342         ll_new[i].next = NULL;
2343
2344         return ll_new;
2345 }
2346
2347 /*
2348  * Create the main linked list along with each individual core's linked list. A used and a free list
2349  * are created to manage entries.
2350  */
2351 static int
2352 init_data_ll (void)
2353 {
2354         int lcore;
2355
2356         RTE_LCORE_FOREACH_SLAVE(lcore) {
2357                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2358                 if (lcore_info[lcore].lcore_ll == NULL) {
2359                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2360                         return -1;
2361                 }
2362
2363                 lcore_info[lcore].lcore_ll->device_num = 0;
2364                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2365                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2366                 if (num_devices % num_switching_cores)
2367                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2368                 else
2369                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2370         }
2371
2372         /* Allocate devices up to a maximum of MAX_DEVICES. */
2373         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2374
2375         return 0;
2376 }
2377
2378 /*
2379  * Remove a device from the specific data core linked list and from the main linked list. Synchonization
2380  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2381  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2382  */
2383 static void
2384 destroy_device (volatile struct virtio_net *dev)
2385 {
2386         struct virtio_net_data_ll *ll_lcore_dev_cur;
2387         struct virtio_net_data_ll *ll_main_dev_cur;
2388         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2389         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2390         struct vhost_dev *vdev;
2391         int lcore;
2392
2393         dev->flags &= ~VIRTIO_DEV_RUNNING;
2394
2395         vdev = (struct vhost_dev *)dev->priv;
2396         /*set the remove flag. */
2397         vdev->remove = 1;
2398         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2399                 rte_pause();
2400         }
2401
2402         /* Search for entry to be removed from lcore ll */
2403         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2404         while (ll_lcore_dev_cur != NULL) {
2405                 if (ll_lcore_dev_cur->vdev == vdev) {
2406                         break;
2407                 } else {
2408                         ll_lcore_dev_last = ll_lcore_dev_cur;
2409                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2410                 }
2411         }
2412
2413         if (ll_lcore_dev_cur == NULL) {
2414                 RTE_LOG(ERR, VHOST_CONFIG,
2415                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2416                         dev->device_fh);
2417                 return;
2418         }
2419
2420         /* Search for entry to be removed from main ll */
2421         ll_main_dev_cur = ll_root_used;
2422         ll_main_dev_last = NULL;
2423         while (ll_main_dev_cur != NULL) {
2424                 if (ll_main_dev_cur->vdev == vdev) {
2425                         break;
2426                 } else {
2427                         ll_main_dev_last = ll_main_dev_cur;
2428                         ll_main_dev_cur = ll_main_dev_cur->next;
2429                 }
2430         }
2431
2432         /* Remove entries from the lcore and main ll. */
2433         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2434         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2435
2436         /* Set the dev_removal_flag on each lcore. */
2437         RTE_LCORE_FOREACH_SLAVE(lcore) {
2438                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2439         }
2440
2441         /*
2442          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2443          * they can no longer access the device removed from the linked lists and that the devices
2444          * are no longer in use.
2445          */
2446         RTE_LCORE_FOREACH_SLAVE(lcore) {
2447                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2448                         rte_pause();
2449                 }
2450         }
2451
2452         /* Add the entries back to the lcore and main free ll.*/
2453         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2454         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2455
2456         /* Decrement number of device on the lcore. */
2457         lcore_info[vdev->coreid].lcore_ll->device_num--;
2458
2459         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2460
2461         if (zero_copy) {
2462                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2463
2464                 /* Stop the RX queue. */
2465                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2466                         LOG_DEBUG(VHOST_CONFIG,
2467                                 "(%"PRIu64") In destroy_device: Failed to stop "
2468                                 "rx queue:%d\n",
2469                                 dev->device_fh,
2470                                 vdev->vmdq_rx_q);
2471                 }
2472
2473                 LOG_DEBUG(VHOST_CONFIG,
2474                         "(%"PRIu64") in destroy_device: start putting mbufs from "
2475                         "mempool back to ring for RX queue: %d\n",
2476                         dev->device_fh, vdev->vmdq_rx_q);
2477
2478                 mbuf_destroy_zcp(vpool);
2479
2480                 /* Stop the TX queue. */
2481                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2482                         LOG_DEBUG(VHOST_CONFIG,
2483                                 "(%"PRIu64") In destroy_device: Failed to "
2484                                 "stop tx queue:%d\n",
2485                                 dev->device_fh, vdev->vmdq_rx_q);
2486                 }
2487
2488                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2489
2490                 LOG_DEBUG(VHOST_CONFIG,
2491                         "(%"PRIu64") destroy_device: start putting mbufs from mempool "
2492                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2493                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2494                         dev->device_fh);
2495
2496                 mbuf_destroy_zcp(vpool);
2497                 rte_free(vdev->regions_hpa);
2498         }
2499         rte_free(vdev);
2500
2501 }
2502
2503 /*
2504  * Calculate the number of physically contiguous sub-regions within one
2505  * particular region whose vhost virtual address is contiguous. The region
2506  * starts at vva_start, with the size given by the 'size' argument.
2507  */
2508 static uint32_t
2509 check_hpa_regions(uint64_t vva_start, uint64_t size)
2510 {
2511         uint32_t i, nregions = 0, page_size = getpagesize();
2512         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2513         if (vva_start % page_size) {
2514                 LOG_DEBUG(VHOST_CONFIG,
2515                         "in check_continuous: vva start(%p) mod page_size(%d) "
2516                         "has remainder\n",
2517                         (void *)(uintptr_t)vva_start, page_size);
2518                 return 0;
2519         }
2520         if (size % page_size) {
2521                 LOG_DEBUG(VHOST_CONFIG,
2522                         "in check_continuous: "
2523                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2524                         size, page_size);
2525                 return 0;
2526         }
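             /*
              * Walk the region one page at a time; every break in physical contiguity
              * between neighbouring pages starts a new sub-region.
              */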
2527         for (i = 0; i < size - page_size; i = i + page_size) {
2528                 cur_phys_addr
2529                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2530                 next_phys_addr = rte_mem_virt2phy(
2531                         (void *)(uintptr_t)(vva_start + i + page_size));
2532                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2533                         ++nregions;
2534                         LOG_DEBUG(VHOST_CONFIG,
2535                                 "in check_hpa_regions: hva addr:(%p) is not "
2536                                 "continuous with hva addr:(%p), diff:%d\n",
2537                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2538                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2539                                 + page_size), page_size);
2540                         LOG_DEBUG(VHOST_CONFIG,
2541                                 "in check_hpa_regions: hpa addr:(%p) is not "
2542                                 "continuous with hpa addr:(%p), "
2543                                 "diff:(%"PRIu64")\n",
2544                                 (void *)(uintptr_t)cur_phys_addr,
2545                                 (void *)(uintptr_t)next_phys_addr,
2546                                 (next_phys_addr-cur_phys_addr));
2547                 }
2548         }
2549         return nregions;
2550 }
2551
2552 /*
2553  * Divide each region whose vhost virtual address range is contiguous into
2554  * sub-regions such that the physical addresses within each sub-region are
2555  * contiguous, and fill the offset (to GPA), size and other information of
2556  * each sub-region into regions_hpa.
2557  */
2558 static uint32_t
2559 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2560 {
2561         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2562         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2563
2564         if (mem_region_hpa == NULL)
2565                 return 0;
2566
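             /*
              * For each guest memory region, record the host-physical offset of
              * its first page, then close the current sub-region and open a new
              * one whenever two neighbouring pages are not physically contiguous.
              */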
2567         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2568                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2569                         virtio_memory->regions[regionidx].address_offset;
2570                 mem_region_hpa[regionidx_hpa].guest_phys_address
2571                         = virtio_memory->regions[regionidx].guest_phys_address;
2572                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2573                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2574                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2575                 LOG_DEBUG(VHOST_CONFIG,
2576                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2577                         regionidx_hpa,
2578                         (void *)(uintptr_t)
2579                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2580                 LOG_DEBUG(VHOST_CONFIG,
2581                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2582                         regionidx_hpa,
2583                         (void *)(uintptr_t)
2584                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2585                 for (i = 0, k = 0;
2586                         i < virtio_memory->regions[regionidx].memory_size -
2587                                 page_size;
2588                         i += page_size) {
2589                         cur_phys_addr = rte_mem_virt2phy(
2590                                         (void *)(uintptr_t)(vva_start + i));
2591                         next_phys_addr = rte_mem_virt2phy(
2592                                         (void *)(uintptr_t)(vva_start +
2593                                         i + page_size));
2594                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2595                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2596                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2597                                         k + page_size;
2598                                 mem_region_hpa[regionidx_hpa].memory_size
2599                                         = k + page_size;
2600                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2601                                         "phys addr end  [%d]:(%p)\n",
2602                                         regionidx_hpa,
2603                                         (void *)(uintptr_t)
2604                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2605                                 LOG_DEBUG(VHOST_CONFIG,
2606                                         "in fill_hpa_regions: guest phys addr "
2607                                         "size [%d]:(%p)\n",
2608                                         regionidx_hpa,
2609                                         (void *)(uintptr_t)
2610                                         (mem_region_hpa[regionidx_hpa].memory_size));
2611                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2612                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2613                                 ++regionidx_hpa;
2614                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2615                                         next_phys_addr -
2616                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2617                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2618                                         " phys addr start[%d]:(%p)\n",
2619                                         regionidx_hpa,
2620                                         (void *)(uintptr_t)
2621                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2622                                 LOG_DEBUG(VHOST_CONFIG,
2623                                         "in fill_hpa_regions: host  phys addr "
2624                                         "start[%d]:(%p)\n",
2625                                         regionidx_hpa,
2626                                         (void *)(uintptr_t)
2627                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2628                                 k = 0;
2629                         } else {
2630                                 k += page_size;
2631                         }
2632                 }
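                     /* Close the last sub-region of this guest memory region. */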
2633                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2634                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2635                         + k + page_size;
2636                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2637                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2638                         "[%d]:(%p)\n", regionidx_hpa,
2639                         (void *)(uintptr_t)
2640                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2641                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2642                         "[%d]:(%p)\n", regionidx_hpa,
2643                         (void *)(uintptr_t)
2644                         (mem_region_hpa[regionidx_hpa].memory_size));
2645                 ++regionidx_hpa;
2646         }
2647         return regionidx_hpa;
2648 }
2649
2650 /*
2651  * A new device is added to a data core. First the device is added to the main linked list
2652  * and then allocated to a specific data core.
2653  */
2654 static int
2655 new_device (struct virtio_net *dev)
2656 {
2657         struct virtio_net_data_ll *ll_dev;
2658         int lcore, core_add = 0;
2659         uint32_t device_num_min = num_devices;
2660         struct vhost_dev *vdev;
2661         uint32_t regionidx;
2662
2663         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2664         if (vdev == NULL) {
2665                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2666                         dev->device_fh);
2667                 return -1;
2668         }
2669         vdev->dev = dev;
2670         dev->priv = vdev;
2671
2672         if (zero_copy) {
2673                 vdev->nregions_hpa = dev->mem->nregions;
2674                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2675                         vdev->nregions_hpa
2676                                 += check_hpa_regions(
2677                                         dev->mem->regions[regionidx].guest_phys_address
2678                                         + dev->mem->regions[regionidx].address_offset,
2679                                         dev->mem->regions[regionidx].memory_size);
2680
2681                 }
2682
2683                 vdev->regions_hpa = rte_calloc("vhost hpa region",
2684                                                vdev->nregions_hpa,
2685                                                sizeof(struct virtio_memory_regions_hpa),
2686                                                RTE_CACHE_LINE_SIZE);
2687                 if (vdev->regions_hpa == NULL) {
2688                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2689                         rte_free(vdev);
2690                         return -1;
2691                 }
2692
2693
2694                 if (fill_hpa_memory_regions(
2695                         vdev->regions_hpa, dev->mem
2696                         ) != vdev->nregions_hpa) {
2697
2698                         RTE_LOG(ERR, VHOST_CONFIG,
2699                                 "hpa memory regions number mismatch: "
2700                                 "[%d]\n", vdev->nregions_hpa);
2701                         rte_free(vdev->regions_hpa);
2702                         rte_free(vdev);
2703                         return -1;
2704                 }
2705         }
2706
2707
2708         /* Add device to main ll */
2709         ll_dev = get_data_ll_free_entry(&ll_root_free);
2710         if (ll_dev == NULL) {
2711                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2712                         "of %d devices per core has been reached\n",
2713                         dev->device_fh, num_devices);
2714                 if (vdev->regions_hpa)
2715                         rte_free(vdev->regions_hpa);
2716                 rte_free(vdev);
2717                 return -1;
2718         }
2719         ll_dev->vdev = vdev;
2720         add_data_ll_entry(&ll_root_used, ll_dev);
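             /*
              * Give the device its own VMDq RX queue: device_fh selects the VMDq
              * pool and vmdq_queue_base is the first queue owned by VMDq.
              */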
2721         vdev->vmdq_rx_q
2722                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2723
2724         if (zero_copy) {
2725                 uint32_t index = vdev->vmdq_rx_q;
2726                 uint32_t count_in_ring, i;
2727                 struct mbuf_table *tx_q;
2728
2729                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2730
2731                 LOG_DEBUG(VHOST_CONFIG,
2732                         "(%"PRIu64") in new_device: mbuf count in mempool "
2733                         "before attach is: %d\n",
2734                         dev->device_fh,
2735                         rte_mempool_count(vpool_array[index].pool));
2736                 LOG_DEBUG(VHOST_CONFIG,
2737                         "(%"PRIu64") in new_device: mbuf count in ring "
2738                         "before attach is: %d\n",
2739                         dev->device_fh, count_in_ring);
2740
2741                 /*
2742                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2743                  */
2744                 for (i = 0; i < count_in_ring; i++)
2745                         attach_rxmbuf_zcp(dev);
2746
2747                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2748                         "mempool after attach is: %d\n",
2749                         dev->device_fh,
2750                         rte_mempool_count(vpool_array[index].pool));
2751                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2752                         "ring after attach is: %d\n",
2753                         dev->device_fh,
2754                         rte_ring_count(vpool_array[index].ring));
2755
2756                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2757                 tx_q->txq_id = vdev->vmdq_rx_q;
2758
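                     /*
                      * Start the device's dedicated RX/TX queues now that guest
                      * buffers are attached; the queues are assumed to have been
                      * configured for deferred start at port initialisation.
                      */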
2759                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2760                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2761
2762                         LOG_DEBUG(VHOST_CONFIG,
2763                                 "(%"PRIu64") In new_device: Failed to start "
2764                                 "tx queue:%d\n",
2765                                 dev->device_fh, vdev->vmdq_rx_q);
2766
2767                         mbuf_destroy_zcp(vpool);
2768                         rte_free(vdev->regions_hpa);
2769                         rte_free(vdev);
2770                         return -1;
2771                 }
2772
2773                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2774                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2775
2776                         LOG_DEBUG(VHOST_CONFIG,
2777                                 "(%"PRIu64") In new_device: Failed to start "
2778                                 "rx queue:%d\n",
2779                                 dev->device_fh, vdev->vmdq_rx_q);
2780
2781                         /* Stop the TX queue. */
2782                         if (rte_eth_dev_tx_queue_stop(ports[0],
2783                                 vdev->vmdq_rx_q) != 0) {
2784                                 LOG_DEBUG(VHOST_CONFIG,
2785                                         "(%"PRIu64") In new_device: Failed to "
2786                                         "stop tx queue:%d\n",
2787                                         dev->device_fh, vdev->vmdq_rx_q);
2788                         }
2789
2790                         mbuf_destroy_zcp(vpool);
2791                         rte_free(vdev->regions_hpa);
2792                         rte_free(vdev);
2793                         return -1;
2794                 }
2795
2796         }
2797
2798         /* Reset the ready flag. */
2799         vdev->ready = DEVICE_MAC_LEARNING;
2800         vdev->remove = 0;
2801
2802         /* Find a suitable lcore to add the device. */
2803         RTE_LCORE_FOREACH_SLAVE(lcore) {
2804                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2805                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2806                         core_add = lcore;
2807                 }
2808         }
2809         /* Add device to lcore ll */
2810         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2811         if (ll_dev == NULL) {
2812                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2813                 vdev->ready = DEVICE_SAFE_REMOVE;
2814                 destroy_device(dev);
2815                 rte_free(vdev->regions_hpa);
2816                 rte_free(vdev);
2817                 return -1;
2818         }
2819         ll_dev->vdev = vdev;
2820         vdev->coreid = core_add;
2821
2822         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2823
2824         /* Initialize device stats */
2825         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2826
2827         /* Disable notifications. */
2828         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2829         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2830         lcore_info[vdev->coreid].lcore_ll->device_num++;
2831         dev->flags |= VIRTIO_DEV_RUNNING;
2832
2833         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2834
2835         return 0;
2836 }
2837
2838 /*
2839  * These callbacks allow devices to be added to the data core when configuration
2840  * has fully completed.
2841  */
2842 static const struct virtio_net_device_ops virtio_net_device_ops =
2843 {
2844         .new_device =  new_device,
2845         .destroy_device = destroy_device,
2846 };
2847
2848 /*
2849  * This thread wakes up periodically to print statistics if the user has
2850  * enabled them.
2851  */
2852 static void
2853 print_stats(void)
2854 {
2855         struct virtio_net_data_ll *dev_ll;
2856         uint64_t tx_dropped, rx_dropped;
2857         uint64_t tx, tx_total, rx, rx_total;
2858         uint32_t device_fh;
2859         const char clr[] = { 27, '[', '2', 'J', '\0' };
2860         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2861
2862         while(1) {
2863                 sleep(enable_stats);
2864
2865                 /* Clear screen and move to top left */
2866                 printf("%s%s", clr, top_left);
2867
2868                 printf("\nDevice statistics ====================================");
2869
2870                 dev_ll = ll_root_used;
2871                 while (dev_ll != NULL) {
2872                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2873                         tx_total = dev_statistics[device_fh].tx_total;
2874                         tx = dev_statistics[device_fh].tx;
2875                         tx_dropped = tx_total - tx;
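                             /*
                              * RX counters are kept as atomics in the non-zero-copy
                              * path, so they are read atomically here.
                              */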
2876                         if (zero_copy == 0) {
2877                                 rx_total = rte_atomic64_read(
2878                                         &dev_statistics[device_fh].rx_total_atomic);
2879                                 rx = rte_atomic64_read(
2880                                         &dev_statistics[device_fh].rx_atomic);
2881                         } else {
2882                                 rx_total = dev_statistics[device_fh].rx_total;
2883                                 rx = dev_statistics[device_fh].rx;
2884                         }
2885                         rx_dropped = rx_total - rx;
2886
2887                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2888                                         "\nTX total:            %"PRIu64""
2889                                         "\nTX dropped:          %"PRIu64""
2890                                         "\nTX successful:               %"PRIu64""
2891                                         "\nRX total:            %"PRIu64""
2892                                         "\nRX dropped:          %"PRIu64""
2893                                         "\nRX successful:               %"PRIu64"",
2894                                         device_fh,
2895                                         tx_total,
2896                                         tx_dropped,
2897                                         tx,
2898                                         rx_total,
2899                                         rx_dropped,
2900                                         rx);
2901
2902                         dev_ll = dev_ll->next;
2903                 }
2904                 printf("\n======================================================\n");
2905         }
2906 }
2907
2908 static void
2909 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2910         char *ring_name, uint32_t nb_mbuf)
2911 {
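             /*
              * Create a mempool and a companion ring for one zero-copy queue; the
              * ring holds the mbufs that are not currently attached to guest
              * buffers.
              */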
2912         vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2913                 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2914         if (vpool_array[index].pool != NULL) {
2915                 vpool_array[index].ring
2916                         = rte_ring_create(ring_name,
2917                                 rte_align32pow2(nb_mbuf + 1),
2918                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2919                 if (likely(vpool_array[index].ring != NULL)) {
2920                         LOG_DEBUG(VHOST_CONFIG,
2921                                 "in setup_mempool_tbl: mbuf count in "
2922                                 "mempool is: %d\n",
2923                                 rte_mempool_count(vpool_array[index].pool));
2924                         LOG_DEBUG(VHOST_CONFIG,
2925                                 "in setup_mempool_tbl: mbuf count in "
2926                                 "ring is: %d\n",
2927                                 rte_ring_count(vpool_array[index].ring));
2928                 } else {
2929                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2930                                 ring_name);
2931                 }
2932
2933         /* Need to take the headroom into account. */
2934                 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2935         } else {
2936                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2937         }
2938 }
2939
2940 /* When we receive an INT signal, unregister the vhost driver. */
2941 static void
2942 sigint_handler(__rte_unused int signum)
2943 {
2944         /* Unregister vhost driver. */
2945         int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2946         if (ret != 0)
2947                 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2948         exit(0);
2949 }
2950
2951 /*
2952  * Main function: does initialisation and calls the per-lcore functions. The CUSE
2953  * device is also registered here to handle the IOCTLs.
2954  */
2955 int
2956 main(int argc, char *argv[])
2957 {
2958         struct rte_mempool *mbuf_pool = NULL;
2959         unsigned lcore_id, core_id = 0;
2960         unsigned nb_ports, valid_num_ports;
2961         int ret;
2962         uint8_t portid;
2963         uint16_t queue_id;
2964         static pthread_t tid;
2965         char thread_name[RTE_MAX_THREAD_NAME_LEN];
2966
2967         signal(SIGINT, sigint_handler);
2968
2969         /* init EAL */
2970         ret = rte_eal_init(argc, argv);
2971         if (ret < 0)
2972                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2973         argc -= ret;
2974         argv += ret;
2975
2976         /* parse app arguments */
2977         ret = us_vhost_parse_args(argc, argv);
2978         if (ret < 0)
2979                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2980
2981         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2982                 if (rte_lcore_is_enabled(lcore_id))
2983                         lcore_ids[core_id ++] = lcore_id;
2984
2985         if (rte_lcore_count() > RTE_MAX_LCORE)
2986                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
2987
2988         /* Set the number of switching cores available. */
2989         num_switching_cores = rte_lcore_count()-1;
2990
2991         /* Get the number of physical ports. */
2992         nb_ports = rte_eth_dev_count();
2993         if (nb_ports > RTE_MAX_ETHPORTS)
2994                 nb_ports = RTE_MAX_ETHPORTS;
2995
2996         /*
2997          * Update the global var NUM_PORTS and global array PORTS,
2998          * and get the value of VALID_NUM_PORTS according to the number of system ports.
2999          */
3000         valid_num_ports = check_ports_num(nb_ports);
3001
3002         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3003                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
3004                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
3005                 return -1;
3006         }
3007
3008         if (zero_copy == 0) {
3009                 /* Create the mbuf pool. */
3010                 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
3011                         NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
3012                         0, MBUF_DATA_SIZE, rte_socket_id());
3013                 if (mbuf_pool == NULL)
3014                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3015
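                     /* In non-zero-copy mode every queue shares the single mbuf pool. */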
3016                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3017                         vpool_array[queue_id].pool = mbuf_pool;
3018
3019                 if (vm2vm_mode == VM2VM_HARDWARE) {
3020                         /* Enable VT loop back to let L2 switch to do it. */
3021                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3022                         LOG_DEBUG(VHOST_CONFIG,
3023                                 "Enable loop back for L2 switch in vmdq.\n");
3024                 }
3025         } else {
3026                 uint32_t nb_mbuf;
3027                 char pool_name[RTE_MEMPOOL_NAMESIZE];
3028                 char ring_name[RTE_MEMPOOL_NAMESIZE];
3029
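                     /*
                      * Size each per-queue zero-copy mempool from the descriptor
                      * count plus per-core cache and burst headroom.
                      */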
3030                 nb_mbuf = num_rx_descriptor
3031                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3032                         + num_switching_cores * MAX_PKT_BURST;
3033
3034                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3035                         snprintf(pool_name, sizeof(pool_name),
3036                                 "rxmbuf_pool_%u", queue_id);
3037                         snprintf(ring_name, sizeof(ring_name),
3038                                 "rxmbuf_ring_%u", queue_id);
3039                         setup_mempool_tbl(rte_socket_id(), queue_id,
3040                                 pool_name, ring_name, nb_mbuf);
3041                 }
3042
3043                 nb_mbuf = num_tx_descriptor
3044                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3045                                 + num_switching_cores * MAX_PKT_BURST;
3046
3047                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3048                         snprintf(pool_name, sizeof(pool_name),
3049                                 "txmbuf_pool_%u", queue_id);
3050                         snprintf(ring_name, sizeof(ring_name),
3051                                 "txmbuf_ring_%u", queue_id);
3052                         setup_mempool_tbl(rte_socket_id(),
3053                                 (queue_id + MAX_QUEUES),
3054                                 pool_name, ring_name, nb_mbuf);
3055                 }
3056
3057                 if (vm2vm_mode == VM2VM_HARDWARE) {
3058                         /* Enable VT loop back to let L2 switch to do it. */
3059                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3060                         LOG_DEBUG(VHOST_CONFIG,
3061                                 "Enable loop back for L2 switch in vmdq.\n");
3062                 }
3063         }
3064         /* Set log level. */
3065         rte_set_log_level(LOG_LEVEL);
3066
3067         /* initialize all ports */
3068         for (portid = 0; portid < nb_ports; portid++) {
3069                 /* skip ports that are not enabled */
3070                 if ((enabled_port_mask & (1 << portid)) == 0) {
3071                         RTE_LOG(INFO, VHOST_PORT,
3072                                 "Skipping disabled port %d\n", portid);
3073                         continue;
3074                 }
3075                 if (port_init(portid) != 0)
3076                         rte_exit(EXIT_FAILURE,
3077                                 "Cannot initialize network ports\n");
3078         }
3079
3080         /* Initialise all linked lists. */
3081         if (init_data_ll() == -1)
3082                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3083
3084         /* Initialize device stats */
3085         memset(&dev_statistics, 0, sizeof(dev_statistics));
3086
3087         /* Enable stats if the user option is set. */
3088         if (enable_stats) {
3089                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3090                 if (ret != 0)
3091                         rte_exit(EXIT_FAILURE,
3092                                 "Cannot create print-stats thread\n");
3093
3094                 /* Set thread_name to aid debugging. */
3095                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3096                 ret = rte_thread_setname(tid, thread_name);
3097                 if (ret != 0)
3098                         RTE_LOG(ERR, VHOST_CONFIG,
3099                                 "Cannot set print-stats name\n");
3100         }
3101
3102         /* Launch all data cores. */
3103         if (zero_copy == 0) {
3104                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3105                         rte_eal_remote_launch(switch_worker,
3106                                 mbuf_pool, lcore_id);
3107                 }
3108         } else {
3109                 uint32_t count_in_mempool, index, i;
3110                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3111                         /* For all RX and TX queues. */
3112                         count_in_mempool
3113                                 = rte_mempool_count(vpool_array[index].pool);
3114
3115                         /*
3116                          * Transfer all un-attached mbufs from vpool.pool
3117                          * to vpool.ring.
3118                          */
3119                         for (i = 0; i < count_in_mempool; i++) {
3120                                 struct rte_mbuf *mbuf
3121                                         = __rte_mbuf_raw_alloc(
3122                                                 vpool_array[index].pool);
3123                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3124                                                 (void *)mbuf);
3125                         }
3126
3127                         LOG_DEBUG(VHOST_CONFIG,
3128                                 "in main: initial mbuf count in mempool "
3129                                 "is: %d\n", count_in_mempool);
3130                         LOG_DEBUG(VHOST_CONFIG,
3131                                 "in main: initial mbuf count in ring is:"
3132                                 " %d\n",
3133                                 rte_ring_count(vpool_array[index].ring));
3134                 }
3135
3136                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3137                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3138                                 lcore_id);
3139         }
3140
3141         if (mergeable == 0)
3142                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3143
3144         /* Register vhost (CUSE or user) driver to handle vhost messages. */
3145         ret = rte_vhost_driver_register((char *)&dev_basename);
3146         if (ret != 0)
3147                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3148
3149         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3150
3151         /* Start CUSE session. */
3152         rte_vhost_driver_session_start();
3153         return 0;
3154
3155 }