1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55
56 #include "main.h"
57
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64
65 /*
66  * Calculate the number of buffers needed per port
67  */
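/* i.e. NIC RX descriptors for every queue, plus one in-flight burst and the TX descriptors for every switching core, plus one mempool cache per core (and one spare). */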
68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +             \
69                                                         (num_switching_cores*MAX_PKT_BURST) +                   \
70                                                         (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
71                                                         ((num_switching_cores+1)*MBUF_CACHE_SIZE))
72
73 #define MBUF_CACHE_SIZE 128
74 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
75
76 /*
77  * For the zero copy implementation no frame data buffers need to be
78  * allocated on the host; the guest allocates the frame data buffers and
79  * vhost uses them directly.
80  */
81 #define VIRTIO_DESCRIPTOR_LEN_ZCP       RTE_MBUF_DEFAULT_DATAROOM
82 #define MBUF_DATA_SIZE_ZCP              RTE_MBUF_DEFAULT_BUF_SIZE
83 #define MBUF_CACHE_SIZE_ZCP 0
84
85 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
86 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
87
88 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
89 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
90
91 #define JUMBO_FRAME_MAX_SIZE    0x2600
92
93 /* State of virtio device. */
94 #define DEVICE_MAC_LEARNING 0
95 #define DEVICE_RX                       1
96 #define DEVICE_SAFE_REMOVE      2
97
98 /* Config_core_flag status definitions. */
99 #define REQUEST_DEV_REMOVAL 1
100 #define ACK_DEV_REMOVAL 0
101
102 /* Configurable number of RX/TX ring descriptors */
103 #define RTE_TEST_RX_DESC_DEFAULT 1024
104 #define RTE_TEST_TX_DESC_DEFAULT 512
105
106 /*
107  * These two macros need refinement for the legacy and DPDK-based front ends:
108  * take the max vring avail descriptors/entries from the guest minus
109  * MAX_PKT_BURST, then round to a power of 2.
110  */
111 /*
112  * For the legacy front end: 128 descriptors,
113  * half for virtio headers, the other half for mbufs.
114  */
115 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
116 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
117
118 /* Get first 4 bytes in mbuf headroom. */
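/* The headroom begins right after struct rte_mbuf (assuming no mbuf private area), so this reads its first 4 bytes as a uint32_t. */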
119 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
120                 + sizeof(struct rte_mbuf)))
121
122 /* true if x is a power of 2 */
123 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
124
125 #define INVALID_PORT_ID 0xFF
126
127 /* Max number of devices. Limited by vmdq. */
128 #define MAX_DEVICES 64
129
130 /* Size of buffers used for snprintfs. */
131 #define MAX_PRINT_BUFF 6072
132
133 /* Maximum character device basename size. */
134 #define MAX_BASENAME_SZ 10
135
136 /* Maximum long option length for option parsing. */
137 #define MAX_LONG_OPT_SZ 64
138
139 /* Used to compare MAC addresses. */
140 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
141
142 /* Number of descriptors per cacheline. */
143 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
144
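/* True if the mbuf is indirect, i.e. its data buffer belongs to another mbuf. */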
145 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
146
147 /* mask of enabled ports */
148 static uint32_t enabled_port_mask = 0;
149
150 /* Promiscuous mode */
151 static uint32_t promiscuous;
152
153 /* Number of switching cores enabled */
154 static uint32_t num_switching_cores = 0;
155
156 /* Number of devices/queues to support */
157 static uint32_t num_queues = 0;
158 static uint32_t num_devices;
159
160 /*
161  * Enable zero copy: packet buffers are DMA'd directly to/from the HW
162  * descriptors. Disabled by default.
163  */
164 static uint32_t zero_copy;
165 static int mergeable;
166
167 /* Do VLAN strip on the host; enabled by default */
168 static uint32_t vlan_strip = 1;
169
170 /* number of descriptors to apply*/
171 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
172 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
173
174 /* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
175 #define MAX_RING_DESC 4096
176
177 struct vpool {
178         struct rte_mempool *pool;
179         struct rte_ring *ring;
180         uint32_t buf_size;
181 } vpool_array[MAX_QUEUES+MAX_QUEUES];
182
183 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
184 typedef enum {
185         VM2VM_DISABLED = 0,
186         VM2VM_SOFTWARE = 1,
187         VM2VM_HARDWARE = 2,
188         VM2VM_LAST
189 } vm2vm_type;
190 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
191
192 /* The type of host physical address translated from guest physical address. */
193 typedef enum {
194         PHYS_ADDR_CONTINUOUS = 0,
195         PHYS_ADDR_CROSS_SUBREG = 1,
196         PHYS_ADDR_INVALID = 2,
197         PHYS_ADDR_LAST
198 } hpa_type;
199
200 /* Enable stats. */
201 static uint32_t enable_stats = 0;
202 /* Enable retries on RX. */
203 static uint32_t enable_retry = 1;
204
205 /* Enable TX checksum offload; disabled by default. */
206 static uint32_t enable_tx_csum;
207
208 /* Enable TSO offload; disabled by default. */
209 static uint32_t enable_tso;
210
211 /* Specify the timeout (in microseconds) between retries on RX. */
212 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
213 /* Specify the number of retries on RX. */
214 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
215
216 /* Character device basename. Can be set by user. */
217 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
218
219 /* Empty VMDQ configuration structure. Filled in programmatically. */
220 static struct rte_eth_conf vmdq_conf_default = {
221         .rxmode = {
222                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
223                 .split_hdr_size = 0,
224                 .header_split   = 0, /**< Header Split disabled */
225                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
226                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
227                 /*
228                  * VLAN strip is necessary for 1G NICs such as the I350;
229                  * it fixes a bug where IPv4 forwarding in the guest cannot
230                  * forward packets from one virtio dev to another virtio dev.
231                  */
232                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
233                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
234                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
235         },
236
237         .txmode = {
238                 .mq_mode = ETH_MQ_TX_NONE,
239         },
240         .rx_adv_conf = {
241                 /*
242                  * should be overridden separately in code with
243                  * appropriate values
244                  */
245                 .vmdq_rx_conf = {
246                         .nb_queue_pools = ETH_8_POOLS,
247                         .enable_default_pool = 0,
248                         .default_pool = 0,
249                         .nb_pool_maps = 0,
250                         .pool_map = {{0, 0},},
251                 },
252         },
253 };
254
255 static unsigned lcore_ids[RTE_MAX_LCORE];
256 static uint8_t ports[RTE_MAX_ETHPORTS];
257 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
258 static uint16_t num_pf_queues, num_vmdq_queues;
259 static uint16_t vmdq_pool_base, vmdq_queue_base;
260 static uint16_t queues_per_pool;
261
262 static const uint16_t external_pkt_default_vlan_tag = 2000;
263 const uint16_t vlan_tags[] = {
264         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
265         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
266         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
267         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
268         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
269         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
270         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
271         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
272 };
273
274 /* ethernet addresses of ports */
275 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
276
277 /* heads for the main used and free linked lists for the data path. */
278 static struct virtio_net_data_ll *ll_root_used = NULL;
279 static struct virtio_net_data_ll *ll_root_free = NULL;
280
281 /* Array of data core structures containing information on individual core linked lists. */
282 static struct lcore_info lcore_info[RTE_MAX_LCORE];
283
284 /* Used for queueing bursts of TX packets. */
285 struct mbuf_table {
286         unsigned len;
287         unsigned txq_id;
288         struct rte_mbuf *m_table[MAX_PKT_BURST];
289 };
290
291 /* TX queue for each data core. */
292 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
293
294 /* TX queue for each virtio device for zero copy. */
295 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
296
297 /* Vlan header struct used to insert vlan tags on TX. */
298 struct vlan_ethhdr {
299         unsigned char   h_dest[ETH_ALEN];
300         unsigned char   h_source[ETH_ALEN];
301         __be16          h_vlan_proto;
302         __be16          h_vlan_TCI;
303         __be16          h_vlan_encapsulated_proto;
304 };
305
306 /* Header lengths. */
307 #define VLAN_HLEN       4
308 #define VLAN_ETH_HLEN   18
309
310 /* Per-device statistics struct */
311 struct device_statistics {
312         uint64_t tx_total;
313         rte_atomic64_t rx_total_atomic;
314         uint64_t rx_total;
315         uint64_t tx;
316         rte_atomic64_t rx_atomic;
317         uint64_t rx;
318 } __rte_cache_aligned;
319 struct device_statistics dev_statistics[MAX_DEVICES];
320
321 /*
322  * Builds up the correct configuration for VMDQ VLAN pool map
323  * according to the pool & queue limits.
324  */
325 static inline int
326 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
327 {
328         struct rte_eth_vmdq_rx_conf conf;
329         struct rte_eth_vmdq_rx_conf *def_conf =
330                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
331         unsigned i;
332
333         memset(&conf, 0, sizeof(conf));
334         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
335         conf.nb_pool_maps = num_devices;
336         conf.enable_loop_back = def_conf->enable_loop_back;
337         conf.rx_mode = def_conf->rx_mode;
338
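        /* Give each pool its own VLAN tag so each virtio device maps to exactly one VMDQ pool. */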
339         for (i = 0; i < conf.nb_pool_maps; i++) {
340                 conf.pool_map[i].vlan_id = vlan_tags[ i ];
341                 conf.pool_map[i].pools = (1UL << i);
342         }
343
344         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
345         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
346                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
347         return 0;
348 }
349
350 /*
351  * Validate the device number against the max pool number obtained from
352  * dev_info. If the device number is invalid, print an error message and
353  * return -1. Each device must have its own pool.
354  */
355 static inline int
356 validate_num_devices(uint32_t max_nb_devices)
357 {
358         if (num_devices > max_nb_devices) {
359                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
360                 return -1;
361         }
362         return 0;
363 }
364
365 /*
366  * Initialises a given port using global settings, with the RX buffers
367  * coming from the per-queue mempools in vpool_array.
368  */
369 static inline int
370 port_init(uint8_t port)
371 {
372         struct rte_eth_dev_info dev_info;
373         struct rte_eth_conf port_conf;
374         struct rte_eth_rxconf *rxconf;
375         struct rte_eth_txconf *txconf;
376         int16_t rx_rings, tx_rings;
377         uint16_t rx_ring_size, tx_ring_size;
378         int retval;
379         uint16_t q;
380
381         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
382         rte_eth_dev_info_get (port, &dev_info);
383
384         if (dev_info.max_rx_queues > MAX_QUEUES) {
385                 rte_exit(EXIT_FAILURE,
386                         "please define MAX_QUEUES no less than %u in %s\n",
387                         dev_info.max_rx_queues, __FILE__);
388         }
389
390         rxconf = &dev_info.default_rxconf;
391         txconf = &dev_info.default_txconf;
392         rxconf->rx_drop_en = 1;
393
394         /* Enable vlan offload */
395         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
396
397         /*
398          * Zero copy defers queue RX/TX start to the time when guest
399          * finishes its startup and packet buffers from that guest are
400          * available.
401          */
402         if (zero_copy) {
403                 rxconf->rx_deferred_start = 1;
404                 rxconf->rx_drop_en = 0;
405                 txconf->tx_deferred_start = 1;
406         }
407
408         /* Configure the number of supported virtio devices based on VMDQ limits. */
409         num_devices = dev_info.max_vmdq_pools;
410
411         if (zero_copy) {
412                 rx_ring_size = num_rx_descriptor;
413                 tx_ring_size = num_tx_descriptor;
414                 tx_rings = dev_info.max_tx_queues;
415         } else {
416                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
417                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
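                /* One TX queue per lcore: each data core transmits on its own queue of the physical port. */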
418                 tx_rings = (uint16_t)rte_lcore_count();
419         }
420
421         retval = validate_num_devices(MAX_DEVICES);
422         if (retval < 0)
423                 return retval;
424
425         /* Get port configuration. */
426         retval = get_eth_conf(&port_conf, num_devices);
427         if (retval < 0)
428                 return retval;
429         /* NIC queues are divided into pf queues and vmdq queues.  */
430         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
431         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
432         num_vmdq_queues = num_devices * queues_per_pool;
433         num_queues = num_pf_queues + num_vmdq_queues;
434         vmdq_queue_base = dev_info.vmdq_queue_base;
435         vmdq_pool_base  = dev_info.vmdq_pool_base;
436         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
437                 num_pf_queues, num_devices, queues_per_pool);
438
439         if (port >= rte_eth_dev_count()) return -1;
440
441         if (enable_tx_csum == 0)
442                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
443
444         if (enable_tso == 0) {
445                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
446                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
447         }
448
449         rx_rings = (uint16_t)dev_info.max_rx_queues;
450         /* Configure ethernet device. */
451         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452         if (retval != 0)
453                 return retval;
454
455         /* Setup the queues. */
456         for (q = 0; q < rx_rings; q ++) {
457                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458                                                 rte_eth_dev_socket_id(port),
459                                                 rxconf,
460                                                 vpool_array[q].pool);
461                 if (retval < 0)
462                         return retval;
463         }
464         for (q = 0; q < tx_rings; q ++) {
465                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
466                                                 rte_eth_dev_socket_id(port),
467                                                 txconf);
468                 if (retval < 0)
469                         return retval;
470         }
471
472         /* Start the device. */
473         retval  = rte_eth_dev_start(port);
474         if (retval < 0) {
475                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
476                 return retval;
477         }
478
479         if (promiscuous)
480                 rte_eth_promiscuous_enable(port);
481
482         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
483         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
484         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
485                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
486                         (unsigned)port,
487                         vmdq_ports_eth_addr[port].addr_bytes[0],
488                         vmdq_ports_eth_addr[port].addr_bytes[1],
489                         vmdq_ports_eth_addr[port].addr_bytes[2],
490                         vmdq_ports_eth_addr[port].addr_bytes[3],
491                         vmdq_ports_eth_addr[port].addr_bytes[4],
492                         vmdq_ports_eth_addr[port].addr_bytes[5]);
493
494         return 0;
495 }
496
497 /*
498  * Set character device basename.
499  */
500 static int
501 us_vhost_parse_basename(const char *q_arg)
502 {
503         /* The basename must fit in the buffer, including the NUL terminator. */
504
505         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
506                 return -1;
507         else
508                 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
509
510         return 0;
511 }
512
513 /*
514  * Parse the portmask provided at run time.
515  */
516 static int
517 parse_portmask(const char *portmask)
518 {
519         char *end = NULL;
520         unsigned long pm;
521
522         errno = 0;
523
524         /* parse hexadecimal string */
525         pm = strtoul(portmask, &end, 16);
526         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
527                 return -1;
528
529         if (pm == 0)
530                 return -1;
531
532         return pm;
533
534 }
535
536 /*
537  * Parse num options at run time.
538  */
539 static int
540 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
541 {
542         char *end = NULL;
543         unsigned long num;
544
545         errno = 0;
546
547         /* parse unsigned int string */
548         num = strtoul(q_arg, &end, 10);
549         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
550                 return -1;
551
552         if (num > max_valid_value)
553                 return -1;
554
555         return num;
556
557 }
558
559 /*
560  * Display usage
561  */
562 static void
563 us_vhost_usage(const char *prgname)
564 {
565         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
566         "               --vm2vm [0|1|2]\n"
567         "               --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
568         "               --dev-basename <name>\n"
569         "               --nb-devices ND\n"
570         "               -p PORTMASK: Set mask for ports to be used by application\n"
571         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
572         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
573         "               --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
574         "               --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
575         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
576         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
577         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
578         "               --dev-basename: The basename to be used for the character device.\n"
579         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
580                         "zero copy\n"
581         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
582                         "used only when zero copy is enabled.\n"
583         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
584                         "used only when zero copy is enabled.\n"
585         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
586         "               --tso [0|1] disable/enable TCP segment offload.\n",
587                prgname);
588 }
589
590 /*
591  * Parse the arguments given in the command line of the application.
592  */
593 static int
594 us_vhost_parse_args(int argc, char **argv)
595 {
596         int opt, ret;
597         int option_index;
598         unsigned i;
599         const char *prgname = argv[0];
600         static struct option long_option[] = {
601                 {"vm2vm", required_argument, NULL, 0},
602                 {"rx-retry", required_argument, NULL, 0},
603                 {"rx-retry-delay", required_argument, NULL, 0},
604                 {"rx-retry-num", required_argument, NULL, 0},
605                 {"mergeable", required_argument, NULL, 0},
606                 {"vlan-strip", required_argument, NULL, 0},
607                 {"stats", required_argument, NULL, 0},
608                 {"dev-basename", required_argument, NULL, 0},
609                 {"zero-copy", required_argument, NULL, 0},
610                 {"rx-desc-num", required_argument, NULL, 0},
611                 {"tx-desc-num", required_argument, NULL, 0},
612                 {"tx-csum", required_argument, NULL, 0},
613                 {"tso", required_argument, NULL, 0},
614                 {NULL, 0, 0, 0},
615         };
616
617         /* Parse command line */
618         while ((opt = getopt_long(argc, argv, "p:P",
619                         long_option, &option_index)) != EOF) {
620                 switch (opt) {
621                 /* Portmask */
622                 case 'p':
623                         enabled_port_mask = parse_portmask(optarg);
624                         if (enabled_port_mask == 0) {
625                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
626                                 us_vhost_usage(prgname);
627                                 return -1;
628                         }
629                         break;
630
631                 case 'P':
632                         promiscuous = 1;
633                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
634                                 ETH_VMDQ_ACCEPT_BROADCAST |
635                                 ETH_VMDQ_ACCEPT_MULTICAST;
636                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
637
638                         break;
639
640                 case 0:
641                         /* Enable/disable vm2vm comms. */
642                         if (!strncmp(long_option[option_index].name, "vm2vm",
643                                 MAX_LONG_OPT_SZ)) {
644                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
645                                 if (ret == -1) {
646                                         RTE_LOG(INFO, VHOST_CONFIG,
647                                                 "Invalid argument for "
648                                                 "vm2vm [0|1|2]\n");
649                                         us_vhost_usage(prgname);
650                                         return -1;
651                                 } else {
652                                         vm2vm_mode = (vm2vm_type)ret;
653                                 }
654                         }
655
656                         /* Enable/disable retries on RX. */
657                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
658                                 ret = parse_num_opt(optarg, 1);
659                                 if (ret == -1) {
660                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
661                                         us_vhost_usage(prgname);
662                                         return -1;
663                                 } else {
664                                         enable_retry = ret;
665                                 }
666                         }
667
668                         /* Enable/disable TX checksum offload. */
669                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
670                                 ret = parse_num_opt(optarg, 1);
671                                 if (ret == -1) {
672                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
673                                         us_vhost_usage(prgname);
674                                         return -1;
675                                 } else
676                                         enable_tx_csum = ret;
677                         }
678
679                         /* Enable/disable TSO offload. */
680                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
681                                 ret = parse_num_opt(optarg, 1);
682                                 if (ret == -1) {
683                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
684                                         us_vhost_usage(prgname);
685                                         return -1;
686                                 } else
687                                         enable_tso = ret;
688                         }
689
690                         /* Specify the retry delay time (in microseconds) on RX. */
691                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
692                                 ret = parse_num_opt(optarg, INT32_MAX);
693                                 if (ret == -1) {
694                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
695                                         us_vhost_usage(prgname);
696                                         return -1;
697                                 } else {
698                                         burst_rx_delay_time = ret;
699                                 }
700                         }
701
702                         /* Specify the retries number on RX. */
703                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
704                                 ret = parse_num_opt(optarg, INT32_MAX);
705                                 if (ret == -1) {
706                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
707                                         us_vhost_usage(prgname);
708                                         return -1;
709                                 } else {
710                                         burst_rx_retry_num = ret;
711                                 }
712                         }
713
714                         /* Enable/disable RX mergeable buffers. */
715                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
716                                 ret = parse_num_opt(optarg, 1);
717                                 if (ret == -1) {
718                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
719                                         us_vhost_usage(prgname);
720                                         return -1;
721                                 } else {
722                                         mergeable = !!ret;
723                                         if (ret) {
724                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
725                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
726                                                         = JUMBO_FRAME_MAX_SIZE;
727                                         }
728                                 }
729                         }
730
731                         /* Enable/disable RX VLAN strip on host. */
732                         if (!strncmp(long_option[option_index].name,
733                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
734                                 ret = parse_num_opt(optarg, 1);
735                                 if (ret == -1) {
736                                         RTE_LOG(INFO, VHOST_CONFIG,
737                                                 "Invalid argument for VLAN strip [0|1]\n");
738                                         us_vhost_usage(prgname);
739                                         return -1;
740                                 } else {
741                                         vlan_strip = !!ret;
742                                         vmdq_conf_default.rxmode.hw_vlan_strip =
743                                                 vlan_strip;
744                                 }
745                         }
746
747                         /* Enable/disable stats. */
748                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
749                                 ret = parse_num_opt(optarg, INT32_MAX);
750                                 if (ret == -1) {
751                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
752                                         us_vhost_usage(prgname);
753                                         return -1;
754                                 } else {
755                                         enable_stats = ret;
756                                 }
757                         }
758
759                         /* Set character device basename. */
760                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
761                                 if (us_vhost_parse_basename(optarg) == -1) {
762                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
763                                         us_vhost_usage(prgname);
764                                         return -1;
765                                 }
766                         }
767
768                         /* Enable/disable rx/tx zero copy. */
769                         if (!strncmp(long_option[option_index].name,
770                                 "zero-copy", MAX_LONG_OPT_SZ)) {
771                                 ret = parse_num_opt(optarg, 1);
772                                 if (ret == -1) {
773                                         RTE_LOG(INFO, VHOST_CONFIG,
774                                                 "Invalid argument"
775                                                 " for zero-copy [0|1]\n");
776                                         us_vhost_usage(prgname);
777                                         return -1;
778                                 } else
779                                         zero_copy = ret;
780                         }
781
782                         /* Specify the descriptor number on RX. */
783                         if (!strncmp(long_option[option_index].name,
784                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
785                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
786                                 if ((ret == -1) || (!POWEROF2(ret))) {
787                                         RTE_LOG(INFO, VHOST_CONFIG,
788                                         "Invalid argument for rx-desc-num[0-N],"
789                                         "power of 2 required.\n");
790                                         us_vhost_usage(prgname);
791                                         return -1;
792                                 } else {
793                                         num_rx_descriptor = ret;
794                                 }
795                         }
796
797                         /* Specify the descriptor number on TX. */
798                         if (!strncmp(long_option[option_index].name,
799                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
800                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
801                                 if ((ret == -1) || (!POWEROF2(ret))) {
802                                         RTE_LOG(INFO, VHOST_CONFIG,
803                                         "Invalid argument for tx-desc-num [0-N],"
804                                         "power of 2 required.\n");
805                                         us_vhost_usage(prgname);
806                                         return -1;
807                                 } else {
808                                         num_tx_descriptor = ret;
809                                 }
810                         }
811
812                         break;
813
814                         /* Invalid option - print options. */
815                 default:
816                         us_vhost_usage(prgname);
817                         return -1;
818                 }
819         }
820
821         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
822                 if (enabled_port_mask & (1 << i))
823                         ports[num_ports++] = (uint8_t)i;
824         }
825
826         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
827                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
828                         "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
829                 return -1;
830         }
831
832         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
833                 RTE_LOG(INFO, VHOST_PORT,
834                         "Vhost zero copy doesn't support software vm2vm,"
835                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
836                 return -1;
837         }
838
839         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
840                 RTE_LOG(INFO, VHOST_PORT,
841                         "Vhost zero copy doesn't support jumbo frame,"
842                         "please specify '--mergeable 0' to disable the "
843                         "mergeable feature.\n");
844                 return -1;
845         }
846
847         return 0;
848 }
849
850 /*
851  * Update the global variable num_ports and the ports array according to the
852  * number of system ports, and return the number of valid ports.
853  */
854 static unsigned check_ports_num(unsigned nb_ports)
855 {
856         unsigned valid_num_ports = num_ports;
857         unsigned portid;
858
859         if (num_ports > nb_ports) {
860                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
861                         num_ports, nb_ports);
862                 num_ports = nb_ports;
863         }
864
865         for (portid = 0; portid < num_ports; portid ++) {
866                 if (ports[portid] >= nb_ports) {
867                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
868                                 ports[portid], (nb_ports - 1));
869                         ports[portid] = INVALID_PORT_ID;
870                         valid_num_ports--;
871                 }
872         }
873         return valid_num_ports;
874 }
875
876 /*
877  * Macro to print out packet contents. Wrapped in debug define so that the
878  * data path is not affected when debug is disabled.
879  */
880 #if RTE_LOG_LEVEL >= RTE_LOG_DEBUG
881 #define PRINT_PACKET(device, addr, size, header) do {                                                                                                                           \
882         char *pkt_addr = (char*)(addr);                                                                                                                                                                 \
883         unsigned int index;                                                                                                                                                                                             \
884         char packet[MAX_PRINT_BUFF];                                                                                                                                                                    \
885                                                                                                                                                                                                                                         \
886         if ((header))                                                                                                                                                                                                   \
887                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size));                          \
888         else                                                                                                                                                                                                                    \
889                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size));                          \
890         for (index = 0; index < (size); index++) {                                                                                                                                              \
891                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF),    \
892                         "%02hhx ", pkt_addr[index]);                                                                                                                                                    \
893         }                                                                                                                                                                                                                               \
894         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n");     \
895                                                                                                                                                                                                                                         \
896         RTE_LOG(DEBUG, VHOST_DATA, "%s", packet);                                                                                                                                                                       \
897 } while(0)
898 #else
899 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
900 #endif
901
902 /*
903  * Function to convert guest physical addresses to vhost physical addresses.
904  * This is used to convert virtio buffer addresses.
905  */
906 static inline uint64_t __attribute__((always_inline))
907 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
908         uint32_t buf_len, hpa_type *addr_type)
909 {
910         struct virtio_memory_regions_hpa *region;
911         uint32_t regionidx;
912         uint64_t vhost_pa = 0;
913
914         *addr_type = PHYS_ADDR_INVALID;
915
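        /* Walk the guest memory regions: a buffer that fits entirely within one region is physically contiguous, otherwise it crosses a sub-region boundary. */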
916         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
917                 region = &vdev->regions_hpa[regionidx];
918                 if ((guest_pa >= region->guest_phys_address) &&
919                         (guest_pa <= region->guest_phys_address_end)) {
920                         vhost_pa = region->host_phys_addr_offset + guest_pa;
921                         if (likely((guest_pa + buf_len - 1)
922                                 <= region->guest_phys_address_end))
923                                 *addr_type = PHYS_ADDR_CONTINUOUS;
924                         else
925                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
926                         break;
927                 }
928         }
929
930         RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") GPA %p| HPA %p\n",
931                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
932                 (void *)(uintptr_t)vhost_pa);
933
934         return vhost_pa;
935 }
936
937 /*
938  * Compares a packet destination MAC address to a device MAC address.
939  */
940 static inline int __attribute__((always_inline))
941 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
942 {
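        /* Compare the two addresses with a single 64-bit XOR; MAC_ADDR_CMP masks the result down to the 6 MAC bytes (little-endian layout), so only those bytes must match. Note the loads read 2 bytes past the 6-byte addresses. */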
943         return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
944 }
945
946 /*
947  * This function learns the MAC address of the device and registers it along
948  * with a VLAN tag to a VMDQ pool.
949  */
950 static int
951 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
952 {
953         struct ether_hdr *pkt_hdr;
954         struct virtio_net_data_ll *dev_ll;
955         struct virtio_net *dev = vdev->dev;
956         int i, ret;
957
958         /* Learn MAC address of guest device from packet */
959         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
960
961         dev_ll = ll_root_used;
962
963         while (dev_ll != NULL) {
964                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
965                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
966                         return -1;
967                 }
968                 dev_ll = dev_ll->next;
969         }
970
971         for (i = 0; i < ETHER_ADDR_LEN; i++)
972                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
973
974         /* vlan_tag currently uses the device_id. */
975         vdev->vlan_tag = vlan_tags[dev->device_fh];
976
977         /* Print out VMDQ registration info. */
978         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
979                 dev->device_fh,
980                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
981                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
982                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
983                 vdev->vlan_tag);
984
985         /* Register the MAC address. */
986         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
987                                 (uint32_t)dev->device_fh + vmdq_pool_base);
988         if (ret)
989                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
990                                         dev->device_fh);
991
992         /* Enable stripping of the vlan tag as we handle routing. */
993         if (vlan_strip)
994                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
995                         (uint16_t)vdev->vmdq_rx_q, 1);
996
997         /* Set device as ready for RX. */
998         vdev->ready = DEVICE_RX;
999
1000         return 0;
1001 }
1002
1003 /*
1004  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1005  * queue before disabling RX on the device.
1006  */
1007 static inline void
1008 unlink_vmdq(struct vhost_dev *vdev)
1009 {
1010         unsigned i = 0;
1011         unsigned rx_count;
1012         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1013
1014         if (vdev->ready == DEVICE_RX) {
1015                 /*clear MAC and VLAN settings*/
1016                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1017                 for (i = 0; i < 6; i++)
1018                         vdev->mac_address.addr_bytes[i] = 0;
1019
1020                 vdev->vlan_tag = 0;
1021
1022                 /*Clear out the receive buffers*/
1023                 rx_count = rte_eth_rx_burst(ports[0],
1024                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1025
1026                 while (rx_count) {
1027                         for (i = 0; i < rx_count; i++)
1028                                 rte_pktmbuf_free(pkts_burst[i]);
1029
1030                         rx_count = rte_eth_rx_burst(ports[0],
1031                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1032                 }
1033
1034                 vdev->ready = DEVICE_MAC_LEARNING;
1035         }
1036 }
1037
1038 /*
1039  * Check if the packet destination MAC address is for a local device. If so then put
1040  * the packet on that device's RX queue. If not then return.
1041  */
1042 static inline int __attribute__((always_inline))
1043 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1044 {
1045         struct virtio_net_data_ll *dev_ll;
1046         struct ether_hdr *pkt_hdr;
1047         uint64_t ret = 0;
1048         struct virtio_net *dev = vdev->dev;
1049         struct virtio_net *tdev; /* destination virtio device */
1050
1051         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1052
1053         /*get the used devices list*/
1054         dev_ll = ll_root_used;
1055
1056         while (dev_ll != NULL) {
1057                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1058                                           &dev_ll->vdev->mac_address)) {
1059
1060                         /* Drop the packet if the TX packet is destined for the TX device. */
1061                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1062                                 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
1063                                         "Source and destination MAC addresses are the same. "
1064                                         "Dropping packet.\n",
1065                                         dev->device_fh);
1066                                 return 0;
1067                         }
1068                         tdev = dev_ll->vdev->dev;
1069
1070
1071                         RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
1072                                 "MAC address is local\n", tdev->device_fh);
1073
1074                         if (unlikely(dev_ll->vdev->remove)) {
1075                                 /*drop the packet if the device is marked for removal*/
1076                                 RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") "
1077                                         "Device is marked for removal\n", tdev->device_fh);
1078                         } else {
1079                                 /*send the packet to the local virtio device*/
1080                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1081                                 if (enable_stats) {
1082                                         rte_atomic64_add(
1083                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1084                                         1);
1085                                         rte_atomic64_add(
1086                                         &dev_statistics[tdev->device_fh].rx_atomic,
1087                                         ret);
1088                                         dev_statistics[dev->device_fh].tx_total++;
1089                                         dev_statistics[dev->device_fh].tx += ret;
1090                                 }
1091                         }
1092
1093                         return 0;
1094                 }
1095                 dev_ll = dev_ll->next;
1096         }
1097
1098         return -1;
1099 }
1100
1101 /*
1102  * Check if the destination MAC of a packet belongs to a local VM and,
1103  * if so, get its VLAN tag and the length offset.
1104  */
1105 static inline int __attribute__((always_inline))
1106 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1107         uint32_t *offset, uint16_t *vlan_tag)
1108 {
1109         struct virtio_net_data_ll *dev_ll = ll_root_used;
1110         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1111
1112         while (dev_ll != NULL) {
1113                 if ((dev_ll->vdev->ready == DEVICE_RX)
1114                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1115                 &dev_ll->vdev->mac_address)) {
1116                         /*
1117                          * Drop the packet if the TX packet is
1118                          * destined for the TX device.
1119                          */
1120                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1121                                 RTE_LOG(DEBUG, VHOST_DATA,
1122                                 "(%"PRIu64") TX: Source and destination"
1123                                 " MAC addresses are the same. Dropping "
1124                                 "packet.\n",
1125                                 dev_ll->vdev->dev->device_fh);
1126                                 return -1;
1127                         }
1128
1129                         /*
1130                          * HW VLAN strip reduces the packet length by the
1131                          * length of the VLAN tag, so restore the packet
1132                          * length by adding it back.
1133                          */
1134                         *offset = VLAN_HLEN;
1135                         *vlan_tag =
1136                         (uint16_t)
1137                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1138
1139                         RTE_LOG(DEBUG, VHOST_DATA,
1140                         "(%"PRIu64") TX: pkt to local VM device id:"
1141                         "(%"PRIu64") vlan tag: %d.\n",
1142                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1143                         (int)*vlan_tag);
1144
1145                         break;
1146                 }
1147                 dev_ll = dev_ll->next;
1148         }
1149         return 0;
1150 }
1151
1152 static uint16_t
1153 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1154 {
1155         if (ol_flags & PKT_TX_IPV4)
1156                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1157         else /* assume ethertype == ETHER_TYPE_IPv6 */
1158                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1159 }
1160
1161 static void virtio_tx_offload(struct rte_mbuf *m)
1162 {
1163         void *l3_hdr;
1164         struct ipv4_hdr *ipv4_hdr = NULL;
1165         struct tcp_hdr *tcp_hdr = NULL;
1166         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1167
1168         l3_hdr = (char *)eth_hdr + m->l2_len;
1169
1170         if (m->ol_flags & PKT_TX_IPV4) {
1171                 ipv4_hdr = l3_hdr;
1172                 ipv4_hdr->hdr_checksum = 0;
1173                 m->ol_flags |= PKT_TX_IP_CKSUM;
1174         }
1175
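        /* For TSO the TCP checksum field must hold the pseudo-header checksum; the NIC then computes the full checksum for each segment. */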
1176         tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
1177         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1178 }
1179
1180 /*
1181  * This function routes the TX packet to the correct interface. This may be a local device
1182  * or the physical port.
1183  */
1184 static inline void __attribute__((always_inline))
1185 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1186 {
1187         struct mbuf_table *tx_q;
1188         struct rte_mbuf **m_table;
1189         unsigned len, ret, offset = 0;
1190         const uint16_t lcore_id = rte_lcore_id();
1191         struct virtio_net *dev = vdev->dev;
1192         struct ether_hdr *nh;
1193
1194         /*check if destination is local VM*/
1195         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1196                 rte_pktmbuf_free(m);
1197                 return;
1198         }
1199
1200         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1201                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1202                         rte_pktmbuf_free(m);
1203                         return;
1204                 }
1205         }
1206
1207         RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") TX: "
1208                 "MAC address is external\n", dev->device_fh);
1209
1210         /*Add packet to the port tx queue*/
1211         tx_q = &lcore_tx_queue[lcore_id];
1212         len = tx_q->len;
1213
1214         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1215         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1216                 /* Guest has inserted the vlan tag. */
1217                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1218                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1219                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1220                         (vh->vlan_tci != vlan_tag_be))
1221                         vh->vlan_tci = vlan_tag_be;
1222         } else {
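                /* No VLAN header present: request VLAN tag insertion by the NIC and store the tag in the mbuf. */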
1223                 m->ol_flags |= PKT_TX_VLAN_PKT;
1224
1225                 /*
1226                  * Find the right seg to adjust the data len when offset is
1227                  * bigger than tail room size.
1228                  */
1229                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1230                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1231                                 m->data_len += offset;
1232                         else {
1233                                 struct rte_mbuf *seg = m;
1234
1235                                 while ((seg->next != NULL) &&
1236                                         (offset > rte_pktmbuf_tailroom(seg)))
1237                                         seg = seg->next;
1238
1239                                 seg->data_len += offset;
1240                         }
1241                         m->pkt_len += offset;
1242                 }
1243
1244                 m->vlan_tci = vlan_tag;
1245         }
1246
1247         if (m->ol_flags & PKT_TX_TCP_SEG)
1248                 virtio_tx_offload(m);
1249
1250         tx_q->m_table[len] = m;
1251         len++;
1252         if (enable_stats) {
1253                 dev_statistics[dev->device_fh].tx_total++;
1254                 dev_statistics[dev->device_fh].tx++;
1255         }
1256
1257         if (unlikely(len == MAX_PKT_BURST)) {
1258                 m_table = (struct rte_mbuf **)tx_q->m_table;
1259                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1260                 /* Free any buffers not handled by TX and update the port stats. */
1261                 if (unlikely(ret < len)) {
1262                         do {
1263                                 rte_pktmbuf_free(m_table[ret]);
1264                         } while (++ret < len);
1265                 }
1266
1267                 len = 0;
1268         }
1269
1270         tx_q->len = len;
1271         return;
1272 }
1273 /*
1274  * This function is called by each data core. It handles all RX/TX registered with the
1275  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1276  * with all devices in the main linked list.
1277  */
1278 static int
1279 switch_worker(void *arg)
1280 {
1281         struct rte_mempool *mbuf_pool = arg;
1282         struct virtio_net *dev = NULL;
1283         struct vhost_dev *vdev = NULL;
1284         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1285         struct virtio_net_data_ll *dev_ll;
1286         struct mbuf_table *tx_q;
1287         volatile struct lcore_ll_info *lcore_ll;
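        /*
         * Descriptive note (editorial addition): convert BURST_TX_DRAIN_US into
         * TSC cycles; cycles-per-microsecond is rounded up so the drain period
         * is never shorter than requested.
         */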
1288         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1289         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1290         unsigned ret, i;
1291         const uint16_t lcore_id = rte_lcore_id();
1292         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1293         uint16_t rx_count = 0;
1294         uint16_t tx_count;
1295         uint32_t retry = 0;
1296
1297         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1298         lcore_ll = lcore_info[lcore_id].lcore_ll;
1299         prev_tsc = 0;
1300
1301         tx_q = &lcore_tx_queue[lcore_id];
1302         for (i = 0; i < num_cores; i ++) {
1303                 if (lcore_ids[i] == lcore_id) {
1304                         tx_q->txq_id = i;
1305                         break;
1306                 }
1307         }
1308
1309         while(1) {
1310                 cur_tsc = rte_rdtsc();
1311                 /*
1312                  * TX burst queue drain
1313                  */
1314                 diff_tsc = cur_tsc - prev_tsc;
1315                 if (unlikely(diff_tsc > drain_tsc)) {
1316
1317                         if (tx_q->len) {
1318                                 RTE_LOG(DEBUG, VHOST_DATA,
1319                                         "TX queue drained after timeout with burst size %u\n",
1320                                         tx_q->len);
1321
1322                                 /*Tx any packets in the queue*/
1323                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1324                                                                            (struct rte_mbuf **)tx_q->m_table,
1325                                                                            (uint16_t)tx_q->len);
1326                                 if (unlikely(ret < tx_q->len)) {
1327                                         do {
1328                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1329                                         } while (++ret < tx_q->len);
1330                                 }
1331
1332                                 tx_q->len = 0;
1333                         }
1334
1335                         prev_tsc = cur_tsc;
1336
1337                 }
1338
1339                 rte_prefetch0(lcore_ll->ll_root_used);
1340                 /*
1341                  * Inform the configuration core that we have exited the linked list and that no devices are
1342                  * in use if requested.
1343                  */
1344                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1345                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1346
1347                 /*
1348                  * Process devices
1349                  */
1350                 dev_ll = lcore_ll->ll_root_used;
1351
1352                 while (dev_ll != NULL) {
1353                         /*get virtio device ID*/
1354                         vdev = dev_ll->vdev;
1355                         dev = vdev->dev;
1356
1357                         if (unlikely(vdev->remove)) {
1358                                 dev_ll = dev_ll->next;
1359                                 unlink_vmdq(vdev);
1360                                 vdev->ready = DEVICE_SAFE_REMOVE;
1361                                 continue;
1362                         }
1363                         if (likely(vdev->ready == DEVICE_RX)) {
1364                                 /*Handle guest RX*/
1365                                 rx_count = rte_eth_rx_burst(ports[0],
1366                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1367
1368                                 if (rx_count) {
1369                                         /*
1370                                         * If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1371                                         * Here MAX_PKT_BURST must be less than the virtio queue size.
1372                                         */
1373                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1374                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1375                                                         rte_delay_us(burst_rx_delay_time);
1376                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1377                                                                 break;
1378                                                 }
1379                                         }
1380                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1381                                         if (enable_stats) {
1382                                                 rte_atomic64_add(
1383                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1384                                                 rx_count);
1385                                                 rte_atomic64_add(
1386                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1387                                         }
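                                        /*
                                         * Note (editorial addition): rte_vhost_enqueue_burst() copies
                                         * the packets into the guest's buffers, so the host mbufs can
                                         * be freed here regardless of how many were accepted.
                                         */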
1388                                         while (likely(rx_count)) {
1389                                                 rx_count--;
1390                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1391                                         }
1392
1393                                 }
1394                         }
1395
1396                         if (likely(!vdev->remove)) {
1397                                 /* Handle guest TX*/
1398                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1399                                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1400                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1401                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1402                                                 while (tx_count)
1403                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1404                                         }
1405                                 }
1406                                 for (i = 0; i < tx_count; ++i) {
1407                                         virtio_tx_route(vdev, pkts_burst[i],
1408                                                 vlan_tags[(uint16_t)dev->device_fh]);
1409                                 }
1410                         }
1411
1412                         /*move to the next device in the list*/
1413                         dev_ll = dev_ll->next;
1414                 }
1415         }
1416
1417         return 0;
1418 }
1419
1420 /*
1421  * This function gets available ring number for zero copy rx.
1422  * Only one thread will call this function for a particular virtio device,
1423  * so it is designed as a non-thread-safe function.
1424  */
1425 static inline uint32_t __attribute__((always_inline))
1426 get_available_ring_num_zcp(struct virtio_net *dev)
1427 {
1428         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1429         uint16_t avail_idx;
1430
1431         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
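        /* Note (editorial addition): the 16-bit subtraction handles avail index wrap-around naturally. */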
1432         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1433 }
1434
1435 /*
1436  * This function gets available ring index for zero copy rx,
1437  * it will retry 'burst_rx_retry_num' times until it gets enough ring entries.
1438  * Only one thread will call this function for a particular virtio device,
1439  * so it is designed as a non-thread-safe function.
1440  */
1441 static inline uint32_t __attribute__((always_inline))
1442 get_available_ring_index_zcp(struct virtio_net *dev,
1443         uint16_t *res_base_idx, uint32_t count)
1444 {
1445         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1446         uint16_t avail_idx;
1447         uint32_t retry = 0;
1448         uint16_t free_entries;
1449
1450         *res_base_idx = vq->last_used_idx_res;
1451         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1452         free_entries = (avail_idx - *res_base_idx);
1453
1454         RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") in get_available_ring_index_zcp: "
1455                         "avail idx: %d, "
1456                         "res base idx:%d, free entries:%d\n",
1457                         dev->device_fh, avail_idx, *res_base_idx,
1458                         free_entries);
1459
1460         /*
1461          * If retry is enabled and the queue is full then we wait
1462          * and retry to avoid packet loss.
1463          */
1464         if (enable_retry && unlikely(count > free_entries)) {
1465                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1466                         rte_delay_us(burst_rx_delay_time);
1467                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1468                         free_entries = (avail_idx - *res_base_idx);
1469                         if (count <= free_entries)
1470                                 break;
1471                 }
1472         }
1473
1474         /*check that we have enough buffers*/
1475         if (unlikely(count > free_entries))
1476                 count = free_entries;
1477
1478         if (unlikely(count == 0)) {
1479                 RTE_LOG(DEBUG, VHOST_DATA,
1480                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1481                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1482                         dev->device_fh, avail_idx,
1483                         *res_base_idx, free_entries);
1484                 return 0;
1485         }
1486
1487         vq->last_used_idx_res = *res_base_idx + count;
1488
1489         return count;
1490 }
1491
1492 /*
1493  * This function puts a descriptor back on the used ring.
1494  */
1495 static inline void __attribute__((always_inline))
1496 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1497 {
1498         uint16_t res_cur_idx = vq->last_used_idx;
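        /* Note (editorial addition): virtio ring sizes are powers of two, so masking with (size - 1) wraps the index. */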
1499         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1500         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1501         rte_compiler_barrier();
1502         *(volatile uint16_t *)&vq->used->idx += 1;
1503         vq->last_used_idx += 1;
1504
1505         /* Kick the guest if necessary. */
1506         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1507                 eventfd_write(vq->callfd, (eventfd_t)1);
1508 }
1509
1510 /*
1511  * This function gets an available descriptor from the virtio vring and an
1512  * un-attached mbuf from vpool->ring, and then attaches them together. It needs
1513  * to adjust the offset for buff_addr and phys_addr according to the PMD
1514  * implementation, otherwise the frame data may end up in the wrong location in the mbuf.
1515  */
1516 static inline void __attribute__((always_inline))
1517 attach_rxmbuf_zcp(struct virtio_net *dev)
1518 {
1519         uint16_t res_base_idx, desc_idx;
1520         uint64_t buff_addr, phys_addr;
1521         struct vhost_virtqueue *vq;
1522         struct vring_desc *desc;
1523         void *obj = NULL;
1524         struct rte_mbuf *mbuf;
1525         struct vpool *vpool;
1526         hpa_type addr_type;
1527         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1528
1529         vpool = &vpool_array[vdev->vmdq_rx_q];
1530         vq = dev->virtqueue[VIRTIO_RXQ];
1531
1532         do {
1533                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1534                                 1) != 1))
1535                         return;
1536                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1537
1538                 desc = &vq->desc[desc_idx];
1539                 if (desc->flags & VRING_DESC_F_NEXT) {
1540                         desc = &vq->desc[desc->next];
1541                         buff_addr = gpa_to_vva(dev, desc->addr);
1542                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1543                                         &addr_type);
1544                 } else {
1545                         buff_addr = gpa_to_vva(dev,
1546                                         desc->addr + vq->vhost_hlen);
1547                         phys_addr = gpa_to_hpa(vdev,
1548                                         desc->addr + vq->vhost_hlen,
1549                                         desc->len, &addr_type);
1550                 }
1551
1552                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1553                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1554                                 " address found when attaching RX frame buffer"
1555                                 " address!\n", dev->device_fh);
1556                         put_desc_to_used_list_zcp(vq, desc_idx);
1557                         continue;
1558                 }
1559
1560                 /*
1561                  * Check if the frame buffer address from guest crosses
1562                  * sub-region or not.
1563                  */
1564                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1565                         RTE_LOG(ERR, VHOST_DATA,
1566                                 "(%"PRIu64") Frame buffer address crossing a "
1567                                 "sub-region found when attaching RX frame "
1568                                 "buffer address!\n",
1569                                 dev->device_fh);
1570                         put_desc_to_used_list_zcp(vq, desc_idx);
1571                         continue;
1572                 }
1573         } while (unlikely(phys_addr == 0));
1574
1575         rte_ring_sc_dequeue(vpool->ring, &obj);
1576         mbuf = obj;
1577         if (unlikely(mbuf == NULL)) {
1578                 RTE_LOG(DEBUG, VHOST_DATA,
1579                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1580                         "ring_sc_dequeue fail.\n",
1581                         dev->device_fh);
1582                 put_desc_to_used_list_zcp(vq, desc_idx);
1583                 return;
1584         }
1585
1586         if (unlikely(vpool->buf_size > desc->len)) {
1587                 RTE_LOG(DEBUG, VHOST_DATA,
1588                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1589                         "length(%d) of descriptor idx: %d less than room "
1590                         "size required: %d\n",
1591                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1592                 put_desc_to_used_list_zcp(vq, desc_idx);
1593                 rte_ring_sp_enqueue(vpool->ring, obj);
1594                 return;
1595         }
1596
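        /*
         * Descriptive note (editorial addition): re-point the mbuf at the guest
         * buffer. buf_addr/buf_physaddr are moved back by RTE_PKTMBUF_HEADROOM so
         * that data_off (set to the headroom) lands exactly on the guest frame data.
         */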
1597         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1598         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1599         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1600         mbuf->data_len = desc->len;
1601         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1602
1603         RTE_LOG(DEBUG, VHOST_DATA,
1604                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1605                 "descriptor idx:%d\n",
1606                 dev->device_fh, res_base_idx, desc_idx);
1607
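        /*
         * Note (editorial addition, based on the zero copy setup elsewhere in this
         * example): the raw free returns the now guest-backed mbuf to vpool->pool,
         * which is used as the RX mempool for this queue, so the PMD can receive
         * the next frame directly into guest memory.
         */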
1608         __rte_mbuf_raw_free(mbuf);
1609
1610         return;
1611 }
1612
1613 /*
1614  * Detach an attached packet mbuf -
1615  *  - restore original mbuf address and length values.
1616  *  - reset pktmbuf data and data_len to their default values.
1617  *  All other fields of the given packet mbuf will be left intact.
1618  *
1619  * @param m
1620  *   The attached packet mbuf.
1621  */
1622 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1623 {
1624         const struct rte_mempool *mp = m->pool;
1625         void *buf = rte_mbuf_to_baddr(m);
1626         uint32_t buf_ofs;
1627         uint32_t buf_len = mp->elt_size - sizeof(*m);
1628         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1629
1630         m->buf_addr = buf;
1631         m->buf_len = (uint16_t)buf_len;
1632
1633         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1634                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1635         m->data_off = buf_ofs;
1636
1637         m->data_len = 0;
1638 }
1639
1640 /*
1641  * This function is called after packets have been transmitted. It fetches mbufs
1642  * from vpool->pool, detaches them and puts them back into vpool->ring. It also
1643  * updates the used index and kicks the guest if necessary.
1644  */
1645 static inline uint32_t __attribute__((always_inline))
1646 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1647 {
1648         struct rte_mbuf *mbuf;
1649         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1650         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1651         uint32_t index = 0;
1652         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1653
1654         RTE_LOG(DEBUG, VHOST_DATA,
1655                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1656                 "clean is: %d\n",
1657                 dev->device_fh, mbuf_count);
1658         RTE_LOG(DEBUG, VHOST_DATA,
1659                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1660                 "clean  is : %d\n",
1661                 dev->device_fh, rte_ring_count(vpool->ring));
1662
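        /*
         * Note (editorial addition): reclaim every mbuf the PMD has released back
         * to vpool->pool, detach it from the guest buffer it was aliasing and stage
         * it on vpool->ring for reuse; the guest descriptor index kept in the mbuf
         * headroom is written to the TX used ring below.
         */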
1663         for (index = 0; index < mbuf_count; index++) {
1664                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1665                 if (likely(MBUF_EXT_MEM(mbuf)))
1666                         pktmbuf_detach_zcp(mbuf);
1667                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1668
1669                 /* Update used index buffer information. */
1670                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1671                 vq->used->ring[used_idx].len = 0;
1672
1673                 used_idx = (used_idx + 1) & (vq->size - 1);
1674         }
1675
1676         RTE_LOG(DEBUG, VHOST_DATA,
1677                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1678                 "clean is: %d\n",
1679                 dev->device_fh, rte_mempool_count(vpool->pool));
1680         RTE_LOG(DEBUG, VHOST_DATA,
1681                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1682                 "clean  is : %d\n",
1683                 dev->device_fh, rte_ring_count(vpool->ring));
1684         RTE_LOG(DEBUG, VHOST_DATA,
1685                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1686                 "vq->last_used_idx:%d\n",
1687                 dev->device_fh, vq->last_used_idx);
1688
1689         vq->last_used_idx += mbuf_count;
1690
1691         RTE_LOG(DEBUG, VHOST_DATA,
1692                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1693                 "vq->last_used_idx:%d\n",
1694                 dev->device_fh, vq->last_used_idx);
1695
1696         rte_compiler_barrier();
1697
1698         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1699
1700         /* Kick guest if required. */
1701         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1702                 eventfd_write(vq->callfd, (eventfd_t)1);
1703
1704         return 0;
1705 }
1706
1707 /*
1708  * This function is called when a virtio device is destroyed.
1709  * It fetches mbufs from vpool->pool, detaches them, and puts them back into vpool->ring.
1710  */
1711 static void mbuf_destroy_zcp(struct vpool *vpool)
1712 {
1713         struct rte_mbuf *mbuf = NULL;
1714         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1715
1716         RTE_LOG(DEBUG, VHOST_CONFIG,
1717                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1718                 "mbuf_destroy_zcp is: %d\n",
1719                 mbuf_count);
1720         RTE_LOG(DEBUG, VHOST_CONFIG,
1721                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1722                 "mbuf_destroy_zcp  is : %d\n",
1723                 rte_ring_count(vpool->ring));
1724
1725         for (index = 0; index < mbuf_count; index++) {
1726                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1727                 if (likely(mbuf != NULL)) {
1728                         if (likely(MBUF_EXT_MEM(mbuf)))
1729                                 pktmbuf_detach_zcp(mbuf);
1730                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1731                 }
1732         }
1733
1734         RTE_LOG(DEBUG, VHOST_CONFIG,
1735                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1736                 "mbuf_destroy_zcp is: %d\n",
1737                 rte_mempool_count(vpool->pool));
1738         RTE_LOG(DEBUG, VHOST_CONFIG,
1739                 "in mbuf_destroy_zcp: mbuf count in ring after "
1740                 "mbuf_destroy_zcp is : %d\n",
1741                 rte_ring_count(vpool->ring));
1742 }
1743
1744 /*
1745  * This function fills the used ring and copies the virtio headers for zero copy RX.
1746  */
1747 static inline uint32_t __attribute__((always_inline))
1748 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1749         uint32_t count)
1750 {
1751         struct vhost_virtqueue *vq;
1752         struct vring_desc *desc;
1753         struct rte_mbuf *buff;
1754         /* The virtio_hdr is initialised to 0. */
1755         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1756                 = {{0, 0, 0, 0, 0, 0}, 0};
1757         uint64_t buff_hdr_addr = 0;
1758         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1759         uint32_t head_idx, packet_success = 0;
1760         uint16_t res_cur_idx;
1761
1762         RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") virtio_dev_rx()\n",
1763                 dev->device_fh);
1764
1765         if (count == 0)
1766                 return 0;
1767
1768         vq = dev->virtqueue[VIRTIO_RXQ];
1769         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1770
1771         res_cur_idx = vq->last_used_idx;
1772         RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") Current Index %d| End Index %d\n",
1773                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1774
1775         /* Retrieve all of the head indexes first to avoid caching issues. */
1776         for (head_idx = 0; head_idx < count; head_idx++)
1777                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1778
1779         /*Prefetch descriptor index. */
1780         rte_prefetch0(&vq->desc[head[packet_success]]);
1781
1782         while (packet_success != count) {
1783                 /* Get descriptor from available ring */
1784                 desc = &vq->desc[head[packet_success]];
1785
1786                 buff = pkts[packet_success];
1787                 RTE_LOG(DEBUG, VHOST_DATA,
1788                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1789                         "pkt[%d] descriptor idx: %d\n",
1790                         dev->device_fh, packet_success,
1791                         MBUF_HEADROOM_UINT32(buff));
1792
1793                 PRINT_PACKET(dev,
1794                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1795                         + RTE_PKTMBUF_HEADROOM),
1796                         rte_pktmbuf_data_len(buff), 0);
1797
1798                 /* Buffer address translation for virtio header. */
1799                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1800                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1801
1802                 /*
1803                  * If the descriptors are chained the header and data are
1804                  * placed in separate buffers.
1805                  */
1806                 if (desc->flags & VRING_DESC_F_NEXT) {
1807                         desc->len = vq->vhost_hlen;
1808                         desc = &vq->desc[desc->next];
1809                         desc->len = rte_pktmbuf_data_len(buff);
1810                 } else {
1811                         desc->len = packet_len;
1812                 }
1813
1814                 /* Update used ring with desc information */
1815                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1816                         = head[packet_success];
1817                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1818                         = packet_len;
1819                 res_cur_idx++;
1820                 packet_success++;
1821
1822                 /* A header is required per buffer. */
1823                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1824                         (const void *)&virtio_hdr, vq->vhost_hlen);
1825
1826                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1827
1828                 if (likely(packet_success < count)) {
1829                         /* Prefetch descriptor index. */
1830                         rte_prefetch0(&vq->desc[head[packet_success]]);
1831                 }
1832         }
1833
1834         rte_compiler_barrier();
1835
1836         RTE_LOG(DEBUG, VHOST_DATA,
1837                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1838                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1839                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1840
1841         *(volatile uint16_t *)&vq->used->idx += count;
1842         vq->last_used_idx += count;
1843
1844         RTE_LOG(DEBUG, VHOST_DATA,
1845                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1846                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1847                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1848
1849         /* Kick the guest if necessary. */
1850         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1851                 eventfd_write(vq->callfd, (eventfd_t)1);
1852
1853         return count;
1854 }
1855
1856 /*
1857  * This function routes the TX packet to the correct interface.
1858  * This may be a local device or the physical port.
1859  */
1860 static inline void __attribute__((always_inline))
1861 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1862         uint32_t desc_idx, uint8_t need_copy)
1863 {
1864         struct mbuf_table *tx_q;
1865         struct rte_mbuf **m_table;
1866         void *obj = NULL;
1867         struct rte_mbuf *mbuf;
1868         unsigned len, ret, offset = 0;
1869         struct vpool *vpool;
1870         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1871         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1872
1873         /*Add packet to the port tx queue*/
1874         tx_q = &tx_queue_zcp[vmdq_rx_q];
1875         len = tx_q->len;
1876
1877         /* Allocate an mbuf and populate the structure. */
1878         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1879         rte_ring_sc_dequeue(vpool->ring, &obj);
1880         mbuf = obj;
1881         if (unlikely(mbuf == NULL)) {
1882                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1883                 RTE_LOG(ERR, VHOST_DATA,
1884                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1885                         dev->device_fh);
1886                 put_desc_to_used_list_zcp(vq, desc_idx);
1887                 return;
1888         }
1889
1890         if (vm2vm_mode == VM2VM_HARDWARE) {
1891                 /* Avoid using a vlan tag from any vm for an external pkt, such as
1892                  * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
1893                  * selection: the MAC address identifies it as an external pkt
1894                  * that should go to the network, while the vlan tag identifies it
1895                  * as a vm2vm pkt that should be forwarded to another vm. Hardware
1896                  * cannot resolve such an ambiguous situation, so the pkt would be lost.
1897                  */
1898                 vlan_tag = external_pkt_default_vlan_tag;
1899                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1900                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1901                         __rte_mbuf_raw_free(mbuf);
1902                         return;
1903                 }
1904         }
1905
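        /*
         * Note (editorial addition): in the zero copy case the new mbuf simply
         * aliases the guest buffer (buf_addr/buf_physaddr are copied below), so the
         * NIC transmits straight from guest memory; a copy is made only when the
         * caller set need_copy because the buffer crosses a host physical sub-region.
         */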
1906         mbuf->nb_segs = m->nb_segs;
1907         mbuf->next = m->next;
1908         mbuf->data_len = m->data_len + offset;
1909         mbuf->pkt_len = mbuf->data_len;
1910         if (unlikely(need_copy)) {
1911                 /* Copy the packet contents to the mbuf. */
1912                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1913                         rte_pktmbuf_mtod(m, void *),
1914                         m->data_len);
1915         } else {
1916                 mbuf->data_off = m->data_off;
1917                 mbuf->buf_physaddr = m->buf_physaddr;
1918                 mbuf->buf_addr = m->buf_addr;
1919         }
1920         mbuf->ol_flags |= PKT_TX_VLAN_PKT;
1921         mbuf->vlan_tci = vlan_tag;
1922         mbuf->l2_len = sizeof(struct ether_hdr);
1923         mbuf->l3_len = sizeof(struct ipv4_hdr);
1924         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1925
1926         tx_q->m_table[len] = mbuf;
1927         len++;
1928
1929         RTE_LOG(DEBUG, VHOST_DATA,
1930                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1931                 dev->device_fh,
1932                 mbuf->nb_segs,
1933                 (mbuf->next == NULL) ? "null" : "non-null");
1934
1935         if (enable_stats) {
1936                 dev_statistics[dev->device_fh].tx_total++;
1937                 dev_statistics[dev->device_fh].tx++;
1938         }
1939
1940         if (unlikely(len == MAX_PKT_BURST)) {
1941                 m_table = (struct rte_mbuf **)tx_q->m_table;
1942                 ret = rte_eth_tx_burst(ports[0],
1943                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1944
1945                 /*
1946                  * Free any buffers not handled by TX and update
1947                  * the port stats.
1948                  */
1949                 if (unlikely(ret < len)) {
1950                         do {
1951                                 rte_pktmbuf_free(m_table[ret]);
1952                         } while (++ret < len);
1953                 }
1954
1955                 len = 0;
1956                 txmbuf_clean_zcp(dev, vpool);
1957         }
1958
1959         tx_q->len = len;
1960
1961         return;
1962 }
1963
1964 /*
1965  * This function TXes all available packets in the virtio TX queue for one
1966  * virtio-net device. If it is the first packet, it learns the MAC address
1967  * and sets up VMDQ.
1968  */
1969 static inline void __attribute__((always_inline))
1970 virtio_dev_tx_zcp(struct virtio_net *dev)
1971 {
1972         struct rte_mbuf m;
1973         struct vhost_virtqueue *vq;
1974         struct vring_desc *desc;
1975         uint64_t buff_addr = 0, phys_addr;
1976         uint32_t head[MAX_PKT_BURST];
1977         uint32_t i;
1978         uint16_t free_entries, packet_success = 0;
1979         uint16_t avail_idx;
1980         uint8_t need_copy = 0;
1981         hpa_type addr_type;
1982         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1983
1984         vq = dev->virtqueue[VIRTIO_TXQ];
1985         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1986
1987         /* If there are no available buffers then return. */
1988         if (vq->last_used_idx_res == avail_idx)
1989                 return;
1990
1991         RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") virtio_dev_tx()\n",
1992                 dev->device_fh);
1993
1994         /* Prefetch available ring to retrieve head indexes. */
1995         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1996
1997         /* Get the number of free entries in the ring */
1998         free_entries = (avail_idx - vq->last_used_idx_res);
1999
2000         /* Limit to MAX_PKT_BURST. */
2001         free_entries
2002                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2003
2004         RTE_LOG(DEBUG, VHOST_DATA, "(%" PRIu64 ") Buffers available %d\n",
2005                 dev->device_fh, free_entries);
2006
2007         /* Retrieve all of the head indexes first to avoid caching issues. */
2008         for (i = 0; i < free_entries; i++)
2009                 head[i]
2010                         = vq->avail->ring[(vq->last_used_idx_res + i)
2011                         & (vq->size - 1)];
2012
2013         vq->last_used_idx_res += free_entries;
2014
2015         /* Prefetch descriptor index. */
2016         rte_prefetch0(&vq->desc[head[packet_success]]);
2017         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2018
2019         while (packet_success < free_entries) {
2020                 desc = &vq->desc[head[packet_success]];
2021
2022                 /* Discard first buffer as it is the virtio header */
2023                 desc = &vq->desc[desc->next];
2024
2025                 /* Buffer address translation. */
2026                 buff_addr = gpa_to_vva(dev, desc->addr);
2027                 /* Need to check extra VLAN_HLEN size for inserting the VLAN tag */
2028                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
2029                         &addr_type);
2030
2031                 if (likely(packet_success < (free_entries - 1)))
2032                         /* Prefetch descriptor index. */
2033                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2034
2035                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2036                         RTE_LOG(ERR, VHOST_DATA,
2037                                 "(%"PRIu64") Invalid frame buffer address found"
2038                                 " when TX packets!\n",
2039                                 dev->device_fh);
2040                         packet_success++;
2041                         continue;
2042                 }
2043
2044                 /* Prefetch buffer address. */
2045                 rte_prefetch0((void *)(uintptr_t)buff_addr);
2046
2047                 /*
2048                  * Setup dummy mbuf. This is copied to a real mbuf if
2049                  * transmitted out the physical port.
2050                  */
2051                 m.data_len = desc->len;
2052                 m.nb_segs = 1;
2053                 m.next = NULL;
2054                 m.data_off = 0;
2055                 m.buf_addr = (void *)(uintptr_t)buff_addr;
2056                 m.buf_physaddr = phys_addr;
2057
2058                 /*
2059                  * Check if the frame buffer address from guest crosses
2060                  * sub-region or not.
2061                  */
2062                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2063                         RTE_LOG(ERR, VHOST_DATA,
2064                                 "(%"PRIu64") Frame buffer address crossing a "
2065                                 "sub-region found when attaching TX frame "
2066                                 "buffer address!\n",
2067                                 dev->device_fh);
2068                         need_copy = 1;
2069                 } else
2070                         need_copy = 0;
2071
2072                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2073
2074                 /*
2075                  * If this is the first received packet we need to learn
2076                  * the MAC and setup VMDQ
2077                  */
2078                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2079                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2080                                 /*
2081                                  * Discard frame if device is scheduled for
2082                                  * removal or a duplicate MAC address is found.
2083                                  */
2084                                 packet_success += free_entries;
2085                                 vq->last_used_idx += packet_success;
2086                                 break;
2087                         }
2088                 }
2089
2090                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2091                 packet_success++;
2092         }
2093 }
2094
2095 /*
2096  * This function is called by each data core. It handles all RX/TX registered
2097  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2098  * addresses are compared with all devices in the main linked list.
2099  */
2100 static int
2101 switch_worker_zcp(__attribute__((unused)) void *arg)
2102 {
2103         struct virtio_net *dev = NULL;
2104         struct vhost_dev  *vdev = NULL;
2105         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2106         struct virtio_net_data_ll *dev_ll;
2107         struct mbuf_table *tx_q;
2108         volatile struct lcore_ll_info *lcore_ll;
2109         const uint64_t drain_tsc
2110                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2111                 * BURST_TX_DRAIN_US;
2112         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2113         unsigned ret;
2114         const uint16_t lcore_id = rte_lcore_id();
2115         uint16_t count_in_ring, rx_count = 0;
2116
2117         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2118
2119         lcore_ll = lcore_info[lcore_id].lcore_ll;
2120         prev_tsc = 0;
2121
2122         while (1) {
2123                 cur_tsc = rte_rdtsc();
2124
2125                 /* TX burst queue drain */
2126                 diff_tsc = cur_tsc - prev_tsc;
2127                 if (unlikely(diff_tsc > drain_tsc)) {
2128                         /*
2129                          * Get mbuf from vpool.pool and detach mbuf and
2130                          * put back into vpool.ring.
2131                          */
2132                         dev_ll = lcore_ll->ll_root_used;
2133                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2134                                 /* Get virtio device ID */
2135                                 vdev = dev_ll->vdev;
2136                                 dev = vdev->dev;
2137
2138                                 if (likely(!vdev->remove)) {
2139                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2140                                         if (tx_q->len) {
2141                                                 RTE_LOG(DEBUG, VHOST_DATA,
2142                                                 "TX queue drained after timeout"
2143                                                 " with burst size %u\n",
2144                                                 tx_q->len);
2145
2146                                                 /*
2147                                                  * Tx any packets in the queue
2148                                                  */
2149                                                 ret = rte_eth_tx_burst(
2150                                                         ports[0],
2151                                                         (uint16_t)tx_q->txq_id,
2152                                                         (struct rte_mbuf **)
2153                                                         tx_q->m_table,
2154                                                         (uint16_t)tx_q->len);
2155                                                 if (unlikely(ret < tx_q->len)) {
2156                                                         do {
2157                                                                 rte_pktmbuf_free(
2158                                                                         tx_q->m_table[ret]);
2159                                                         } while (++ret < tx_q->len);
2160                                                 }
2161                                                 tx_q->len = 0;
2162
2163                                                 txmbuf_clean_zcp(dev,
2164                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2165                                         }
2166                                 }
2167                                 dev_ll = dev_ll->next;
2168                         }
2169                         prev_tsc = cur_tsc;
2170                 }
2171
2172                 rte_prefetch0(lcore_ll->ll_root_used);
2173
2174                 /*
2175                  * Inform the configuration core that we have exited the linked
2176                  * list and that no devices are in use if requested.
2177                  */
2178                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2179                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2180
2181                 /* Process devices */
2182                 dev_ll = lcore_ll->ll_root_used;
2183
2184                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2185                         vdev = dev_ll->vdev;
2186                         dev  = vdev->dev;
2187                         if (unlikely(vdev->remove)) {
2188                                 dev_ll = dev_ll->next;
2189                                 unlink_vmdq(vdev);
2190                                 vdev->ready = DEVICE_SAFE_REMOVE;
2191                                 continue;
2192                         }
2193
2194                         if (likely(vdev->ready == DEVICE_RX)) {
2195                                 uint32_t index = vdev->vmdq_rx_q;
2196                                 uint16_t i;
2197                                 count_in_ring
2198                                 = rte_ring_count(vpool_array[index].ring);
2199                                 uint16_t free_entries
2200                                 = (uint16_t)get_available_ring_num_zcp(dev);
2201
2202                                 /*
2203                                  * Attach all mbufs in vpool.ring and put back
2204                                  * into vpool.pool.
2205                                  */
2206                                 for (i = 0;
2207                                 i < RTE_MIN(free_entries,
2208                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2209                                 i++)
2210                                         attach_rxmbuf_zcp(dev);
2211
2212                                 /* Handle guest RX */
2213                                 rx_count = rte_eth_rx_burst(ports[0],
2214                                         vdev->vmdq_rx_q, pkts_burst,
2215                                         MAX_PKT_BURST);
2216
2217                                 if (rx_count) {
2218                                         ret_count = virtio_dev_rx_zcp(dev,
2219                                                         pkts_burst, rx_count);
2220                                         if (enable_stats) {
2221                                                 dev_statistics[dev->device_fh].rx_total
2222                                                         += rx_count;
2223                                                 dev_statistics[dev->device_fh].rx
2224                                                         += ret_count;
2225                                         }
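                                        /*
                                         * Note (editorial addition): the frames now live in guest
                                         * memory (the used ring was filled by virtio_dev_rx_zcp),
                                         * so detach the host mbufs from the guest buffers and
                                         * recycle them via vpool.ring.
                                         */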
2226                                         while (likely(rx_count)) {
2227                                                 rx_count--;
2228                                                 pktmbuf_detach_zcp(
2229                                                         pkts_burst[rx_count]);
2230                                                 rte_ring_sp_enqueue(
2231                                                         vpool_array[index].ring,
2232                                                         (void *)pkts_burst[rx_count]);
2233                                         }
2234                                 }
2235                         }
2236
2237                         if (likely(!vdev->remove))
2238                                 /* Handle guest TX */
2239                                 virtio_dev_tx_zcp(dev);
2240
2241                         /* Move to the next device in the list */
2242                         dev_ll = dev_ll->next;
2243                 }
2244         }
2245
2246         return 0;
2247 }
2248
2249
2250 /*
2251  * Add an entry to a used linked list. A free entry must first be found
2252  * in the free linked list using get_data_ll_free_entry();
2253  */
2254 static void
2255 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2256         struct virtio_net_data_ll *ll_dev)
2257 {
2258         struct virtio_net_data_ll *ll = *ll_root_addr;
2259
2260         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2261         ll_dev->next = NULL;
2262         rte_compiler_barrier();
2263
2264         /* If ll == NULL then this is the first device. */
2265         if (ll) {
2266                 /* Increment to the tail of the linked list. */
2267                 while (ll->next != NULL)
2268                         ll = ll->next;
2269
2270                 ll->next = ll_dev;
2271         } else {
2272                 *ll_root_addr = ll_dev;
2273         }
2274 }
2275
2276 /*
2277  * Remove an entry from a used linked list. The entry must then be added to
2278  * the free linked list using put_data_ll_free_entry().
2279  */
2280 static void
2281 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2282         struct virtio_net_data_ll *ll_dev,
2283         struct virtio_net_data_ll *ll_dev_last)
2284 {
2285         struct virtio_net_data_ll *ll = *ll_root_addr;
2286
2287         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2288                 return;
2289
2290         if (ll_dev == ll)
2291                 *ll_root_addr = ll_dev->next;
2292         else
2293                 if (likely(ll_dev_last != NULL))
2294                         ll_dev_last->next = ll_dev->next;
2295                 else
2296                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2297 }
2298
2299 /*
2300  * Find and return an entry from the free linked list.
2301  */
2302 static struct virtio_net_data_ll *
2303 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2304 {
2305         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2306         struct virtio_net_data_ll *ll_dev;
2307
2308         if (ll_free == NULL)
2309                 return NULL;
2310
2311         ll_dev = ll_free;
2312         *ll_root_addr = ll_free->next;
2313
2314         return ll_dev;
2315 }
2316
2317 /*
2318  * Place an entry back on to the free linked list.
2319  */
2320 static void
2321 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2322         struct virtio_net_data_ll *ll_dev)
2323 {
2324         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2325
2326         if (ll_dev == NULL)
2327                 return;
2328
2329         ll_dev->next = ll_free;
2330         *ll_root_addr = ll_dev;
2331 }
2332
2333 /*
2334  * Creates a linked list of a given size.
2335  */
2336 static struct virtio_net_data_ll *
2337 alloc_data_ll(uint32_t size)
2338 {
2339         struct virtio_net_data_ll *ll_new;
2340         uint32_t i;
2341
2342         /* Malloc and then chain the linked list. */
2343         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2344         if (ll_new == NULL) {
2345                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2346                 return NULL;
2347         }
2348
2349         for (i = 0; i < size - 1; i++) {
2350                 ll_new[i].vdev = NULL;
2351                 ll_new[i].next = &ll_new[i+1];
2352         }
2353         ll_new[i].next = NULL;
2354
2355         return ll_new;
2356 }
2357
2358 /*
2359  * Create the main linked list along with each individual core's linked list. A used and a free list
2360  * are created to manage entries.
2361  */
2362 static int
2363 init_data_ll (void)
2364 {
2365         int lcore;
2366
2367         RTE_LCORE_FOREACH_SLAVE(lcore) {
2368                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2369                 if (lcore_info[lcore].lcore_ll == NULL) {
2370                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2371                         return -1;
2372                 }
2373
2374                 lcore_info[lcore].lcore_ll->device_num = 0;
2375                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2376                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
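                /*
                 * Note (editorial addition): give each worker core an even share of
                 * the devices, rounding up when the devices do not divide evenly
                 * across the switching cores.
                 */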
2377                 if (num_devices % num_switching_cores)
2378                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2379                 else
2380                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2381         }
2382
2383         /* Allocate devices up to a maximum of MAX_DEVICES. */
2384         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2385
2386         return 0;
2387 }
2388
2389 /*
2390  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2391  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2392  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2393  */
2394 static void
2395 destroy_device (volatile struct virtio_net *dev)
2396 {
2397         struct virtio_net_data_ll *ll_lcore_dev_cur;
2398         struct virtio_net_data_ll *ll_main_dev_cur;
2399         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2400         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2401         struct vhost_dev *vdev;
2402         int lcore;
2403
2404         dev->flags &= ~VIRTIO_DEV_RUNNING;
2405
2406         vdev = (struct vhost_dev *)dev->priv;
2407         /*set the remove flag. */
2408         vdev->remove = 1;
2409         while(vdev->ready != DEVICE_SAFE_REMOVE) {
2410                 rte_pause();
2411         }
2412
2413         /* Search for entry to be removed from lcore ll */
2414         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2415         while (ll_lcore_dev_cur != NULL) {
2416                 if (ll_lcore_dev_cur->vdev == vdev) {
2417                         break;
2418                 } else {
2419                         ll_lcore_dev_last = ll_lcore_dev_cur;
2420                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2421                 }
2422         }
2423
2424         if (ll_lcore_dev_cur == NULL) {
2425                 RTE_LOG(ERR, VHOST_CONFIG,
2426                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2427                         dev->device_fh);
2428                 return;
2429         }
2430
2431         /* Search for entry to be removed from main ll */
2432         ll_main_dev_cur = ll_root_used;
2433         ll_main_dev_last = NULL;
2434         while (ll_main_dev_cur != NULL) {
2435                 if (ll_main_dev_cur->vdev == vdev) {
2436                         break;
2437                 } else {
2438                         ll_main_dev_last = ll_main_dev_cur;
2439                         ll_main_dev_cur = ll_main_dev_cur->next;
2440                 }
2441         }
2442
2443         /* Remove entries from the lcore and main ll. */
2444         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2445         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2446
2447         /* Set the dev_removal_flag on each lcore. */
2448         RTE_LCORE_FOREACH_SLAVE(lcore) {
2449                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2450         }
2451
2452         /*
2453          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2454          * they can no longer access the device removed from the linked lists and that the devices
2455          * are no longer in use.
2456          */
2457         RTE_LCORE_FOREACH_SLAVE(lcore) {
2458                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2459                         rte_pause();
2460                 }
2461         }
2462
2463         /* Add the entries back to the lcore and main free ll.*/
2464         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2465         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2466
2467         /* Decrement number of device on the lcore. */
2468         lcore_info[vdev->coreid].lcore_ll->device_num--;
2469
2470         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2471
2472         if (zero_copy) {
2473                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2474
2475                 /* Stop the RX queue. */
2476                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2477                         RTE_LOG(DEBUG, VHOST_CONFIG,
2478                                 "(%"PRIu64") In destroy_device: Failed to stop "
2479                                 "rx queue:%d\n",
2480                                 dev->device_fh,
2481                                 vdev->vmdq_rx_q);
2482                 }
2483
2484                 RTE_LOG(DEBUG, VHOST_CONFIG,
2485                         "(%"PRIu64") in destroy_device: Start put mbuf in "
2486                         "mempool back to ring for RX queue: %d\n",
2487                         dev->device_fh, vdev->vmdq_rx_q);
2488
2489                 mbuf_destroy_zcp(vpool);
2490
2491                 /* Stop the TX queue. */
2492                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2493                         RTE_LOG(DEBUG, VHOST_CONFIG,
2494                                 "(%"PRIu64") In destroy_device: Failed to "
2495                                 "stop tx queue:%d\n",
2496                                 dev->device_fh, vdev->vmdq_rx_q);
2497                 }
2498
2499                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2500
2501                 RTE_LOG(DEBUG, VHOST_CONFIG,
2502                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2503                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2504                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2505                         dev->device_fh);
2506
2507                 mbuf_destroy_zcp(vpool);
2508                 rte_free(vdev->regions_hpa);
2509         }
2510         rte_free(vdev);
2511
2512 }
2513
2514 /*
2515  * Calculate the number of physically contiguous sub-regions within one
2516  * particular region whose vhost virtual address is contiguous. The particular
2517  * region starts from vva_start, with a size of 'size' given in the argument.
2518  */
2519 static uint32_t
2520 check_hpa_regions(uint64_t vva_start, uint64_t size)
2521 {
2522         uint32_t i, nregions = 0, page_size = getpagesize();
2523         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2524         if (vva_start % page_size) {
2525                 RTE_LOG(DEBUG, VHOST_CONFIG,
2526                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2527                         "has remainder\n",
2528                         (void *)(uintptr_t)vva_start, page_size);
2529                 return 0;
2530         }
2531         if (size % page_size) {
2532                 RTE_LOG(DEBUG, VHOST_CONFIG,
2533                         "in check_hpa_regions: "
2534                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2535                         size, page_size);
2536                 return 0;
2537         }
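             /*
              * Walk the range one page at a time: translate each page and the
              * page after it to host physical addresses, and count a new
              * sub-region boundary each time the two pages are not physically
              * adjacent.
              */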
2538         for (i = 0; i < size - page_size; i = i + page_size) {
2539                 cur_phys_addr
2540                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2541                 next_phys_addr = rte_mem_virt2phy(
2542                         (void *)(uintptr_t)(vva_start + i + page_size));
2543                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2544                         ++nregions;
2545                         RTE_LOG(DEBUG, VHOST_CONFIG,
2546                                 "in check_hpa_regions: hva addr:(%p) is not "
2547                                 "continuous with hva addr:(%p), diff:%d\n",
2548                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2549                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2550                                 + page_size), page_size);
2551                         RTE_LOG(DEBUG, VHOST_CONFIG,
2552                                 "in check_hpa_regions: hpa addr:(%p) is not "
2553                                 "continuous with hpa addr:(%p), "
2554                                 "diff:(%"PRIu64")\n",
2555                                 (void *)(uintptr_t)cur_phys_addr,
2556                                 (void *)(uintptr_t)next_phys_addr,
2557                                 (next_phys_addr-cur_phys_addr));
2558                 }
2559         }
2560         return nregions;
2561 }
2562
2563 /*
2564  * Divide each region whose vhost virtual address range is contiguous into
2565  * sub-regions whose host physical addresses are also contiguous, and fill
2566  * the offset (relative to the GPA), size, etc. of each sub-region into
2567  * regions_hpa.
2568  */
2569 static uint32_t
2570 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2571 {
2572         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2573         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2574
2575         if (mem_region_hpa == NULL)
2576                 return 0;
2577
2578         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2579                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2580                         virtio_memory->regions[regionidx].address_offset;
2581                 mem_region_hpa[regionidx_hpa].guest_phys_address
2582                         = virtio_memory->regions[regionidx].guest_phys_address;
2583                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2584                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2585                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2586                 RTE_LOG(DEBUG, VHOST_CONFIG,
2587                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2588                         regionidx_hpa,
2589                         (void *)(uintptr_t)
2590                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2591                 RTE_LOG(DEBUG, VHOST_CONFIG,
2592                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2593                         regionidx_hpa,
2594                         (void *)(uintptr_t)
2595                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
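                     /*
                      * Scan this guest region page by page. 'k' accumulates the
                      * size of the current sub-region; whenever the host physical
                      * address of the next page is not adjacent to the current
                      * one, close the current sub-region (record its end and
                      * size) and start a new one at the next page.
                      */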
2596                 for (i = 0, k = 0;
2597                         i < virtio_memory->regions[regionidx].memory_size -
2598                                 page_size;
2599                         i += page_size) {
2600                         cur_phys_addr = rte_mem_virt2phy(
2601                                         (void *)(uintptr_t)(vva_start + i));
2602                         next_phys_addr = rte_mem_virt2phy(
2603                                         (void *)(uintptr_t)(vva_start +
2604                                         i + page_size));
2605                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2606                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2607                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2608                                         k + page_size;
2609                                 mem_region_hpa[regionidx_hpa].memory_size
2610                                         = k + page_size;
2611                                 RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest "
2612                                         "phys addr end  [%d]:(%p)\n",
2613                                         regionidx_hpa,
2614                                         (void *)(uintptr_t)
2615                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2616                                 RTE_LOG(DEBUG, VHOST_CONFIG,
2617                                         "in fill_hpa_regions: guest phys addr "
2618                                         "size [%d]:(%p)\n",
2619                                         regionidx_hpa,
2620                                         (void *)(uintptr_t)
2621                                         (mem_region_hpa[regionidx_hpa].memory_size));
2622                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2623                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2624                                 ++regionidx_hpa;
2625                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2626                                         next_phys_addr -
2627                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2628                                 RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest"
2629                                         " phys addr start[%d]:(%p)\n",
2630                                         regionidx_hpa,
2631                                         (void *)(uintptr_t)
2632                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2633                                 RTE_LOG(DEBUG, VHOST_CONFIG,
2634                                         "in fill_hpa_regions: host  phys addr "
2635                                         "start[%d]:(%p)\n",
2636                                         regionidx_hpa,
2637                                         (void *)(uintptr_t)
2638                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2639                                 k = 0;
2640                         } else {
2641                                 k += page_size;
2642                         }
2643                 }
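                     /* Close the last sub-region of this guest memory region. */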
2644                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2645                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2646                         + k + page_size;
2647                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2648                 RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2649                         "[%d]:(%p)\n", regionidx_hpa,
2650                         (void *)(uintptr_t)
2651                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2652                 RTE_LOG(DEBUG, VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2653                         "[%d]:(%p)\n", regionidx_hpa,
2654                         (void *)(uintptr_t)
2655                         (mem_region_hpa[regionidx_hpa].memory_size));
2656                 ++regionidx_hpa;
2657         }
2658         return regionidx_hpa;
2659 }
2660
2661 /*
2662  * A new device is added to a data core. First the device is added to the main linked list
2663  * and then allocated to a specific data core.
2664  */
2665 static int
2666 new_device (struct virtio_net *dev)
2667 {
2668         struct virtio_net_data_ll *ll_dev;
2669         int lcore, core_add = 0;
2670         uint32_t device_num_min = num_devices;
2671         struct vhost_dev *vdev;
2672         uint32_t regionidx;
2673
2674         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2675         if (vdev == NULL) {
2676                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2677                         dev->device_fh);
2678                 return -1;
2679         }
2680         vdev->dev = dev;
2681         dev->priv = vdev;
2682
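             /*
              * For zero copy, pre-compute the host-physically-contiguous
              * sub-regions of the guest memory so that the guest-allocated
              * buffers can be used directly.
              */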
2683         if (zero_copy) {
2684                 vdev->nregions_hpa = dev->mem->nregions;
2685                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2686                         vdev->nregions_hpa
2687                                 += check_hpa_regions(
2688                                         dev->mem->regions[regionidx].guest_phys_address
2689                                         + dev->mem->regions[regionidx].address_offset,
2690                                         dev->mem->regions[regionidx].memory_size);
2691
2692                 }
2693
2694                 vdev->regions_hpa = rte_calloc("vhost hpa region",
2695                                                vdev->nregions_hpa,
2696                                                sizeof(struct virtio_memory_regions_hpa),
2697                                                RTE_CACHE_LINE_SIZE);
2698                 if (vdev->regions_hpa == NULL) {
2699                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2700                         rte_free(vdev);
2701                         return -1;
2702                 }
2703
2704
2705                 if (fill_hpa_memory_regions(
2706                         vdev->regions_hpa, dev->mem
2707                         ) != vdev->nregions_hpa) {
2708
2709                         RTE_LOG(ERR, VHOST_CONFIG,
2710                                 "hpa memory regions number mismatch: "
2711                                 "[%d]\n", vdev->nregions_hpa);
2712                         rte_free(vdev->regions_hpa);
2713                         rte_free(vdev);
2714                         return -1;
2715                 }
2716         }
2717
2718
2719         /* Add device to main ll */
2720         ll_dev = get_data_ll_free_entry(&ll_root_free);
2721         if (ll_dev == NULL) {
2722                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2723                         "of %d devices per core has been reached\n",
2724                         dev->device_fh, num_devices);
2725                 if (vdev->regions_hpa)
2726                         rte_free(vdev->regions_hpa);
2727                 rte_free(vdev);
2728                 return -1;
2729         }
2730         ll_dev->vdev = vdev;
2731         add_data_ll_entry(&ll_root_used, ll_dev);
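             /* Each device gets a dedicated VMDq RX queue, derived from its device_fh. */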
2732         vdev->vmdq_rx_q
2733                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
2734
2735         if (zero_copy) {
2736                 uint32_t index = vdev->vmdq_rx_q;
2737                 uint32_t count_in_ring, i;
2738                 struct mbuf_table *tx_q;
2739
2740                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2741
2742                 RTE_LOG(DEBUG, VHOST_CONFIG,
2743                         "(%"PRIu64") in new_device: mbuf count in mempool "
2744                         "before attach is: %d\n",
2745                         dev->device_fh,
2746                         rte_mempool_count(vpool_array[index].pool));
2747                 RTE_LOG(DEBUG, VHOST_CONFIG,
2748                         "(%"PRIu64") in new_device: mbuf count in ring "
2749                         "before attach is: %d\n",
2750                         dev->device_fh, count_in_ring);
2751
2752                 /*
2753                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2754                  */
2755                 for (i = 0; i < count_in_ring; i++)
2756                         attach_rxmbuf_zcp(dev);
2757
2758                 RTE_LOG(DEBUG, VHOST_CONFIG, "(%" PRIu64 ") in new_device: "
2759                         "mbuf count in mempool after attach is: %d\n",
2760                         dev->device_fh,
2761                         rte_mempool_count(vpool_array[index].pool));
2762                 RTE_LOG(DEBUG, VHOST_CONFIG, "(%" PRIu64 ") in new_device: "
2763                         "mbuf count in ring after attach is: %d\n",
2764                         dev->device_fh,
2765                         rte_ring_count(vpool_array[index].ring));
2766
2767                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2768                 tx_q->txq_id = vdev->vmdq_rx_q;
2769
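                     /* Bring up this device's dedicated TX and RX queues on the physical port. */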
2770                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2771                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2772
2773                         RTE_LOG(DEBUG, VHOST_CONFIG,
2774                                 "(%"PRIu64") In new_device: Failed to start "
2775                                 "tx queue:%d\n",
2776                                 dev->device_fh, vdev->vmdq_rx_q);
2777
2778                         mbuf_destroy_zcp(vpool);
2779                         rte_free(vdev->regions_hpa);
2780                         rte_free(vdev);
2781                         return -1;
2782                 }
2783
2784                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2785                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2786
2787                         RTE_LOG(DEBUG, VHOST_CONFIG,
2788                                 "(%"PRIu64") In new_device: Failed to start "
2789                                 "rx queue:%d\n",
2790                                 dev->device_fh, vdev->vmdq_rx_q);
2791
2792                         /* Stop the TX queue. */
2793                         if (rte_eth_dev_tx_queue_stop(ports[0],
2794                                 vdev->vmdq_rx_q) != 0) {
2795                                 RTE_LOG(DEBUG, VHOST_CONFIG,
2796                                         "(%"PRIu64") In new_device: Failed to "
2797                                         "stop tx queue:%d\n",
2798                                         dev->device_fh, vdev->vmdq_rx_q);
2799                         }
2800
2801                         mbuf_destroy_zcp(vpool);
2802                         rte_free(vdev->regions_hpa);
2803                         rte_free(vdev);
2804                         return -1;
2805                 }
2806
2807         }
2808
2809         /* Reset the ready flag. */
2810         vdev->ready = DEVICE_MAC_LEARNING;
2811         vdev->remove = 0;
2812
2813         /* Find a suitable lcore to add the device. */
2814         RTE_LCORE_FOREACH_SLAVE(lcore) {
2815                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2816                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2817                         core_add = lcore;
2818                 }
2819         }
2820         /* Add device to lcore ll */
2821         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2822         if (ll_dev == NULL) {
2823                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2824                 vdev->ready = DEVICE_SAFE_REMOVE;
2825                 destroy_device(dev);
2826                 rte_free(vdev->regions_hpa);
2827                 rte_free(vdev);
2828                 return -1;
2829         }
2830         ll_dev->vdev = vdev;
2831         vdev->coreid = core_add;
2832
2833         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2834
2835         /* Initialize device stats */
2836         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2837
2838         /* Disable notifications. */
2839         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2840         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2841         lcore_info[vdev->coreid].lcore_ll->device_num++;
2842         dev->flags |= VIRTIO_DEV_RUNNING;
2843
2844         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2845
2846         return 0;
2847 }
2848
2849 /*
2850  * These callbacks allow devices to be added to a data core when their
2851  * configuration is fully complete.
2852  */
2853 static const struct virtio_net_device_ops virtio_net_device_ops =
2854 {
2855         .new_device =  new_device,
2856         .destroy_device = destroy_device,
2857 };
2858
2859 /*
2860  * This thread wakes up periodically and prints statistics if the user has
2861  * enabled them.
2862  */
2863 static void
2864 print_stats(void)
2865 {
2866         struct virtio_net_data_ll *dev_ll;
2867         uint64_t tx_dropped, rx_dropped;
2868         uint64_t tx, tx_total, rx, rx_total;
2869         uint32_t device_fh;
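             /* ANSI escape sequences: clear the screen and move the cursor to the top-left corner. */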
2870         const char clr[] = { 27, '[', '2', 'J', '\0' };
2871         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2872
2873         while(1) {
2874                 sleep(enable_stats);
2875
2876                 /* Clear screen and move to top left */
2877                 printf("%s%s", clr, top_left);
2878
2879                 printf("\nDevice statistics ====================================");
2880
2881                 dev_ll = ll_root_used;
2882                 while (dev_ll != NULL) {
2883                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2884                         tx_total = dev_statistics[device_fh].tx_total;
2885                         tx = dev_statistics[device_fh].tx;
2886                         tx_dropped = tx_total - tx;
2887                         if (zero_copy == 0) {
2888                                 rx_total = rte_atomic64_read(
2889                                         &dev_statistics[device_fh].rx_total_atomic);
2890                                 rx = rte_atomic64_read(
2891                                         &dev_statistics[device_fh].rx_atomic);
2892                         } else {
2893                                 rx_total = dev_statistics[device_fh].rx_total;
2894                                 rx = dev_statistics[device_fh].rx;
2895                         }
2896                         rx_dropped = rx_total - rx;
2897
2898                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2899                                         "\nTX total:            %"PRIu64""
2900                                         "\nTX dropped:          %"PRIu64""
2901                                         "\nTX successful:               %"PRIu64""
2902                                         "\nRX total:            %"PRIu64""
2903                                         "\nRX dropped:          %"PRIu64""
2904                                         "\nRX successful:               %"PRIu64"",
2905                                         device_fh,
2906                                         tx_total,
2907                                         tx_dropped,
2908                                         tx,
2909                                         rx_total,
2910                                         rx_dropped,
2911                                         rx);
2912
2913                         dev_ll = dev_ll->next;
2914                 }
2915                 printf("\n======================================================\n");
2916         }
2917 }
2918
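     /*
      * Create the mempool and companion ring used by one zero-copy queue;
      * exits the application if either allocation fails.
      */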
2919 static void
2920 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2921         char *ring_name, uint32_t nb_mbuf)
2922 {
2923         vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2924                 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2925         if (vpool_array[index].pool != NULL) {
2926                 vpool_array[index].ring
2927                         = rte_ring_create(ring_name,
2928                                 rte_align32pow2(nb_mbuf + 1),
2929                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2930                 if (likely(vpool_array[index].ring != NULL)) {
2931                         RTE_LOG(DEBUG, VHOST_CONFIG,
2932                                 "in setup_mempool_tbl: mbuf count in "
2933                                 "mempool is: %d\n",
2934                                 rte_mempool_count(vpool_array[index].pool));
2935                         RTE_LOG(DEBUG, VHOST_CONFIG,
2936                                 "in setup_mempool_tbl: mbuf count in "
2937                                 "ring   is: %d\n",
2938                                 rte_ring_count(vpool_array[index].ring));
2939                 } else {
2940                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2941                                 ring_name);
2942                 }
2943
2944                 /* The mbuf headroom needs to be taken into account. */
2945                 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2946         } else {
2947                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2948         }
2949 }
2950
2951 /* When we receive a SIGINT, unregister the vhost driver. */
2952 static void
2953 sigint_handler(__rte_unused int signum)
2954 {
2955         /* Unregister vhost driver. */
2956         int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2957         if (ret != 0)
2958                 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2959         exit(0);
2960 }
2961
2962 /*
2963  * Main function: performs initialisation and calls the per-lcore functions. The CUSE
2964  * device is also registered here to handle the IOCTLs.
2965  */
2966 int
2967 main(int argc, char *argv[])
2968 {
2969         struct rte_mempool *mbuf_pool = NULL;
2970         unsigned lcore_id, core_id = 0;
2971         unsigned nb_ports, valid_num_ports;
2972         int ret;
2973         uint8_t portid;
2974         uint16_t queue_id;
2975         static pthread_t tid;
2976         char thread_name[RTE_MAX_THREAD_NAME_LEN];
2977
2978         signal(SIGINT, sigint_handler);
2979
2980         /* init EAL */
2981         ret = rte_eal_init(argc, argv);
2982         if (ret < 0)
2983                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2984         argc -= ret;
2985         argv += ret;
2986
2987         /* parse app arguments */
2988         ret = us_vhost_parse_args(argc, argv);
2989         if (ret < 0)
2990                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2991
2992         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
2993                 if (rte_lcore_is_enabled(lcore_id))
2994                         lcore_ids[core_id ++] = lcore_id;
2995
2996         if (rte_lcore_count() > RTE_MAX_LCORE)
2997                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
2998
2999         /* Set the number of switching cores available. */
3000         num_switching_cores = rte_lcore_count()-1;
3001
3002         /* Get the number of physical ports. */
3003         nb_ports = rte_eth_dev_count();
3004         if (nb_ports > RTE_MAX_ETHPORTS)
3005                 nb_ports = RTE_MAX_ETHPORTS;
3006
3007         /*
3008          * Update the global variable num_ports and the global array ports,
3009          * and get the value of valid_num_ports according to the number of ports in the system.
3010          */
3011         valid_num_ports = check_ports_num(nb_ports);
3012
3013         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3014                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
3015                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
3016                 return -1;
3017         }
3018
3019         if (zero_copy == 0) {
3020                 /* Create the mbuf pool. */
3021                 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
3022                         NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
3023                         0, MBUF_DATA_SIZE, rte_socket_id());
3024                 if (mbuf_pool == NULL)
3025                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3026
3027                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3028                         vpool_array[queue_id].pool = mbuf_pool;
3029
3030                 if (vm2vm_mode == VM2VM_HARDWARE) {
3031                         /* Enable VT loopback so the NIC's L2 switch forwards VM-to-VM traffic. */
3032                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3033                         RTE_LOG(DEBUG, VHOST_CONFIG,
3034                                 "Enable loop back for L2 switch in vmdq.\n");
3035                 }
3036         } else {
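                     /*
                      * Zero-copy mode: create a separate mempool and ring pair
                      * for each RX queue and each TX queue.
                      */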
3037                 uint32_t nb_mbuf;
3038                 char pool_name[RTE_MEMPOOL_NAMESIZE];
3039                 char ring_name[RTE_MEMPOOL_NAMESIZE];
3040
3041                 nb_mbuf = num_rx_descriptor
3042                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3043                         + num_switching_cores * MAX_PKT_BURST;
3044
3045                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3046                         snprintf(pool_name, sizeof(pool_name),
3047                                 "rxmbuf_pool_%u", queue_id);
3048                         snprintf(ring_name, sizeof(ring_name),
3049                                 "rxmbuf_ring_%u", queue_id);
3050                         setup_mempool_tbl(rte_socket_id(), queue_id,
3051                                 pool_name, ring_name, nb_mbuf);
3052                 }
3053
3054                 nb_mbuf = num_tx_descriptor
3055                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3056                                 + num_switching_cores * MAX_PKT_BURST;
3057
3058                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3059                         snprintf(pool_name, sizeof(pool_name),
3060                                 "txmbuf_pool_%u", queue_id);
3061                         snprintf(ring_name, sizeof(ring_name),
3062                                 "txmbuf_ring_%u", queue_id);
3063                         setup_mempool_tbl(rte_socket_id(),
3064                                 (queue_id + MAX_QUEUES),
3065                                 pool_name, ring_name, nb_mbuf);
3066                 }
3067
3068                 if (vm2vm_mode == VM2VM_HARDWARE) {
3069                         /* Enable VT loopback so the NIC's L2 switch forwards VM-to-VM traffic. */
3070                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3071                         RTE_LOG(DEBUG, VHOST_CONFIG,
3072                                 "Enable loop back for L2 switch in vmdq.\n");
3073                 }
3074         }
3075
3076         /* initialize all ports */
3077         for (portid = 0; portid < nb_ports; portid++) {
3078                 /* skip ports that are not enabled */
3079                 if ((enabled_port_mask & (1 << portid)) == 0) {
3080                         RTE_LOG(INFO, VHOST_PORT,
3081                                 "Skipping disabled port %d\n", portid);
3082                         continue;
3083                 }
3084                 if (port_init(portid) != 0)
3085                         rte_exit(EXIT_FAILURE,
3086                                 "Cannot initialize network ports\n");
3087         }
3088
3089         /* Initialise all linked lists. */
3090         if (init_data_ll() == -1)
3091                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3092
3093         /* Initialize device stats */
3094         memset(&dev_statistics, 0, sizeof(dev_statistics));
3095
3096         /* Enable stats if the user option is set. */
3097         if (enable_stats) {
3098                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3099                 if (ret != 0)
3100                         rte_exit(EXIT_FAILURE,
3101                                 "Cannot create print-stats thread\n");
3102
3103                 /* Set thread_name for aid in debugging.  */
3104                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3105                 ret = rte_thread_setname(tid, thread_name);
3106                 if (ret != 0)
3107                         RTE_LOG(ERR, VHOST_CONFIG,
3108                                 "Cannot set print-stats name\n");
3109         }
3110
3111         /* Launch all data cores. */
3112         if (zero_copy == 0) {
3113                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3114                         rte_eal_remote_launch(switch_worker,
3115                                 mbuf_pool, lcore_id);
3116                 }
3117         } else {
3118                 uint32_t count_in_mempool, index, i;
3119                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3120                         /* For all RX and TX queues. */
3121                         count_in_mempool
3122                                 = rte_mempool_count(vpool_array[index].pool);
3123
3124                         /*
3125                          * Transfer all un-attached mbufs from vpool.pool
3126                          * to vpool.ring.
3127                          */
3128                         for (i = 0; i < count_in_mempool; i++) {
3129                                 struct rte_mbuf *mbuf
3130                                         = __rte_mbuf_raw_alloc(
3131                                                 vpool_array[index].pool);
3132                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3133                                                 (void *)mbuf);
3134                         }
3135
3136                         RTE_LOG(DEBUG, VHOST_CONFIG,
3137                                 "in main: mbuf count in mempool at initial "
3138                                 "is: %d\n", count_in_mempool);
3139                         RTE_LOG(DEBUG, VHOST_CONFIG,
3140                                 "in main: mbuf count in ring at initial is: "
3141                                 "%d\n",
3142                                 rte_ring_count(vpool_array[index].ring));
3143                 }
3144
3145                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3146                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3147                                 lcore_id);
3148         }
3149
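             /* If mergeable RX buffers were not requested, remove the feature from the set offered to the guest. */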
3150         if (mergeable == 0)
3151                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3152
3153         /* Register the vhost (CUSE or user) driver to handle vhost messages. */
3154         ret = rte_vhost_driver_register((char *)&dev_basename);
3155         if (ret != 0)
3156                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3157
3158         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3159
3160         /* Start the vhost driver session (CUSE or vhost-user). */
3161         rte_vhost_driver_session_start();
3162         return 0;
3163
3164 }