mbuf: replace data pointer by an offset
[dpdk.git] examples/vhost/main.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>

#include "main.h"
#include "virtio-net.h"
#include "vhost-net-cdev.h"

#define MAX_QUEUES 128

/* The maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) + \
                        (num_switching_cores * MAX_PKT_BURST) + \
                        (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
                        (num_switching_cores * MBUF_CACHE_SIZE))
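/*
 * The total above covers the RX descriptors of every VMDQ queue, plus an
 * in-flight burst, a full TX ring and a mempool cache per switching core.
 */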

#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers and
 * vhost uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
        + RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default value of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default value of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default value of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default value of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default value of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default value of TX write-back threshold reg. */

#define MAX_PKT_BURST 32        /* Max burst size for RX/TX */
#define MAX_MRG_PKT_BURST 16    /* Max burst for merge buffers. Set to 1 due to performance issue. */
#define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4      /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX           1
#define DEVICE_SAFE_REMOVE  2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/*
 * These two macros need refining for the legacy and DPDK-based front ends:
 * take the max vring avail descriptors/entries from the guest, subtract
 * MAX_PKT_BURST, then adjust to a power of 2.
 */
/*
 * For the legacy front end, 128 descriptors:
 * half for the virtio header, the other half for mbuf data.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
                + sizeof(struct rte_mbuf)))
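/*
 * This gives the zero copy path a place to stash per-mbuf metadata (such
 * as the index of the backing guest descriptor) without enlarging the
 * mbuf structure itself.
 */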

/* True if x is a power of 2 (note: also true for x == 0). */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)
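/* e.g. POWEROF2(64) is true (64 & 63 == 0); POWEROF2(48) is false (48 & 47 == 32). */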

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
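/* With 64-byte cache lines and the 16-byte struct vring_desc this is 4. */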

/* Mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* Number of devices/queues to support */
static uint32_t num_queues = 0;
uint32_t num_devices = 0;

/*
 * Enable zero copy: packet buffers are DMA'd directly to the HW
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;

/* Number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptors: ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
        struct rte_mempool *pool;
        struct rte_ring *ring;
        uint32_t buf_size;
} vpool_array[MAX_QUEUES + MAX_QUEUES];

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
        VM2VM_DISABLED = 0,
        VM2VM_SOFTWARE = 1,
        VM2VM_HARDWARE = 2,
        VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
        PHYS_ADDR_CONTINUOUS = 0,
        PHYS_ADDR_CROSS_SUBREG = 1,
        PHYS_ADDR_INVALID = 2,
        PHYS_ADDR_LAST
} hpa_type;
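/*
 * CONTINUOUS: the whole buffer maps to one host-physically contiguous
 * region; CROSS_SUBREG: it straddles a sub-region boundary and must be
 * handled in pieces; INVALID: no mapping was found.
 */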

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Character device index. Can be set by user. */
static uint32_t dev_index = 0;

/* This can be set by the user so it is made available here. */
extern uint64_t VHOST_FEATURES;

/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
        .rx_thresh = {
                .pthresh = RX_PTHRESH,
                .hthresh = RX_HTHRESH,
                .wthresh = RX_WTHRESH,
        },
        .rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
        .tx_thresh = {
                .pthresh = TX_PTHRESH,
                .hthresh = TX_HTHRESH,
                .wthresh = TX_WTHRESH,
        },
        .tx_free_thresh = 0, /* Use PMD default values */
        .tx_rs_thresh = 0, /* Use PMD default values */
};

/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
        .rxmode = {
                .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
                .split_hdr_size = 0,
                .header_split   = 0, /**< Header Split disabled */
                .hw_ip_checksum = 0, /**< IP checksum offload disabled */
                .hw_vlan_filter = 0, /**< VLAN filtering disabled */
                /*
                 * VLAN strip is necessary for 1G NICs such as the I350;
                 * it fixes a bug where IPv4 forwarding in the guest could
                 * not forward packets from one virtio device to another.
                 */
                .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
                .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
                .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        },

        .txmode = {
                .mq_mode = ETH_MQ_TX_NONE,
        },
        .rx_adv_conf = {
                /*
                 * Should be overridden separately in code with
                 * appropriate values.
                 */
                .vmdq_rx_conf = {
                        .nb_queue_pools = ETH_8_POOLS,
                        .enable_default_pool = 0,
                        .default_pool = 0,
                        .nb_pool_maps = 0,
                        .pool_map = {{0, 0},},
                },
        },
};

static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
        1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
        1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
        1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
        1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
        1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
        1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
        1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
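/* One tag per VMDQ pool: get_eth_conf() below maps vlan_tags[i] to pool i. */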

/* Ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* Heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
        unsigned len;
        unsigned txq_id;
        struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];

/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
        unsigned char   h_dest[ETH_ALEN];
        unsigned char   h_source[ETH_ALEN];
        __be16          h_vlan_proto;
        __be16          h_vlan_TCI;
        __be16          h_vlan_encapsulated_proto;
};
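/*
 * Standard 802.1Q layout: h_vlan_proto carries ETH_P_8021Q (0x8100) and
 * h_vlan_TCI the priority/VLAN-id tag control information.
 */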

/* IPv4 Header */
struct ipv4_hdr {
        uint8_t  version_ihl;           /**< version and header length */
        uint8_t  type_of_service;       /**< type of service */
        uint16_t total_length;          /**< length of packet */
        uint16_t packet_id;             /**< packet ID */
        uint16_t fragment_offset;       /**< fragmentation offset */
        uint8_t  time_to_live;          /**< time to live */
        uint8_t  next_proto_id;         /**< protocol ID */
        uint16_t hdr_checksum;          /**< header checksum */
        uint32_t src_addr;              /**< source address */
        uint32_t dst_addr;              /**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18

/* Per-device statistics struct */
struct device_statistics {
        uint64_t tx_total;
        rte_atomic64_t rx_total_atomic;
        uint64_t rx_total;
        uint64_t tx;
        rte_atomic64_t rx_atomic;
        uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
        struct rte_eth_vmdq_rx_conf conf;
        unsigned i;

        memset(&conf, 0, sizeof(conf));
        conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
        conf.nb_pool_maps = num_devices;
        conf.enable_loop_back =
                vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

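        /* Steer packets tagged vlan_tags[i] to pool (device) i. */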
        for (i = 0; i < conf.nb_pool_maps; i++) {
                conf.pool_map[i].vlan_id = vlan_tags[i];
                conf.pool_map[i].pools = (1UL << i);
        }

        (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
        (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
                   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
        return 0;
}

/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
        if (num_devices > max_nb_devices) {
                RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
                return -1;
        }
        return 0;
}

/*
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint8_t port)
{
        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf;
        uint16_t rx_rings, tx_rings;
        uint16_t rx_ring_size, tx_ring_size;
        int retval;
        uint16_t q;

        /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
        rte_eth_dev_info_get(port, &dev_info);

        /* Configure the number of supported virtio devices based on VMDQ limits */
        num_devices = dev_info.max_vmdq_pools;
        num_queues = dev_info.max_rx_queues;

        if (zero_copy) {
                rx_ring_size = num_rx_descriptor;
                tx_ring_size = num_tx_descriptor;
                tx_rings = dev_info.max_tx_queues;
        } else {
                rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
                tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
                tx_rings = (uint16_t)rte_lcore_count();
        }

        retval = validate_num_devices(MAX_DEVICES);
        if (retval < 0)
                return retval;

        /* Get port configuration. */
        retval = get_eth_conf(&port_conf, num_devices);
        if (retval < 0)
                return retval;

        if (port >= rte_eth_dev_count())
                return -1;

        rx_rings = (uint16_t)num_queues;
        /* Configure ethernet device. */
        retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
        if (retval != 0)
                return retval;

        /* Setup the queues. */
        for (q = 0; q < rx_rings; q++) {
                retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
                                                rte_eth_dev_socket_id(port), &rx_conf_default,
                                                vpool_array[q].pool);
                if (retval < 0)
                        return retval;
        }
        for (q = 0; q < tx_rings; q++) {
                retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
                                                rte_eth_dev_socket_id(port), &tx_conf_default);
                if (retval < 0)
                        return retval;
        }

        /* Start the device. */
        retval = rte_eth_dev_start(port);
        if (retval < 0) {
                RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
                return retval;
        }

        rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
        RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
        RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
                        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
                        (unsigned)port,
                        vmdq_ports_eth_addr[port].addr_bytes[0],
                        vmdq_ports_eth_addr[port].addr_bytes[1],
                        vmdq_ports_eth_addr[port].addr_bytes[2],
                        vmdq_ports_eth_addr[port].addr_bytes[3],
                        vmdq_ports_eth_addr[port].addr_bytes[4],
                        vmdq_ports_eth_addr[port].addr_bytes[5]);

        return 0;
}

/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
        /*
         * Reject basenames that cannot fit (including the terminating
         * NUL) in dev_basename, rather than truncating them silently.
         */
        if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
                return -1;

        snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

        return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
        char *end = NULL;
        unsigned long pm;

        errno = 0;

        /* Parse hexadecimal string */
        pm = strtoul(portmask, &end, 16);
        if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (pm == 0)
                return -1;

        return pm;
}
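
/*
 * Example: "-p 0x1" (or "-p 1") enables port 0 only; "-p 3" would select
 * ports 0 and 1, subject to the MAX_SUP_PORTS limit checked later.
 */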

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
        char *end = NULL;
        unsigned long num;

        errno = 0;

        /* Parse unsigned int string */
        num = strtoul(q_arg, &end, 10);
        if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
                return -1;

        if (num > max_valid_value)
                return -1;

        return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
        RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
        "               --vm2vm [0|1|2]\n"
        "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
        "               --dev-basename <name> --dev-index [0-N]\n"
        "               --nb-devices ND\n"
        "               -p PORTMASK: Set mask for ports to be used by application\n"
        "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
        "               --rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
        "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on RX are enabled\n"
        "               --rx-retry-num [0-N]: the number of retries on RX. Takes effect only if retries on RX are enabled\n"
        "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
        "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
        "               --dev-basename: The basename to be used for the character device.\n"
        "               --dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
        "               --zero-copy [0|1]: disable(default)/enable rx/tx "
                        "zero copy\n"
        "               --rx-desc-num [0-N]: the number of descriptors on rx, "
                        "used only when zero copy is enabled.\n"
        "               --tx-desc-num [0-N]: the number of descriptors on tx, "
                        "used only when zero copy is enabled.\n",
               prgname);
}

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
        int opt, ret;
        int option_index;
        unsigned i;
        const char *prgname = argv[0];
        static struct option long_option[] = {
                {"vm2vm", required_argument, NULL, 0},
                {"rx-retry", required_argument, NULL, 0},
                {"rx-retry-delay", required_argument, NULL, 0},
                {"rx-retry-num", required_argument, NULL, 0},
                {"mergeable", required_argument, NULL, 0},
                {"stats", required_argument, NULL, 0},
                {"dev-basename", required_argument, NULL, 0},
                {"dev-index", required_argument, NULL, 0},
                {"zero-copy", required_argument, NULL, 0},
                {"rx-desc-num", required_argument, NULL, 0},
                {"tx-desc-num", required_argument, NULL, 0},
                {NULL, 0, 0, 0},
        };

        /* Parse command line */
        while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
                switch (opt) {
                /* Portmask */
                case 'p':
                        ret = parse_portmask(optarg);
                        if (ret == -1) {
                                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                                us_vhost_usage(prgname);
                                return -1;
                        }
                        enabled_port_mask = (uint32_t)ret;
                        break;

                case 0:
                        /* Enable/disable vm2vm comms. */
                        if (!strncmp(long_option[option_index].name, "vm2vm",
                                MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument for "
                                                "vm2vm [0|1|2]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        vm2vm_mode = (vm2vm_type)ret;
                                }
                        }

                        /* Enable/disable retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_retry = ret;
                                }
                        }

                        /* Specify the retry delay time (in microseconds) on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_delay_time = ret;
                                }
                        }

                        /* Specify the number of retries on RX. */
                        if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        burst_rx_retry_num = ret;
                                }
                        }

                        /* Enable/disable RX mergeable buffers. */
                        if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        if (ret) {
                                                vmdq_conf_default.rxmode.jumbo_frame = 1;
                                                vmdq_conf_default.rxmode.max_rx_pkt_len
                                                        = JUMBO_FRAME_MAX_SIZE;
                                                VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
                                        }
                                }
                        }

                        /* Enable/disable stats. */
                        if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        enable_stats = ret;
                                }
                        }

                        /* Set character device basename. */
                        if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                                if (us_vhost_parse_basename(optarg) == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                                        us_vhost_usage(prgname);
                                        return -1;
                                }
                        }

                        /* Set character device index. */
                        if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, INT32_MAX);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        dev_index = ret;
                        }

                        /* Enable/disable rx/tx zero copy. */
                        if (!strncmp(long_option[option_index].name,
                                "zero-copy", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, 1);
                                if (ret == -1) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                                "Invalid argument"
                                                " for zero-copy [0|1]\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else
                                        zero_copy = ret;

                                if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
                                        RTE_LOG(ERR, VHOST_CONFIG, "Before running "
                                        "the zero copy vhost app, please "
                                        "disable RTE_MBUF_REFCNT\n"
                                        "in the config file and then rebuild the DPDK "
                                        "core lib!\n"
                                        "Otherwise please disable the zero copy "
                                        "flag in the command line!\n");
                                        return -1;
#endif
                                }
                        }

                        /* Specify the descriptor number on RX. */
                        if (!strncmp(long_option[option_index].name,
                                "rx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for rx-desc-num [0-N], "
                                        "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_rx_descriptor = ret;
                                }
                        }

                        /* Specify the descriptor number on TX. */
                        if (!strncmp(long_option[option_index].name,
                                "tx-desc-num", MAX_LONG_OPT_SZ)) {
                                ret = parse_num_opt(optarg, MAX_RING_DESC);
                                if ((ret == -1) || (!POWEROF2(ret))) {
                                        RTE_LOG(INFO, VHOST_CONFIG,
                                        "Invalid argument for tx-desc-num [0-N], "
                                        "power of 2 required.\n");
                                        us_vhost_usage(prgname);
                                        return -1;
                                } else {
                                        num_tx_descriptor = ret;
                                }
                        }

                        break;

                /* Invalid option - print options. */
                default:
                        us_vhost_usage(prgname);
                        return -1;
                }
        }

        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (enabled_port_mask & (1 << i))
                        ports[num_ports++] = (uint8_t)i;
        }

        if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
                RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
                        "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
                return -1;
        }

        if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support software vm2vm; "
                        "please specify 'vm2vm 2' to use hardware vm2vm.\n");
                return -1;
        }

        if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
                RTE_LOG(INFO, VHOST_PORT,
                        "Vhost zero copy doesn't support jumbo frames; "
                        "please specify '--mergeable 0' to disable the "
                        "mergeable feature.\n");
                return -1;
        }

        return 0;
}

/*
 * Update the global var num_ports and array ports according to the number
 * of system ports and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
        unsigned valid_num_ports = num_ports;
        unsigned portid;

        if (num_ports > nb_ports) {
                RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
                        num_ports, nb_ports);
                num_ports = nb_ports;
        }

        for (portid = 0; portid < num_ports; portid++) {
                if (ports[portid] >= nb_ports) {
                        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                                ports[portid], (nb_ports - 1));
                        ports[portid] = INVALID_PORT_ID;
                        valid_num_ports--;
                }
        }
        return valid_num_ports;
}

/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
        char *pkt_addr = (char *)(addr); \
        unsigned int index; \
        char packet[MAX_PRINT_BUFF]; \
\
        if ((header)) \
                snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
        else \
                snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
        for (index = 0; index < (size); index++) { \
                snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
                        "%02hhx ", pkt_addr[index]); \
        } \
        snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
\
        LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif

/*
 * Function to convert guest physical addresses to vhost virtual addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
{
        struct virtio_memory_regions *region;
        uint32_t regionidx;
        uint64_t vhost_va = 0;

        for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
                region = &dev->mem->regions[regionidx];
                if ((guest_pa >= region->guest_phys_address) &&
                        (guest_pa <= region->guest_phys_address_end)) {
                        vhost_va = region->address_offset + guest_pa;
                        break;
                }
        }
        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| VVA %p\n",
                dev->device_fh, (void *)(uintptr_t)guest_pa, (void *)(uintptr_t)vhost_va);

        return vhost_va;
}
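
/*
 * Within a matching region the translation is a single add:
 * address_offset is precomputed as (vhost virtual base - guest physical
 * base), so vva = gpa + address_offset. Unmapped addresses return 0.
 */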

/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
        uint32_t buf_len, hpa_type *addr_type)
{
        struct virtio_memory_regions_hpa *region;
        uint32_t regionidx;
        uint64_t vhost_pa = 0;

        *addr_type = PHYS_ADDR_INVALID;

        for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) {
                region = &dev->mem->regions_hpa[regionidx];
                if ((guest_pa >= region->guest_phys_address) &&
                        (guest_pa <= region->guest_phys_address_end)) {
                        vhost_pa = region->host_phys_addr_offset + guest_pa;
                        if (likely((guest_pa + buf_len - 1)
                                <= region->guest_phys_address_end))
                                *addr_type = PHYS_ADDR_CONTINUOUS;
                        else
                                *addr_type = PHYS_ADDR_CROSS_SUBREG;
                        break;
                }
        }

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
                dev->device_fh, (void *)(uintptr_t)guest_pa,
                (void *)(uintptr_t)vhost_pa);

        return vhost_pa;
}
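
/*
 * Callers must check addr_type: for a CROSS_SUBREG buffer only the part
 * up to guest_phys_address_end is physically contiguous at the returned
 * HPA, so the access has to be split across sub-regions.
 */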

/*
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue. This function works when mergeable is disabled.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
{
        struct vhost_virtqueue *vq;
        struct vring_desc *desc;
        struct rte_mbuf *buff;
        /* The virtio_hdr is initialised to 0. */
        struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
        uint64_t buff_addr = 0;
        uint64_t buff_hdr_addr = 0;
        uint32_t head[MAX_PKT_BURST], packet_len = 0;
        uint32_t head_idx, packet_success = 0;
        uint32_t retry = 0;
        uint16_t avail_idx, res_cur_idx;
        uint16_t res_base_idx, res_end_idx;
        uint16_t free_entries;
        uint8_t success = 0;

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
        vq = dev->virtqueue[VIRTIO_RXQ];
        count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

        /* As many data cores may want access to available buffers, they need to be reserved. */
        do {
                res_base_idx = vq->last_used_idx_res;
                avail_idx = *((volatile uint16_t *)&vq->avail->idx);

                free_entries = (avail_idx - res_base_idx);
                /* If retry is enabled and the queue is full then we wait and retry to avoid packet loss. */
                if (enable_retry && unlikely(count > free_entries)) {
                        for (retry = 0; retry < burst_rx_retry_num; retry++) {
                                rte_delay_us(burst_rx_delay_time);
                                avail_idx =
                                        *((volatile uint16_t *)&vq->avail->idx);
                                free_entries = (avail_idx - res_base_idx);
                                if (count <= free_entries)
                                        break;
                        }
                }

                /* Check that we have enough buffers */
                if (unlikely(count > free_entries))
                        count = free_entries;

                if (count == 0)
                        return 0;

                res_end_idx = res_base_idx + count;
                /* vq->last_used_idx_res is atomically updated. */
                success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
                                                                        res_end_idx);
        } while (unlikely(success == 0));
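        /*
         * The compare-and-set above atomically claimed avail-ring entries
         * [res_base_idx, res_end_idx) for this core; competing cores loop
         * until their own reservation succeeds.
         */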
        res_cur_idx = res_base_idx;
        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);

        /* Prefetch available ring to retrieve indexes. */
        rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);

        /* Retrieve all of the head indexes first to avoid caching issues. */
        for (head_idx = 0; head_idx < count; head_idx++)
                head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];

        /* Prefetch descriptor index. */
        rte_prefetch0(&vq->desc[head[packet_success]]);

        while (res_cur_idx != res_end_idx) {
                /* Get descriptor from available ring */
                desc = &vq->desc[head[packet_success]];

                buff = pkts[packet_success];

                /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
                buff_addr = gpa_to_vva(dev, desc->addr);
                /* Prefetch buffer address. */
                rte_prefetch0((void *)(uintptr_t)buff_addr);

                /* Copy virtio_hdr to packet and increment buffer address */
                buff_hdr_addr = buff_addr;
                packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

                /*
                 * If the descriptors are chained the header and data are
                 * placed in separate buffers.
                 */
                if (desc->flags & VRING_DESC_F_NEXT) {
                        desc->len = vq->vhost_hlen;
                        desc = &vq->desc[desc->next];
                        /* Buffer address translation. */
                        buff_addr = gpa_to_vva(dev, desc->addr);
                        desc->len = rte_pktmbuf_data_len(buff);
                } else {
                        buff_addr += vq->vhost_hlen;
                        desc->len = packet_len;
                }

                /* Update used ring with desc information */
                vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
                vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;

                /* Copy mbuf data to buffer */
                rte_memcpy((void *)(uintptr_t)buff_addr,
                        rte_pktmbuf_mtod(buff, const void *),
                        rte_pktmbuf_data_len(buff));
                PRINT_PACKET(dev, (uintptr_t)buff_addr,
                        rte_pktmbuf_data_len(buff), 0);

                res_cur_idx++;
                packet_success++;

                rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
                        (const void *)&virtio_hdr, vq->vhost_hlen);

                PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

                if (res_cur_idx < res_end_idx) {
                        /* Prefetch descriptor index. */
                        rte_prefetch0(&vq->desc[head[packet_success]]);
                }
        }

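        /*
         * Make sure all writes to the guest buffers and used-ring entries
         * complete before the used index update below becomes visible.
         */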
        rte_compiler_barrier();

        /* Wait until it's our turn to add our buffer to the used ring. */
        while (unlikely(vq->last_used_idx != res_base_idx))
                rte_pause();

        *(volatile uint16_t *)&vq->used->idx += count;
        vq->last_used_idx = res_end_idx;

        /* Kick the guest if necessary. */
        if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
                eventfd_write((int)vq->kickfd, 1);
        return count;
}

static inline uint32_t __attribute__((always_inline))
copy_from_mbuf_to_vring(struct virtio_net *dev,
        uint16_t res_base_idx, uint16_t res_end_idx,
        struct rte_mbuf *pkt)
{
        uint32_t vec_idx = 0;
        uint32_t entry_success = 0;
        struct vhost_virtqueue *vq;
        /* The virtio_hdr is initialised to 0. */
        struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
                {0, 0, 0, 0, 0, 0}, 0};
        uint16_t cur_idx = res_base_idx;
        uint64_t vb_addr = 0;
        uint64_t vb_hdr_addr = 0;
        uint32_t seg_offset = 0;
        uint32_t vb_offset = 0;
        uint32_t seg_avail;
        uint32_t vb_avail;
        uint32_t cpy_len, entry_len;

        if (pkt == NULL)
                return 0;

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
                "End Index %d\n",
                dev->device_fh, cur_idx, res_end_idx);

        /*
         * Convert from gpa to vva
         * (guest physical addr -> vhost virtual addr)
         */
        vq = dev->virtqueue[VIRTIO_RXQ];
        vb_addr =
                gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
        vb_hdr_addr = vb_addr;

        /* Prefetch buffer address. */
        rte_prefetch0((void *)(uintptr_t)vb_addr);

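        /*
         * For mergeable RX the header's num_buffers field tells the guest
         * how many used-ring entries this one packet occupies.
         */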
        virtio_hdr.num_buffers = res_end_idx - res_base_idx;

        LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
                dev->device_fh, virtio_hdr.num_buffers);

        rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
                (const void *)&virtio_hdr, vq->vhost_hlen);

        PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);

        seg_avail = rte_pktmbuf_data_len(pkt);
        vb_offset = vq->vhost_hlen;
        vb_avail =
                vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;

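        /* entry_len accumulates the bytes written to the current used-ring
         * entry; it starts with the virtio header. */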
1124         entry_len = vq->vhost_hlen;
1125
1126         if (vb_avail == 0) {
1127                 uint32_t desc_idx =
1128                         vq->buf_vec[vec_idx].desc_idx;
1129                 vq->desc[desc_idx].len = vq->vhost_hlen;
1130
1131                 if ((vq->desc[desc_idx].flags
1132                         & VRING_DESC_F_NEXT) == 0) {
1133                         /* Update used ring with desc information */
1134                         vq->used->ring[cur_idx & (vq->size - 1)].id
1135                                 = vq->buf_vec[vec_idx].desc_idx;
1136                         vq->used->ring[cur_idx & (vq->size - 1)].len
1137                                 = entry_len;
1138
1139                         entry_len = 0;
1140                         cur_idx++;
1141                         entry_success++;
1142                 }
1143
1144                 vec_idx++;
1145                 vb_addr =
1146                         gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
1147
1148                 /* Prefetch buffer address. */
1149                 rte_prefetch0((void *)(uintptr_t)vb_addr);
1150                 vb_offset = 0;
1151                 vb_avail = vq->buf_vec[vec_idx].buf_len;
1152         }
1153
1154         cpy_len = RTE_MIN(vb_avail, seg_avail);
1155
1156         while (cpy_len > 0) {
1157                 /* Copy mbuf data to vring buffer */
1158                 rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
1159                         (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset),
1160                         cpy_len);
1161
1162                 PRINT_PACKET(dev,
1163                         (uintptr_t)(vb_addr + vb_offset),
1164                         cpy_len, 0);
1165
1166                 seg_offset += cpy_len;
1167                 vb_offset += cpy_len;
1168                 seg_avail -= cpy_len;
1169                 vb_avail -= cpy_len;
1170                 entry_len += cpy_len;
1171
1172                 if (seg_avail != 0) {
1173                         /*
1174                          * The virtio buffer in this vring
1175                          * entry reach to its end.
1176                          * But the segment doesn't complete.
1177                          */
1178                         if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
1179                                 VRING_DESC_F_NEXT) == 0) {
1180                                 /* Update used ring with desc information */
1181                                 vq->used->ring[cur_idx & (vq->size - 1)].id
1182                                         = vq->buf_vec[vec_idx].desc_idx;
1183                                 vq->used->ring[cur_idx & (vq->size - 1)].len
1184                                         = entry_len;
1185                                 entry_len = 0;
1186                                 cur_idx++;
1187                                 entry_success++;
1188                         }
1189
1190                         vec_idx++;
1191                         vb_addr = gpa_to_vva(dev,
1192                                 vq->buf_vec[vec_idx].buf_addr);
1193                         vb_offset = 0;
1194                         vb_avail = vq->buf_vec[vec_idx].buf_len;
1195                         cpy_len = RTE_MIN(vb_avail, seg_avail);
1196                 } else {
1197                         /*
1198                          * This current segment complete, need continue to
1199                          * check if the whole packet complete or not.
1200                          */
1201                         pkt = pkt->next;
1202                         if (pkt != NULL) {
1203                                 /*
1204                                  * There are more segments.
1205                                  */
1206                                 if (vb_avail == 0) {
1207                                         /*
1208                                          * This current buffer from vring is
1209                                          * used up, need fetch next buffer
1210                                          * from buf_vec.
1211                                          */
1212                                         uint32_t desc_idx =
1213                                                 vq->buf_vec[vec_idx].desc_idx;
1214                                         vq->desc[desc_idx].len = vb_offset;
1215
1216                                         if ((vq->desc[desc_idx].flags &
1217                                                 VRING_DESC_F_NEXT) == 0) {
1218                                                 uint16_t wrapped_idx =
1219                                                         cur_idx & (vq->size - 1);
1220                                                 /*
1221                                                  * Update used ring with the
1222                                                  * descriptor information
1223                                                  */
1224                                                 vq->used->ring[wrapped_idx].id
1225                                                         = desc_idx;
1226                                                 vq->used->ring[wrapped_idx].len
1227                                                         = entry_len;
1228                                                 entry_success++;
1229                                                 entry_len = 0;
1230                                                 cur_idx++;
1231                                         }
1232
1233                                         /* Get next buffer from buf_vec. */
1234                                         vec_idx++;
1235                                         vb_addr = gpa_to_vva(dev,
1236                                                 vq->buf_vec[vec_idx].buf_addr);
1237                                         vb_avail =
1238                                                 vq->buf_vec[vec_idx].buf_len;
1239                                         vb_offset = 0;
1240                                 }
1241
1242                                 seg_offset = 0;
1243                                 seg_avail = rte_pktmbuf_data_len(pkt);
1244                                 cpy_len = RTE_MIN(vb_avail, seg_avail);
1245                         } else {
1246                                 /*
1247                                  * The whole packet is complete.
1248                                  */
1249                                 uint32_t desc_idx =
1250                                         vq->buf_vec[vec_idx].desc_idx;
1251                                 vq->desc[desc_idx].len = vb_offset;
1252
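                                /*
                                 * The packet ends inside this descriptor
                                 * chain: report a zero length for each
                                 * remaining descriptor, as no data was
                                 * written to them.
                                 */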
1253                                 while (vq->desc[desc_idx].flags &
1254                                         VRING_DESC_F_NEXT) {
1255                                         desc_idx = vq->desc[desc_idx].next;
1256                                         vq->desc[desc_idx].len = 0;
1257                                 }
1258
1259                                 /* Update used ring with desc information */
1260                                 vq->used->ring[cur_idx & (vq->size - 1)].id
1261                                         = vq->buf_vec[vec_idx].desc_idx;
1262                                 vq->used->ring[cur_idx & (vq->size - 1)].len
1263                                         = entry_len;
1264                                 entry_len = 0;
1265                                 cur_idx++;
1266                                 entry_success++;
1267                                 seg_avail = 0;
1268                                 cpy_len = RTE_MIN(vb_avail, seg_avail);
1269                         }
1270                 }
1271         }
1272
1273         return entry_success;
1274 }
1275
1276 /*
1277  * This function adds buffers to the virtio device's RX virtqueue. Buffers can
1278  * be received from the physical port or from another virtio device. A packet
1279  * count is returned to indicate the number of packets that were successfully
1280  * added to the RX queue. This function works for mergeable RX.
1281  */
1282 static inline uint32_t __attribute__((always_inline))
1283 virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
1284         uint32_t count)
1285 {
1286         struct vhost_virtqueue *vq;
1287         uint32_t pkt_idx = 0, entry_success = 0;
1288         uint32_t retry = 0;
1289         uint16_t avail_idx, res_cur_idx;
1290         uint16_t res_base_idx, res_end_idx;
1291         uint8_t success = 0;
1292
1293         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
1294                 dev->device_fh);
1295         vq = dev->virtqueue[VIRTIO_RXQ];
1296         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1297
1298         if (count == 0)
1299                 return 0;
1300
1301         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1302                 uint32_t secure_len = 0;
1303                 uint16_t need_cnt;
1304                 uint32_t vec_idx = 0;
1305                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;
1306                 uint16_t i, id;
1307
1308                 do {
1309                         /*
1310                          * As multiple data cores may want to access the
1311                          * available buffers, they need to be reserved.
1312                          */
1313                         res_base_idx = vq->last_used_idx_res;
1314                         res_cur_idx = res_base_idx;
1315
1316                         do {
1317                                 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1318                                 if (unlikely(res_cur_idx == avail_idx)) {
1319                                         /*
1320                                          * If retry is enabled and the queue is
1321                                          * full then we wait and retry to avoid
1322                                          * packet loss.
1323                                          */
1324                                         if (enable_retry) {
1325                                                 uint8_t cont = 0;
1326                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1327                                                         rte_delay_us(burst_rx_delay_time);
1328                                                         avail_idx =
1329                                                                 *((volatile uint16_t *)&vq->avail->idx);
1330                                                         if (likely(res_cur_idx != avail_idx)) {
1331                                                                 cont = 1;
1332                                                                 break;
1333                                                         }
1334                                                 }
1335                                                 if (cont == 1)
1336                                                         continue;
1337                                         }
1338
1339                                         LOG_DEBUG(VHOST_DATA,
1340                                                 "(%"PRIu64") Failed "
1341                                                 "to get enough desc from "
1342                                                 "vring\n",
1343                                                 dev->device_fh);
1344                                         return pkt_idx;
1345                                 } else {
1346                                         uint16_t wrapped_idx =
1347                                                 (res_cur_idx) & (vq->size - 1);
1348                                         uint32_t idx =
1349                                                 vq->avail->ring[wrapped_idx];
1350                                         uint8_t next_desc;
1351
1352                                         do {
1353                                                 next_desc = 0;
1354                                                 secure_len += vq->desc[idx].len;
1355                                                 if (vq->desc[idx].flags &
1356                                                         VRING_DESC_F_NEXT) {
1357                                                         idx = vq->desc[idx].next;
1358                                                         next_desc = 1;
1359                                                 }
1360                                         } while (next_desc);
1361
1362                                         res_cur_idx++;
1363                                 }
1364                         } while (pkt_len > secure_len);
1365
1366                         /* vq->last_used_idx_res is atomically updated. */
1367                         success = rte_atomic16_cmpset(&vq->last_used_idx_res,
1368                                                         res_base_idx,
1369                                                         res_cur_idx);
1370                 } while (success == 0);
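                /*
                 * The range [res_base_idx, res_cur_idx) is now reserved for
                 * this core: the cmpset only succeeds if no other core moved
                 * last_used_idx_res in the meantime, so concurrent cores
                 * claim disjoint ranges of avail ring entries without a lock.
                 */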
1371
1372                 id = res_base_idx;
1373                 need_cnt = res_cur_idx - res_base_idx;
1374
1375                 for (i = 0; i < need_cnt; i++, id++) {
1376                         uint16_t wrapped_idx = id & (vq->size - 1);
1377                         uint32_t idx = vq->avail->ring[wrapped_idx];
1378                         uint8_t next_desc;
1379                         do {
1380                                 next_desc = 0;
1381                                 vq->buf_vec[vec_idx].buf_addr =
1382                                         vq->desc[idx].addr;
1383                                 vq->buf_vec[vec_idx].buf_len =
1384                                         vq->desc[idx].len;
1385                                 vq->buf_vec[vec_idx].desc_idx = idx;
1386                                 vec_idx++;
1387
1388                                 if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
1389                                         idx = vq->desc[idx].next;
1390                                         next_desc = 1;
1391                                 }
1392                         } while (next_desc);
1393                 }
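                /*
                 * buf_vec now holds the reserved descriptor chains flattened
                 * into one array of (guest address, length, descriptor index)
                 * entries that copy_from_mbuf_to_vring() walks sequentially.
                 */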
1394
1395                 res_end_idx = res_cur_idx;
1396
1397                 entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
1398                         res_end_idx, pkts[pkt_idx]);
1399
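                /*
                 * The compiler barrier keeps the guest-buffer copies above
                 * from being reordered after the used->idx update below
                 * (sufficient on strongly ordered architectures such as x86).
                 */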
1400                 rte_compiler_barrier();
1401
1402                 /*
1403                  * Wait until it's our turn to add our buffer
1404                  * to the used ring.
1405                  */
1406                 while (unlikely(vq->last_used_idx != res_base_idx))
1407                         rte_pause();
1408
1409                 *(volatile uint16_t *)&vq->used->idx += entry_success;
1410                 vq->last_used_idx = res_end_idx;
1411
1412                 /* Kick the guest if necessary. */
1413                 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1414                         eventfd_write((int)vq->kickfd, 1);
1415         }
1416
1417         return count;
1418 }
1419
1420 /*
1421  * Compares a packet destination MAC address to a device MAC address.
1422  */
1423 static inline int __attribute__((always_inline))
1424 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
1425 {
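        /*
         * Both addresses are compared with one 64-bit load and XOR;
         * MAC_ADDR_CMP is expected to mask the result down to the six
         * Ethernet address bytes.
         */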
1426         return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
1427 }
1428
1429 /*
1430  * This function learns the MAC address of the device and registers it, along
1431  * with a VLAN tag, with a VMDQ pool.
1432  */
1433 static int
1434 link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
1435 {
1436         struct ether_hdr *pkt_hdr;
1437         struct virtio_net_data_ll *dev_ll;
1438         int i, ret;
1439
1440         /* Learn MAC address of guest device from packet */
1441         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1442
1443         dev_ll = ll_root_used;
1444
1445         while (dev_ll != NULL) {
1446                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) {
1447                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
1448                         return -1;
1449                 }
1450                 dev_ll = dev_ll->next;
1451         }
1452
1453         for (i = 0; i < ETHER_ADDR_LEN; i++)
1454                 dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
1455
1456         /* vlan_tag currently uses the device_id. */
1457         dev->vlan_tag = vlan_tags[dev->device_fh];
1458
1459         /* Print out VMDQ registration info. */
1460         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
1461                 dev->device_fh,
1462                 dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
1463                 dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
1464                 dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
1465                 dev->vlan_tag);
1466
1467         /* Register the MAC address. */
1468         ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
1469         if (ret)
1470                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
1471                                         dev->device_fh);
1472
1473         /* Enable stripping of the vlan tag as we handle routing. */
1474         rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1);
1475
1476         /* Set device as ready for RX. */
1477         dev->ready = DEVICE_RX;
1478
1479         return 0;
1480 }
1481
1482 /*
1483  * Removes the MAC address and VLAN tag from the VMDQ pool and drains any buffers
1484  * remaining on the RX queue before disabling RX on the device.
1485  */
1486 static inline void
1487 unlink_vmdq(struct virtio_net *dev)
1488 {
1489         unsigned i = 0;
1490         unsigned rx_count;
1491         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1492
1493         if (dev->ready == DEVICE_RX) {
1494                 /* Clear MAC and VLAN settings. */
1495                 rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
1496                 for (i = 0; i < ETHER_ADDR_LEN; i++)
1497                         dev->mac_address.addr_bytes[i] = 0;
1498
1499                 dev->vlan_tag = 0;
1500
1501                 /*Clear out the receive buffers*/
1502                 rx_count = rte_eth_rx_burst(ports[0],
1503                                         (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1504
1505                 while (rx_count) {
1506                         for (i = 0; i < rx_count; i++)
1507                                 rte_pktmbuf_free(pkts_burst[i]);
1508
1509                         rx_count = rte_eth_rx_burst(ports[0],
1510                                         (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1511                 }
1512
1513                 dev->ready = DEVICE_MAC_LEARNING;
1514         }
1515 }
1516
1517 /*
1518  * Check if the packet destination MAC address is for a local device. If so then put
1519  * the packet on that device's RX queue. If not then return.
1520  */
1521 static inline int __attribute__((always_inline))
1522 virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
1523 {
1524         struct virtio_net_data_ll *dev_ll;
1525         struct ether_hdr *pkt_hdr;
1526         uint64_t ret = 0;
1527
1528         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1529
1530         /*get the used devices list*/
1531         dev_ll = ll_root_used;
1532
1533         while (dev_ll != NULL) {
1534                 if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1535                                           &dev_ll->dev->mac_address)) {
1536
1537                         /* Drop the packet if the TX packet is destined for the TX device. */
1538                         if (dev_ll->dev->device_fh == dev->device_fh) {
1539                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1540                                                         dev_ll->dev->device_fh);
1541                                 return 0;
1542                         }
1543
1545                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);
1546
1547                         if (dev_ll->dev->remove) {
1548                                 /*drop the packet if the device is marked for removal*/
1549                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
1550                         } else {
1551                                 uint32_t mergeable =
1552                                         dev_ll->dev->features &
1553                                         (1 << VIRTIO_NET_F_MRG_RXBUF);
1554
1555                                 /*send the packet to the local virtio device*/
1556                                 if (likely(mergeable == 0))
1557                                         ret = virtio_dev_rx(dev_ll->dev, &m, 1);
1558                                 else
1559                                         ret = virtio_dev_merge_rx(dev_ll->dev,
1560                                                 &m, 1);
1561
1562                                 if (enable_stats) {
1563                                         rte_atomic64_add(
1564                                         &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
1565                                         1);
1566                                         rte_atomic64_add(
1567                                         &dev_statistics[dev_ll->dev->device_fh].rx_atomic,
1568                                         ret);
1569                                         dev_statistics[dev->device_fh].tx_total++;
1570                                         dev_statistics[dev->device_fh].tx += ret;
1571                                 }
1572                         }
1573
1574                         return 0;
1575                 }
1576                 dev_ll = dev_ll->next;
1577         }
1578
1579         return -1;
1580 }
1581
1582 /*
1583  * This function routes the TX packet to the correct interface. This may be a local device
1584  * or the physical port.
1585  */
1586 static inline void __attribute__((always_inline))
1587 virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1588 {
1589         struct mbuf_table *tx_q;
1590         struct vlan_ethhdr *vlan_hdr;
1591         struct rte_mbuf **m_table;
1592         struct rte_mbuf *mbuf, *prev;
1593         unsigned len, ret, offset = 0;
1594         const uint16_t lcore_id = rte_lcore_id();
1595         struct virtio_net_data_ll *dev_ll = ll_root_used;
1596         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1597
1598         /*check if destination is local VM*/
1599         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
1600                 return;
1601
1602         if (vm2vm_mode == VM2VM_HARDWARE) {
1603                 while (dev_ll != NULL) {
1604                         if ((dev_ll->dev->ready == DEVICE_RX)
1605                                 && ether_addr_cmp(&(pkt_hdr->d_addr),
1606                                 &dev_ll->dev->mac_address)) {
1607                                 /*
1608                                  * Drop the packet if the TX packet is
1609                                  * destined for the TX device.
1610                                  */
1611                                 if (dev_ll->dev->device_fh == dev->device_fh) {
1612                                         LOG_DEBUG(VHOST_DATA,
1613                                         "(%"PRIu64") TX: Source and destination"
1614                                         " MAC addresses are the same. Dropping "
1615                                         "packet.\n",
1616                                         dev_ll->dev->device_fh);
1617                                         return;
1618                                 }
1619                                 offset = 4;
1620                                 vlan_tag =
1621                                 (uint16_t)
1622                                 vlan_tags[(uint16_t)dev_ll->dev->device_fh];
1623
1624                                 LOG_DEBUG(VHOST_DATA,
1625                                 "(%"PRIu64") TX: pkt to local VM device id:"
1626                                 "(%"PRIu64") vlan tag: %d.\n",
1627                                 dev->device_fh, dev_ll->dev->device_fh,
1628                                 vlan_tag);
1629
1630                                 break;
1631                         }
1632                         dev_ll = dev_ll->next;
1633                 }
1634         }
1635
1636         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1637
1638         /*Add packet to the port tx queue*/
1639         tx_q = &lcore_tx_queue[lcore_id];
1640         len = tx_q->len;
1641
1642         /* Allocate an mbuf and populate the structure. */
1643         mbuf = rte_pktmbuf_alloc(mbuf_pool);
1644         if (unlikely(mbuf == NULL)) {
1645                 RTE_LOG(ERR, VHOST_DATA,
1646                         "Failed to allocate memory for mbuf.\n");
1647                 return;
1648         }
1649
1650         mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1651         mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1652         mbuf->nb_segs = m->nb_segs;
1653
1654         /* Copy ethernet header to mbuf. */
1655         rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1656                 rte_pktmbuf_mtod(m, const void *),
1657                 ETH_HLEN);
1658
1660         /* Set up the VLAN header; multi-byte fields are converted to network byte order with htons(). */
1661         vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1662         vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1663         vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1664         vlan_hdr->h_vlan_TCI = htons(vlan_tag);
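        /*
         * The frame header now reads:
         *   dst MAC | src MAC | 0x8100 | VLAN TCI | original EtherType
         * with the original payload copied in below.
         */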
1665
1666         /* Copy the remaining packet contents to the mbuf. */
1667         rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1668                 (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1669                 (m->data_len - ETH_HLEN));
1670
1671         /* Copy the remaining segments for the whole packet. */
1672         prev = mbuf;
1673         while (m->next) {
1674                 /* Allocate an mbuf and populate the structure. */
1675                 struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1676                 if (unlikely(next_mbuf == NULL)) {
1677                         rte_pktmbuf_free(mbuf);
1678                         RTE_LOG(ERR, VHOST_DATA,
1679                                 "Failed to allocate memory for mbuf.\n");
1680                         return;
1681                 }
1682
1683                 m = m->next;
1684                 prev->next = next_mbuf;
1685                 prev = next_mbuf;
1686                 next_mbuf->data_len = m->data_len;
1687
1688                 /* Copy data to next mbuf. */
1689                 rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1690                         rte_pktmbuf_mtod(m, const void *), m->data_len);
1691         }
1692
1693         tx_q->m_table[len] = mbuf;
1694         len++;
1695         if (enable_stats) {
1696                 dev_statistics[dev->device_fh].tx_total++;
1697                 dev_statistics[dev->device_fh].tx++;
1698         }
1699
1700         if (unlikely(len == MAX_PKT_BURST)) {
1701                 m_table = (struct rte_mbuf **)tx_q->m_table;
1702                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1703                 /* Free any buffers not handled by TX and update the port stats. */
1704                 if (unlikely(ret < len)) {
1705                         do {
1706                                 rte_pktmbuf_free(m_table[ret]);
1707                         } while (++ret < len);
1708                 }
1709
1710                 len = 0;
1711         }
1712
1713         tx_q->len = len;
1714         return;
1715 }
1716
1717 static inline void __attribute__((always_inline))
1718 virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
1719 {
1720         struct rte_mbuf m;
1721         struct vhost_virtqueue *vq;
1722         struct vring_desc *desc;
1723         uint64_t buff_addr = 0;
1724         uint32_t head[MAX_PKT_BURST];
1725         uint32_t used_idx;
1726         uint32_t i;
1727         uint16_t free_entries, packet_success = 0;
1728         uint16_t avail_idx;
1729
1730         vq = dev->virtqueue[VIRTIO_TXQ];
1731         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1732
1733         /* If there are no available buffers then return. */
1734         if (vq->last_used_idx == avail_idx)
1735                 return;
1736
1737         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1738
1739         /* Prefetch available ring to retrieve head indexes. */
1740         rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
1741
1742         /*get the number of free entries in the ring*/
1743         free_entries = (avail_idx - vq->last_used_idx);
1744
1745         /* Limit to MAX_PKT_BURST. */
1746         if (free_entries > MAX_PKT_BURST)
1747                 free_entries = MAX_PKT_BURST;
1748
1749         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
1750         /* Retrieve all of the head indexes first to avoid caching issues. */
1751         for (i = 0; i < free_entries; i++)
1752                 head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
1753
1754         /* Prefetch descriptor index. */
1755         rte_prefetch0(&vq->desc[head[packet_success]]);
1756         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1757
1758         while (packet_success < free_entries) {
1759                 desc = &vq->desc[head[packet_success]];
1760
1761                 /* Discard first buffer as it is the virtio header */
1762                 desc = &vq->desc[desc->next];
1763
1764                 /* Buffer address translation. */
1765                 buff_addr = gpa_to_vva(dev, desc->addr);
1766                 /* Prefetch buffer address. */
1767                 rte_prefetch0((void*)(uintptr_t)buff_addr);
1768
1769                 used_idx = vq->last_used_idx & (vq->size - 1);
1770
1771                 if (packet_success < (free_entries - 1)) {
1772                         /* Prefetch descriptor index. */
1773                         rte_prefetch0(&vq->desc[head[packet_success+1]]);
1774                         rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
1775                 }
1776
1777                 /* Update used index buffer information. */
1778                 vq->used->ring[used_idx].id = head[packet_success];
1779                 vq->used->ring[used_idx].len = 0;
1780
1781                 /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
1782                 m.data_len = desc->len;
1783                 m.pkt_len = desc->len;
1784                 m.data_off = 0;
                /* The stack mbuf is otherwise uninitialised; point it at the guest buffer as a single segment. */
                m.buf_addr = (void *)(uintptr_t)buff_addr;
                m.nb_segs = 1;
                m.next = NULL;
1785
1786                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1787
1788                 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1789                 if (dev->ready == DEVICE_MAC_LEARNING) {
1790                         if (dev->remove || (link_vmdq(dev, &m) == -1)) {
1791                                 /*discard frame if device is scheduled for removal or a duplicate MAC address is found. */
1792                                 packet_success += free_entries;
1793                                 vq->last_used_idx += packet_success;
1794                                 break;
1795                         }
1796                 }
1797                 virtio_tx_route(dev, &m, mbuf_pool, (uint16_t)dev->device_fh);
1798
1799                 vq->last_used_idx++;
1800                 packet_success++;
1801         }
1802
1803         rte_compiler_barrier();
1804         vq->used->idx += packet_success;
1805         /* Kick guest if required. */
1806         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1807                 eventfd_write((int)vq->kickfd, 1);
1808 }
1809
1810 /* This function works for TX packets with mergeable feature enabled. */
1811 static inline void __attribute__((always_inline))
1812 virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
1813 {
1814         struct rte_mbuf *m, *prev;
1815         struct vhost_virtqueue *vq;
1816         struct vring_desc *desc;
1817         uint64_t vb_addr = 0;
1818         uint32_t head[MAX_PKT_BURST];
1819         uint32_t used_idx;
1820         uint32_t i;
1821         uint16_t free_entries, entry_success = 0;
1822         uint16_t avail_idx;
1823         uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf)
1824                         + RTE_PKTMBUF_HEADROOM);
1825
1826         vq = dev->virtqueue[VIRTIO_TXQ];
1827         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1828
1829         /* If there are no available buffers then return. */
1830         if (vq->last_used_idx == avail_idx)
1831                 return;
1832
1833         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
1834                 dev->device_fh);
1835
1836         /* Prefetch available ring to retrieve head indexes. */
1837         rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
1838
1839         /*get the number of free entries in the ring*/
1840         free_entries = (avail_idx - vq->last_used_idx);
1841
1842         /* Limit to MAX_PKT_BURST. */
1843         free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
1844
1845         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1846                 dev->device_fh, free_entries);
1847         /* Retrieve all of the head indexes first to avoid caching issues. */
1848         for (i = 0; i < free_entries; i++)
1849                 head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
1850
1851         /* Prefetch descriptor index. */
1852         rte_prefetch0(&vq->desc[head[entry_success]]);
1853         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1854
1855         while (entry_success < free_entries) {
1856                 uint32_t vb_avail, vb_offset;
1857                 uint32_t seg_avail, seg_offset;
1858                 uint32_t cpy_len;
1859                 uint32_t seg_num = 0;
1860                 struct rte_mbuf *cur;
1861                 uint8_t alloc_err = 0;
1862
1863                 desc = &vq->desc[head[entry_success]];
1864
1865                 /* Discard first buffer as it is the virtio header */
1866                 desc = &vq->desc[desc->next];
1867
1868                 /* Buffer address translation. */
1869                 vb_addr = gpa_to_vva(dev, desc->addr);
1870                 /* Prefetch buffer address. */
1871                 rte_prefetch0((void *)(uintptr_t)vb_addr);
1872
1873                 used_idx = vq->last_used_idx & (vq->size - 1);
1874
1875                 if (entry_success < (free_entries - 1)) {
1876                         /* Prefetch descriptor index. */
1877                         rte_prefetch0(&vq->desc[head[entry_success+1]]);
1878                         rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
1879                 }
1880
1881                 /* Update used index buffer information. */
1882                 vq->used->ring[used_idx].id = head[entry_success];
1883                 vq->used->ring[used_idx].len = 0;
1884
1885                 vb_offset = 0;
1886                 vb_avail = desc->len;
1887                 seg_offset = 0;
1888                 seg_avail = buf_size;
1889                 cpy_len = RTE_MIN(vb_avail, seg_avail);
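                /*
                 * vb_offset/vb_avail track the current virtio buffer and
                 * seg_offset/seg_avail the current mbuf segment; each pass of
                 * the loop below copies min(vb_avail, seg_avail) bytes and
                 * then refills whichever side ran out.
                 */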
1890
1891                 PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
1892
1893                 /* Allocate an mbuf and populate the structure. */
1894                 m = rte_pktmbuf_alloc(mbuf_pool);
1895                 if (unlikely(m == NULL)) {
1896                         RTE_LOG(ERR, VHOST_DATA,
1897                                 "Failed to allocate memory for mbuf.\n");
1898                         return;
1899                 }
1900
1901                 seg_num++;
1902                 cur = m;
1903                 prev = m;
1904                 while (cpy_len != 0) {
1905                         rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
1906                                 (void *)((uintptr_t)(vb_addr + vb_offset)),
1907                                 cpy_len);
1908
1909                         seg_offset += cpy_len;
1910                         vb_offset += cpy_len;
1911                         vb_avail -= cpy_len;
1912                         seg_avail -= cpy_len;
1913
1914                         if (vb_avail != 0) {
1915                                 /*
1916                                  * The segment has reached its end,
1917                                  * while the virtio buffer in the TX vring
1918                                  * has more data to be copied.
1919                                  */
1920                                 cur->data_len = seg_offset;
1921                                 m->pkt_len += seg_offset;
1922                                 /* Allocate mbuf and populate the structure. */
1923                                 cur = rte_pktmbuf_alloc(mbuf_pool);
1924                                 if (unlikely(cur == NULL)) {
1925                                         RTE_LOG(ERR, VHOST_DATA, "Failed to "
1926                                                 "allocate memory for mbuf.\n");
1927                                         rte_pktmbuf_free(m);
1928                                         alloc_err = 1;
1929                                         break;
1930                                 }
1931
1932                                 seg_num++;
1933                                 prev->next = cur;
1934                                 prev = cur;
1935                                 seg_offset = 0;
1936                                 seg_avail = buf_size;
1937                         } else {
1938                                 if (desc->flags & VRING_DESC_F_NEXT) {
1939                                         /*
1940                                          * There are more virtio buffers in
1941                                          * the same vring entry still to be copied.
1942                                          */
1943                                         if (seg_avail == 0) {
1944                                                 /*
1945                                                  * The current segment has no
1946                                                  * room to accommodate more
1947                                                  * data.
1948                                                  */
1949                                                 cur->data_len = seg_offset;
1950                                                 m->pkt_len += seg_offset;
1951                                                 /*
1952                                                  * Allocate an mbuf and
1953                                                  * populate the structure.
1954                                                  */
1955                                                 cur = rte_pktmbuf_alloc(mbuf_pool);
1956                                                 if (unlikely(cur == NULL)) {
1957                                                         RTE_LOG(ERR,
1958                                                                 VHOST_DATA,
1959                                                                 "Failed to "
1960                                                                 "allocate memory "
1961                                                                 "for mbuf\n");
1962                                                         rte_pktmbuf_free(m);
1963                                                         alloc_err = 1;
1964                                                         break;
1965                                                 }
1966                                                 seg_num++;
1967                                                 prev->next = cur;
1968                                                 prev = cur;
1969                                                 seg_offset = 0;
1970                                                 seg_avail = buf_size;
1971                                         }
1972
1973                                         desc = &vq->desc[desc->next];
1974
1975                                         /* Buffer address translation. */
1976                                         vb_addr = gpa_to_vva(dev, desc->addr);
1977                                         /* Prefetch buffer address. */
1978                                         rte_prefetch0((void *)(uintptr_t)vb_addr);
1979                                         vb_offset = 0;
1980                                         vb_avail = desc->len;
1981
1982                                         PRINT_PACKET(dev, (uintptr_t)vb_addr,
1983                                                 desc->len, 0);
1984                                 } else {
1985                                         /* The whole packet completes. */
1986                                         cur->data_len = seg_offset;
1987                                         m->pkt_len += seg_offset;
1988                                         vb_avail = 0;
1989                                 }
1990                         }
1991
1992                         cpy_len = RTE_MIN(vb_avail, seg_avail);
1993                 }
1994
1995                 if (unlikely(alloc_err == 1))
1996                         break;
1997
1998                 m->nb_segs = seg_num;
1999
2000                 /*
2001                  * If this is the first received packet we need to learn
2002                  * the MAC and setup VMDQ
2003                  */
2004                 if (dev->ready == DEVICE_MAC_LEARNING) {
2005                         if (dev->remove || (link_vmdq(dev, m) == -1)) {
2006                                 /*
2007                                  * Discard frame if device is scheduled for
2008                                  * removal or a duplicate MAC address is found.
2009                                  */
2010                                 entry_success = free_entries;
2011                                 vq->last_used_idx += entry_success;
2012                                 rte_pktmbuf_free(m);
2013                                 break;
2014                         }
2015                 }
2016
2017                 virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh);
2018                 vq->last_used_idx++;
2019                 entry_success++;
2020                 rte_pktmbuf_free(m);
2021         }
2022
2023         rte_compiler_barrier();
2024         vq->used->idx += entry_success;
2025         /* Kick guest if required. */
2026         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2027                 eventfd_write((int)vq->kickfd, 1);
2028
2029 }
2030
2031 /*
2032  * This function is called by each data core. It handles all RX/TX registered with the
2033  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
2034  * with all devices in the main linked list.
2035  */
2036 static int
2037 switch_worker(void *arg)
2038 {
2039         struct rte_mempool *mbuf_pool = arg;
2040         struct virtio_net *dev = NULL;
2041         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2042         struct virtio_net_data_ll *dev_ll;
2043         struct mbuf_table *tx_q;
2044         volatile struct lcore_ll_info *lcore_ll;
2045         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
2046         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2047         unsigned ret, i;
2048         const uint16_t lcore_id = rte_lcore_id();
2049         const uint16_t num_cores = (uint16_t)rte_lcore_count();
2050         uint16_t rx_count = 0;
2051         uint32_t mergeable = 0;
2052
2053         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2054         lcore_ll = lcore_info[lcore_id].lcore_ll;
2055         prev_tsc = 0;
2056
2057         tx_q = &lcore_tx_queue[lcore_id];
2058         for (i = 0; i < num_cores; i++) {
2059                 if (lcore_ids[i] == lcore_id) {
2060                         tx_q->txq_id = i;
2061                         break;
2062                 }
2063         }
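        /*
         * Each worker lcore transmits on its own TX queue; the queue index is
         * the lcore's position in the configured lcore_ids[] array, so the TX
         * path needs no locking.
         */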
2064
2065         while (1) {
2066                 cur_tsc = rte_rdtsc();
2067                 /*
2068                  * TX burst queue drain
2069                  */
2070                 diff_tsc = cur_tsc - prev_tsc;
2071                 if (unlikely(diff_tsc > drain_tsc)) {
2072
2073                         if (tx_q->len) {
2074                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);
2075
2076                                 /*Tx any packets in the queue*/
2077                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
2078                                                                            (struct rte_mbuf **)tx_q->m_table,
2079                                                                            (uint16_t)tx_q->len);
2080                                 if (unlikely(ret < tx_q->len)) {
2081                                         do {
2082                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
2083                                         } while (++ret < tx_q->len);
2084                                 }
2085
2086                                 tx_q->len = 0;
2087                         }
2088
2089                         prev_tsc = cur_tsc;
2090
2091                 }
2092
2093                 rte_prefetch0(lcore_ll->ll_root_used);
2094                 /*
2095                  * Inform the configuration core that we have exited the linked list and that no devices are
2096                  * in use if requested.
2097                  */
2098                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2099                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2100
2101                 /*
2102                  * Process devices
2103                  */
2104                 dev_ll = lcore_ll->ll_root_used;
2105
2106                 while (dev_ll != NULL) {
2107                         /* Get the virtio device. */
2108                         dev = dev_ll->dev;
2109                         mergeable =
2110                                 dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
2111
2112                         if (dev->remove) {
2113                                 dev_ll = dev_ll->next;
2114                                 unlink_vmdq(dev);
2115                                 dev->ready = DEVICE_SAFE_REMOVE;
2116                                 continue;
2117                         }
2118                         if (likely(dev->ready == DEVICE_RX)) {
2119                                 /*Handle guest RX*/
2120                                 rx_count = rte_eth_rx_burst(ports[0],
2121                                         (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
2122
2123                                 if (rx_count) {
2124                                         if (likely(mergeable == 0))
2125                                                 ret_count =
2126                                                         virtio_dev_rx(dev,
2127                                                         pkts_burst, rx_count);
2128                                         else
2129                                                 ret_count =
2130                                                         virtio_dev_merge_rx(dev,
2131                                                         pkts_burst, rx_count);
2132
2133                                         if (enable_stats) {
2134                                                 rte_atomic64_add(
2135                                                 &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
2136                                                 rx_count);
2137                                                 rte_atomic64_add(
2138                                                 &dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count);
2139                                         }
2140                                         while (likely(rx_count)) {
2141                                                 rx_count--;
2142                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
2143                                         }
2144
2145                                 }
2146                         }
2147
2148                         if (!dev->remove) {
2149                                 /*Handle guest TX*/
2150                                 if (likely(mergeable == 0))
2151                                         virtio_dev_tx(dev, mbuf_pool);
2152                                 else
2153                                         virtio_dev_merge_tx(dev, mbuf_pool);
2154                         }
2155
2156                         /*move to the next device in the list*/
2157                         dev_ll = dev_ll->next;
2158                 }
2159         }
2160
2161         return 0;
2162 }
2163
2164 /*
2165  * This function gets the number of available ring entries for zero copy RX.
2166  * Only one thread will call this function for a particular virtio device,
2167  * so it is designed as a non-thread-safe function.
2168  */
2169 static inline uint32_t __attribute__((always_inline))
2170 get_available_ring_num_zcp(struct virtio_net *dev)
2171 {
2172         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
2173         uint16_t avail_idx;
2174
2175         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
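        /*
         * Both indexes are free-running uint16_t values; taking the
         * difference modulo 2^16 below yields the correct count even after
         * avail->idx wraps around.
         */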
2176         return (uint16_t)(avail_idx - vq->last_used_idx_res);
2177 }
2178
2179 /*
2180  * This function gets available ring indexes for zero copy RX;
2181  * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
2182  * Only one thread will call this function for a particular virtio device,
2183  * so it is designed as a non-thread-safe function.
2184  */
2185 static inline uint32_t __attribute__((always_inline))
2186 get_available_ring_index_zcp(struct virtio_net *dev,
2187         uint16_t *res_base_idx, uint32_t count)
2188 {
2189         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
2190         uint16_t avail_idx;
2191         uint32_t retry = 0;
2192         uint16_t free_entries;
2193
2194         *res_base_idx = vq->last_used_idx_res;
2195         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2196         free_entries = (avail_idx - *res_base_idx);
2197
2198         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
2199                         "avail idx: %d, "
2200                         "res base idx:%d, free entries:%d\n",
2201                         dev->device_fh, avail_idx, *res_base_idx,
2202                         free_entries);
2203
2204         /*
2205          * If retry is enabled and the queue is full then we wait
2206          * and retry to avoid packet loss.
2207          */
2208         if (enable_retry && unlikely(count > free_entries)) {
2209                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
2210                         rte_delay_us(burst_rx_delay_time);
2211                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2212                         free_entries = (avail_idx - *res_base_idx);
2213                         if (count <= free_entries)
2214                                 break;
2215                 }
2216         }
2217
2218         /*check that we have enough buffers*/
2219         if (unlikely(count > free_entries))
2220                 count = free_entries;
2221
2222         if (unlikely(count == 0)) {
2223                 LOG_DEBUG(VHOST_DATA,
2224                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
2225                         "avail idx: %d, res base idx:%d, free entries:%d\n",
2226                         dev->device_fh, avail_idx,
2227                         *res_base_idx, free_entries);
2228                 return 0;
2229         }
2230
2231         vq->last_used_idx_res = *res_base_idx + count;
2232
2233         return count;
2234 }
2235
2236 /*
2237  * This function puts a descriptor back on the used list.
2238  */
2239 static inline void __attribute__((always_inline))
2240 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
2241 {
2242         uint16_t res_cur_idx = vq->last_used_idx;
2243         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
2244         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
2245         rte_compiler_barrier();
2246         *(volatile uint16_t *)&vq->used->idx += 1;
2247         vq->last_used_idx += 1;
2248
2249         /* Kick the guest if necessary. */
2250         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2251                 eventfd_write((int)vq->kickfd, 1);
2252 }
2253
2254 /*
2255  * This function gets an available descriptor from the virtio vring and an
2256  * unattached mbuf from vpool->ring, and attaches them together. It adjusts the
2257  * offsets of buff_addr and phys_addr according to the PMD implementation,
2258  * otherwise the frame data may be placed at the wrong location in the mbuf.
2259  */
2260 static inline void __attribute__((always_inline))
2261 attach_rxmbuf_zcp(struct virtio_net *dev)
2262 {
2263         uint16_t res_base_idx, desc_idx;
2264         uint64_t buff_addr, phys_addr;
2265         struct vhost_virtqueue *vq;
2266         struct vring_desc *desc;
2267         struct rte_mbuf *mbuf = NULL;
2268         struct vpool *vpool;
2269         hpa_type addr_type;
2270
2271         vpool = &vpool_array[dev->vmdq_rx_q];
2272         vq = dev->virtqueue[VIRTIO_RXQ];
2273
2274         do {
2275                 if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx,
2276                                 1) != 1))
2277                         return;
2278                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
2279
2280                 desc = &vq->desc[desc_idx];
2281                 if (desc->flags & VRING_DESC_F_NEXT) {
2282                         desc = &vq->desc[desc->next];
2283                         buff_addr = gpa_to_vva(dev, desc->addr);
2284                         phys_addr = gpa_to_hpa(dev, desc->addr, desc->len,
2285                                         &addr_type);
2286                 } else {
2287                         buff_addr = gpa_to_vva(dev,
2288                                         desc->addr + vq->vhost_hlen);
2289                         phys_addr = gpa_to_hpa(dev,
2290                                         desc->addr + vq->vhost_hlen,
2291                                         desc->len, &addr_type);
2292                 }
2293
2294                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2295                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
2296                                 " address found when attaching RX frame buffer"
2297                                 " address!\n", dev->device_fh);
2298                         put_desc_to_used_list_zcp(vq, desc_idx);
2299                         continue;
2300                 }
2301
2302                 /*
2303                  * Check if the frame buffer address from guest crosses
2304                  * sub-region or not.
2305                  */
2306                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2307                         RTE_LOG(ERR, VHOST_DATA,
2308                                 "(%"PRIu64") Frame buffer address crossing "
2309                                 "sub-region found when attaching RX frame "
2310                                 "buffer address!\n",
2311                                 dev->device_fh);
2312                         put_desc_to_used_list_zcp(vq, desc_idx);
2313                         continue;
2314                 }
2315         } while (unlikely(phys_addr == 0));
2316
2317         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
2318         if (unlikely(mbuf == NULL)) {
2319                 LOG_DEBUG(VHOST_DATA,
2320                         "(%"PRIu64") in attach_rxmbuf_zcp: "
2321                         "ring_sc_dequeue fail.\n",
2322                         dev->device_fh);
2323                 put_desc_to_used_list_zcp(vq, desc_idx);
2324                 return;
2325         }
2326
2327         if (unlikely(vpool->buf_size > desc->len)) {
2328                 LOG_DEBUG(VHOST_DATA,
2329                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
2330                         "length(%d) of descriptor idx: %d less than room "
2331                         "size required: %d\n",
2332                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
2333                 put_desc_to_used_list_zcp(vq, desc_idx);
2334                 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
2335                 return;
2336         }
2337
2338         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
2339         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
2340         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
2341         mbuf->data_len = desc->len;
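        /* Stash the descriptor index in the mbuf headroom so that
         * virtio_dev_rx_zcp() can recover it when updating the used ring. */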
2342         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2343
2344         LOG_DEBUG(VHOST_DATA,
2345                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
2346                 "descriptor idx:%d\n",
2347                 dev->device_fh, res_base_idx, desc_idx);
2348
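        /*
         * Return the attached mbuf to the mempool as-is: the PMD RX path will
         * allocate it from there and DMA straight into the guest buffer
         * attached above.
         */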
2349         __rte_mbuf_raw_free(mbuf);
2350
2351         return;
2352 }
2353
2354 /*
2355  * Detach an attached packet mbuf -
2356  *  - restore original mbuf address and length values.
2357  *  - reset pktmbuf data and data_len to their default values.
2358  *  All other fields of the given packet mbuf will be left intact.
2359  *
2360  * @param m
2361  *   The attached packet mbuf.
2362  */
2363 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
2364 {
2365         const struct rte_mempool *mp = m->pool;
2366         void *buf = RTE_MBUF_TO_BADDR(m);
2367         uint32_t buf_ofs;
2368         uint32_t buf_len = mp->elt_size - sizeof(*m);
2369         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
2370
2371         m->buf_addr = buf;
2372         m->buf_len = (uint16_t)buf_len;
2373
2374         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
2375                         RTE_PKTMBUF_HEADROOM : m->buf_len;
2376         m->data_off = buf_ofs;
2377
2378         m->data_len = 0;
2379 }
2380
2381 /*
2382  * This function is called after packets have been transmitted. It fetches mbufs
2383  * from vpool->pool, detaches them and puts them into vpool->ring. It also updates
2384  * the used index and kicks the guest if necessary.
2385  */
2386 static inline uint32_t __attribute__((always_inline))
2387 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
2388 {
2389         struct rte_mbuf *mbuf;
2390         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
2391         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
2392         uint32_t index = 0;
2393         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
2394
2395         LOG_DEBUG(VHOST_DATA,
2396                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
2397                 "clean is: %d\n",
2398                 dev->device_fh, mbuf_count);
2399         LOG_DEBUG(VHOST_DATA,
2400                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
2401                 "clean  is : %d\n",
2402                 dev->device_fh, rte_ring_count(vpool->ring));
2403
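        /*
         * Each mbuf the PMD has finished transmitting was freed back into
         * vpool->pool; reclaim every one, detach the guest buffer and recycle
         * the bare mbuf onto vpool->ring for reuse.
         */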
2404         for (index = 0; index < mbuf_count; index++) {
2405                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
2406                 if (likely(RTE_MBUF_INDIRECT(mbuf)))
2407                         pktmbuf_detach_zcp(mbuf);
2408                 rte_ring_sp_enqueue(vpool->ring, mbuf);
2409
2410                 /* Update used index buffer information. */
2411                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
2412                 vq->used->ring[used_idx].len = 0;
2413
2414                 used_idx = (used_idx + 1) & (vq->size - 1);
2415         }
2416
2417         LOG_DEBUG(VHOST_DATA,
2418                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
2419                 "clean is: %d\n",
2420                 dev->device_fh, rte_mempool_count(vpool->pool));
2421         LOG_DEBUG(VHOST_DATA,
2422                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
2423                 "clean  is : %d\n",
2424                 dev->device_fh, rte_ring_count(vpool->ring));
2425         LOG_DEBUG(VHOST_DATA,
2426                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
2427                 "vq->last_used_idx:%d\n",
2428                 dev->device_fh, vq->last_used_idx);
2429
2430         vq->last_used_idx += mbuf_count;
2431
2432         LOG_DEBUG(VHOST_DATA,
2433                 "(%"PRIu64") in txmbuf_clean_zcp: after updating "
2434                 "vq->last_used_idx: %d\n",
2435                 dev->device_fh, vq->last_used_idx);
2436
2437         rte_compiler_barrier();
2438
2439         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
2440
2441         /* Kick guest if required. */
2442         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2443                 eventfd_write((int)vq->kickfd, 1);
2444
2445         return 0;
2446 }
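
/*
 * Worked example (illustrative): the "& (vq->size - 1)" arithmetic above is
 * a cheap modulo that relies on the virtio ring size being a power of two.
 * With vq->size == 256, a free-running last_used_idx of 300 lands in used
 * slot 300 & 255 == 44; the 16-bit index may wrap around, but the mask stays
 * correct either way.
 */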
2447
2448 /*
2449  * This function is called when a virtio device is destroyed. It fetches
2450  * each mbuf from vpool->pool, detaches it and puts it into vpool->ring.
2451  */
2452 static void mbuf_destroy_zcp(struct vpool *vpool)
2453 {
2454         struct rte_mbuf *mbuf = NULL;
2455         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
2456
2457         LOG_DEBUG(VHOST_CONFIG,
2458                 "in mbuf_destroy_zcp: mbuf count in mempool before "
2459                 "mbuf_destroy_zcp is: %d\n",
2460                 mbuf_count);
2461         LOG_DEBUG(VHOST_CONFIG,
2462                 "in mbuf_destroy_zcp: mbuf count in ring before "
2463                 "mbuf_destroy_zcp is: %d\n",
2464                 rte_ring_count(vpool->ring));
2465
2466         for (index = 0; index < mbuf_count; index++) {
2467                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
2468                 if (likely(mbuf != NULL)) {
2469                         if (likely(RTE_MBUF_INDIRECT(mbuf)))
2470                                 pktmbuf_detach_zcp(mbuf);
2471                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
2472                 }
2473         }
2474
2475         LOG_DEBUG(VHOST_CONFIG,
2476                 "in mbuf_destroy_zcp: mbuf count in mempool after "
2477                 "mbuf_destroy_zcp is: %d\n",
2478                 rte_mempool_count(vpool->pool));
2479         LOG_DEBUG(VHOST_CONFIG,
2480                 "in mbuf_destroy_zcp: mbuf count in ring after "
2481                 "mbuf_destroy_zcp is: %d\n",
2482                 rte_ring_count(vpool->ring));
2483 }
2484
2485 /*
2486  * This function completes guest RX in zero copy mode: it writes a virtio header per packet, updates the used ring and kicks the guest if necessary.
2487  */
2488 static inline uint32_t __attribute__((always_inline))
2489 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
2490         uint32_t count)
2491 {
2492         struct vhost_virtqueue *vq;
2493         struct vring_desc *desc;
2494         struct rte_mbuf *buff;
2495         /* The virtio_hdr is initialised to 0. */
2496         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
2497                 = {{0, 0, 0, 0, 0, 0}, 0};
2498         uint64_t buff_hdr_addr = 0;
2499         uint32_t head[MAX_PKT_BURST], packet_len = 0;
2500         uint32_t head_idx, packet_success = 0;
2501         uint16_t res_cur_idx;
2502
2503         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
2504
2505         if (count == 0)
2506                 return 0;
2507
2508         vq = dev->virtqueue[VIRTIO_RXQ];
2509         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
2510
2511         res_cur_idx = vq->last_used_idx;
2512         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
2513                 dev->device_fh, res_cur_idx, res_cur_idx + count);
2514
2515         /* Retrieve all of the head indexes first to avoid caching issues. */
2516         for (head_idx = 0; head_idx < count; head_idx++)
2517                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
2518
2519         /* Prefetch descriptor index. */
2520         rte_prefetch0(&vq->desc[head[packet_success]]);
2521
2522         while (packet_success != count) {
2523                 /* Get descriptor from available ring */
2524                 desc = &vq->desc[head[packet_success]];
2525
2526                 buff = pkts[packet_success];
2527                 LOG_DEBUG(VHOST_DATA,
2528                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
2529                         "pkt[%d] descriptor idx: %d\n",
2530                         dev->device_fh, packet_success,
2531                         MBUF_HEADROOM_UINT32(buff));
2532
2533                 PRINT_PACKET(dev,
2534                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
2535                         + RTE_PKTMBUF_HEADROOM),
2536                         rte_pktmbuf_data_len(buff), 0);
2537
2538                 /* Buffer address translation for virtio header. */
2539                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
2540                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
2541
2542                 /*
2543                  * If the descriptors are chained the header and data are
2544                  * placed in separate buffers.
2545                  */
2546                 if (desc->flags & VRING_DESC_F_NEXT) {
2547                         desc->len = vq->vhost_hlen;
2548                         desc = &vq->desc[desc->next];
2549                         desc->len = rte_pktmbuf_data_len(buff);
2550                 } else {
2551                         desc->len = packet_len;
2552                 }
2553
2554                 /* Update used ring with desc information */
2555                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
2556                         = head[packet_success];
2557                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
2558                         = packet_len;
2559                 res_cur_idx++;
2560                 packet_success++;
2561
2562                 /* A header is required per buffer. */
2563                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
2564                         (const void *)&virtio_hdr, vq->vhost_hlen);
2565
2566                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
2567
2568                 if (likely(packet_success < count)) {
2569                         /* Prefetch descriptor index. */
2570                         rte_prefetch0(&vq->desc[head[packet_success]]);
2571                 }
2572         }
2573
2574         rte_compiler_barrier();
2575
2576         LOG_DEBUG(VHOST_DATA,
2577                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
2578                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
2579                 dev->device_fh, vq->last_used_idx, vq->used->idx);
2580
2581         *(volatile uint16_t *)&vq->used->idx += count;
2582         vq->last_used_idx += count;
2583
2584         LOG_DEBUG(VHOST_DATA,
2585                 "(%"PRIu64") in dev_rx_zcp: after update used idx: "
2586                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
2587                 dev->device_fh, vq->last_used_idx, vq->used->idx);
2588
2589         /* Kick the guest if necessary. */
2590         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2591                 eventfd_write((int)vq->kickfd, 1);
2592
2593         return count;
2594 }
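
/*
 * Illustrative sketch (hypothetical helper, not called anywhere): how
 * virtio_dev_rx_zcp() above assigns descriptor lengths. When the guest
 * posts a chained descriptor, the first buffer carries only the virtio
 * header and the second carries the frame data; otherwise a single buffer
 * holds both.
 */
static inline void
example_set_desc_len_zcp(struct vhost_virtqueue *vq, struct vring_desc *desc,
	uint32_t data_len)
{
	if (desc->flags & VRING_DESC_F_NEXT) {
		/* Chained: header buffer first, then the data buffer. */
		desc->len = vq->vhost_hlen;
		vq->desc[desc->next].len = data_len;
	} else {
		/* Single buffer holds the header plus the data. */
		desc->len = data_len + vq->vhost_hlen;
	}
}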
2595
2596 /*
2597  * This function routes the TX packet to the correct interface.
2598  * This may be a local device or the physical port.
2599  */
2600 static inline void __attribute__((always_inline))
2601 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
2602         uint32_t desc_idx, uint8_t need_copy)
2603 {
2604         struct mbuf_table *tx_q;
2605         struct rte_mbuf **m_table;
2606         struct rte_mbuf *mbuf = NULL;
2607         unsigned len, ret, offset = 0;
2608         struct vpool *vpool;
2609         struct virtio_net_data_ll *dev_ll = ll_root_used;
2610         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
2611         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
2612
2613         /* Add packet to the port TX queue. */
2614         tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2615         len = tx_q->len;
2616
2617         /* Allocate an mbuf and populate the structure. */
2618         vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
2619         rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
2620         if (unlikely(mbuf == NULL)) {
2621                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
2622                 RTE_LOG(ERR, VHOST_DATA,
2623                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
2624                         dev->device_fh);
2625                 put_desc_to_used_list_zcp(vq, desc_idx);
2626                 return;
2627         }
2628
2629         if (vm2vm_mode == VM2VM_HARDWARE) {
2630                 /* Avoid using a VLAN tag from any VM for an external packet,
2631                  * such as vlan_tags[dev->device_fh]; otherwise it conflicts
2632                  * during pool selection: the MAC address marks it as an
2633                  * external packet that should go to the network, while the
2634                  * VLAN tag marks it as a VM-to-VM packet for another VM. The
2635                  * hardware cannot resolve this ambiguity, so the packet is lost.
2636                  */
2637                 vlan_tag = external_pkt_default_vlan_tag;
2638                 while (dev_ll != NULL) {
2639                         if (likely(dev_ll->dev->ready == DEVICE_RX) &&
2640                                 ether_addr_cmp(&(pkt_hdr->d_addr),
2641                                 &dev_ll->dev->mac_address)) {
2642
2643                                 /*
2644                                  * Drop the packet if the TX packet is destined
2645                                  * for the TX device.
2646                                  */
2647                                 if (unlikely(dev_ll->dev->device_fh
2648                                         == dev->device_fh)) {
2649                                         LOG_DEBUG(VHOST_DATA,
2650                                         "(%"PRIu64") TX: Source and destination "
2651                                         "MAC addresses are the same. Dropping "
2652                                         "packet.\n",
2653                                         dev_ll->dev->device_fh);
2654                                         MBUF_HEADROOM_UINT32(mbuf)
2655                                                 = (uint32_t)desc_idx;
2656                                         __rte_mbuf_raw_free(mbuf);
2657                                         return;
2658                                 }
2659
2660                                 /*
2661                                  * Add 4 bytes to make up for the VLAN header
2662                                  * the HW strips when L2-switching back.
2663                                  */
2664                                 offset = 4;
2665                                 vlan_tag =
2666                                 (uint16_t)
2667                                 vlan_tags[(uint16_t)dev_ll->dev->device_fh];
2668
2669                                 LOG_DEBUG(VHOST_DATA,
2670                                 "(%"PRIu64") TX: pkt to local VM device id:"
2671                                 "(%"PRIu64") vlan tag: %d.\n",
2672                                 dev->device_fh, dev_ll->dev->device_fh,
2673                                 vlan_tag);
2674
2675                                 break;
2676                         }
2677                         dev_ll = dev_ll->next;
2678                 }
2679         }
2680
2681         mbuf->nb_segs = m->nb_segs;
2682         mbuf->next = m->next;
2683         mbuf->data_len = m->data_len + offset;
2684         mbuf->pkt_len = mbuf->data_len;
2685         if (unlikely(need_copy)) {
2686                 /* Copy the packet contents to the mbuf. */
2687                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
2688                         rte_pktmbuf_mtod(m, void *),
2689                         m->data_len);
2690         } else {
2691                 mbuf->data_off = m->data_off;
2692                 mbuf->buf_physaddr = m->buf_physaddr;
2693                 mbuf->buf_addr = m->buf_addr;
2694         }
2695         mbuf->ol_flags = PKT_TX_VLAN_PKT;
2696         mbuf->vlan_tci = vlan_tag;
2697         mbuf->l2_len = sizeof(struct ether_hdr);
2698         mbuf->l3_len = sizeof(struct ipv4_hdr);
2699         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2700
2701         tx_q->m_table[len] = mbuf;
2702         len++;
2703
2704         LOG_DEBUG(VHOST_DATA,
2705                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
2706                 dev->device_fh,
2707                 mbuf->nb_segs,
2708                 (mbuf->next == NULL) ? "null" : "non-null");
2709
2710         if (enable_stats) {
2711                 dev_statistics[dev->device_fh].tx_total++;
2712                 dev_statistics[dev->device_fh].tx++;
2713         }
2714
2715         if (unlikely(len == MAX_PKT_BURST)) {
2716                 m_table = (struct rte_mbuf **)tx_q->m_table;
2717                 ret = rte_eth_tx_burst(ports[0],
2718                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
2719
2720                 /*
2721                  * Free any buffers not handled by TX and update
2722                  * the port stats.
2723                  */
2724                 if (unlikely(ret < len)) {
2725                         do {
2726                                 rte_pktmbuf_free(m_table[ret]);
2727                         } while (++ret < len);
2728                 }
2729
2730                 len = 0;
2731                 txmbuf_clean_zcp(dev, vpool);
2732         }
2733
2734         tx_q->len = len;
2735
2736         return;
2737 }
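
/*
 * Note (illustrative): MBUF_HEADROOM_UINT32(mbuf) is used above to stash
 * the guest descriptor index in the mbuf headroom. After the NIC has sent
 * the packet, txmbuf_clean_zcp() reads that same word back into
 * vq->used->ring[].id, which returns the descriptor to the guest.
 */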
2738
2739 /*
2740  * This function transmits all available packets in the virtio TX queue of
2741  * one virtio-net device. On the first packet it learns the MAC address and
2742  * sets up the VMDQ queue.
2743  */
2744 static inline void __attribute__((always_inline))
2745 virtio_dev_tx_zcp(struct virtio_net *dev)
2746 {
2747         struct rte_mbuf m;
2748         struct vhost_virtqueue *vq;
2749         struct vring_desc *desc;
2750         uint64_t buff_addr = 0, phys_addr;
2751         uint32_t head[MAX_PKT_BURST];
2752         uint32_t i;
2753         uint16_t free_entries, packet_success = 0;
2754         uint16_t avail_idx;
2755         uint8_t need_copy = 0;
2756         hpa_type addr_type;
2757
2758         vq = dev->virtqueue[VIRTIO_TXQ];
2759         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
2760
2761         /* If there are no available buffers then return. */
2762         if (vq->last_used_idx_res == avail_idx)
2763                 return;
2764
2765         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
2766
2767         /* Prefetch available ring to retrieve head indexes. */
2768         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
2769
2770         /* Get the number of free entries in the ring */
2771         free_entries = (avail_idx - vq->last_used_idx_res);
2772
2773         /* Limit to MAX_PKT_BURST. */
2774         free_entries
2775                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2776
2777         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
2778                 dev->device_fh, free_entries);
2779
2780         /* Retrieve all of the head indexes first to avoid caching issues. */
2781         for (i = 0; i < free_entries; i++)
2782                 head[i]
2783                         = vq->avail->ring[(vq->last_used_idx_res + i)
2784                         & (vq->size - 1)];
2785
2786         vq->last_used_idx_res += free_entries;
2787
2788         /* Prefetch descriptor index. */
2789         rte_prefetch0(&vq->desc[head[packet_success]]);
2790         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2791
2792         while (packet_success < free_entries) {
2793                 desc = &vq->desc[head[packet_success]];
2794
2795                 /* Discard first buffer as it is the virtio header */
2796                 desc = &vq->desc[desc->next];
2797
2798                 /* Buffer address translation. */
2799                 buff_addr = gpa_to_vva(dev, desc->addr);
2800                 phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);
2801
2802                 if (likely(packet_success < (free_entries - 1)))
2803                         /* Prefetch descriptor index. */
2804                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2805
2806                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2807                         RTE_LOG(ERR, VHOST_DATA,
2808                                 "(%"PRIu64") Invalid frame buffer address found "
2809                                 "when transmitting packets!\n",
2810                                 dev->device_fh);
2811                         packet_success++;
2812                         continue;
2813                 }
2814
2815                 /* Prefetch buffer address. */
2816                 rte_prefetch0((void *)(uintptr_t)buff_addr);
2817
2818                 /*
2819                  * Set up a dummy mbuf. It is copied to a real mbuf if
2820                  * transmitted out of the physical port.
2821                  */
2822                 m.data_len = desc->len;
2823                 m.nb_segs = 1;
2824                 m.next = NULL;
2825                 m.data_off = 0;
2826                 m.buf_addr = (void *)(uintptr_t)buff_addr;
2827                 m.buf_physaddr = phys_addr;
2828
2829                 /*
2830                  * Check if the frame buffer address from guest crosses
2831                  * sub-region or not.
2832                  */
2833                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2834                         RTE_LOG(ERR, VHOST_DATA,
2835                                 "(%"PRIu64") Frame buffer address crossing a "
2836                                 "sub-region found when attaching TX frame "
2837                                 "buffer address!\n",
2838                                 dev->device_fh);
2839                         need_copy = 1;
2840                 } else
2841                         need_copy = 0;
2842
2843                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2844
2845                 /*
2846                  * If this is the first received packet we need to learn
2847                  * the MAC and setup VMDQ
2848                  */
2849                 if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) {
2850                         if (dev->remove || (link_vmdq(dev, &m) == -1)) {
2851                                 /*
2852                                  * Discard frame if device is scheduled for
2853                                  * removal or a duplicate MAC address is found.
2854                                  */
2855                                 packet_success += free_entries;
2856                                 vq->last_used_idx += packet_success;
2857                                 break;
2858                         }
2859                 }
2860
2861                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2862                 packet_success++;
2863         }
2864 }
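
/*
 * Note (illustrative): the dummy mbuf set up in virtio_dev_tx_zcp() lives on
 * the stack and only describes the guest buffer (buf_addr/buf_physaddr point
 * into guest memory and data_off is 0). virtio_tx_route_zcp() either copies
 * its contents into a real mbuf (need_copy) or steals the address fields for
 * true zero copy; the stack object itself is never freed to a mempool.
 */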
2865
2866 /*
2867  * This function is called by each data core. It handles all RX/TX registered
2868  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2869  * addresses are compared with all devices in the main linked list.
2870  */
2871 static int
2872 switch_worker_zcp(__attribute__((unused)) void *arg)
2873 {
2874         struct virtio_net *dev = NULL;
2875         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2876         struct virtio_net_data_ll *dev_ll;
2877         struct mbuf_table *tx_q;
2878         volatile struct lcore_ll_info *lcore_ll;
2879         const uint64_t drain_tsc
2880                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2881                 * BURST_TX_DRAIN_US;
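
        /*
         * Worked example (illustrative): with a 2 GHz TSC the expression
         * above rounds up to (2e9 + 1e6 - 1) / 1e6 == 2000 cycles per
         * microsecond, so drain_tsc is 2000 * BURST_TX_DRAIN_US cycles
         * between forced TX queue drains.
         */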
2882         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2883         unsigned ret;
2884         const uint16_t lcore_id = rte_lcore_id();
2885         uint16_t count_in_ring, rx_count = 0;
2886
2887         RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2888
2889         lcore_ll = lcore_info[lcore_id].lcore_ll;
2890         prev_tsc = 0;
2891
2892         while (1) {
2893                 cur_tsc = rte_rdtsc();
2894
2895                 /* TX burst queue drain */
2896                 diff_tsc = cur_tsc - prev_tsc;
2897                 if (unlikely(diff_tsc > drain_tsc)) {
2898                         /*
2899                          * Get mbufs from vpool.pool, detach them and
2900                          * put them back into vpool.ring.
2901                          */
2902                         dev_ll = lcore_ll->ll_root_used;
2903                         while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2904                                 /* Get virtio device ID */
2905                                 dev = dev_ll->dev;
2906
2907                                 if (likely(!dev->remove)) {
2908                                         tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2909                                         if (tx_q->len) {
2910                                                 LOG_DEBUG(VHOST_DATA,
2911                                                 "TX queue drained after timeout"
2912                                                 " with burst size %u\n",
2913                                                 tx_q->len);
2914
2915                                                 /*
2916                                                  * Tx any packets in the queue
2917                                                  */
2918                                                 ret = rte_eth_tx_burst(
2919                                                         ports[0],
2920                                                         (uint16_t)tx_q->txq_id,
2921                                                         (struct rte_mbuf **)
2922                                                         tx_q->m_table,
2923                                                         (uint16_t)tx_q->len);
2924                                                 if (unlikely(ret < tx_q->len)) {
2925                                                         do {
2926                                                                 rte_pktmbuf_free(
2927                                                                         tx_q->m_table[ret]);
2928                                                         } while (++ret < tx_q->len);
2929                                                 }
2930                                                 tx_q->len = 0;
2931
2932                                                 txmbuf_clean_zcp(dev,
2933                                                         &vpool_array[MAX_QUEUES+dev->vmdq_rx_q]);
2934                                         }
2935                                 }
2936                                 dev_ll = dev_ll->next;
2937                         }
2938                         prev_tsc = cur_tsc;
2939                 }
2940
2941                 rte_prefetch0(lcore_ll->ll_root_used);
2942
2943                 /*
2944                  * Inform the configuration core that we have exited the linked
2945                  * list and that no devices are in use if requested.
2946                  */
2947                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2948                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2949
2950                 /* Process devices */
2951                 dev_ll = lcore_ll->ll_root_used;
2952
2953                 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2954                         dev = dev_ll->dev;
2955                         if (unlikely(dev->remove)) {
2956                                 dev_ll = dev_ll->next;
2957                                 unlink_vmdq(dev);
2958                                 dev->ready = DEVICE_SAFE_REMOVE;
2959                                 continue;
2960                         }
2961
2962                         if (likely(dev->ready == DEVICE_RX)) {
2963                                 uint32_t index = dev->vmdq_rx_q;
2964                                 uint16_t i;
2965                                 count_in_ring
2966                                 = rte_ring_count(vpool_array[index].ring);
2967                                 uint16_t free_entries
2968                                 = (uint16_t)get_available_ring_num_zcp(dev);
2969
2970                                 /*
2971                                  * Attach all mbufs in vpool.ring and put back
2972                                  * into vpool.pool.
2973                                  */
2974                                 for (i = 0;
2975                                 i < RTE_MIN(free_entries,
2976                                 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2977                                 i++)
2978                                         attach_rxmbuf_zcp(dev);
2979
2980                                 /* Handle guest RX */
2981                                 rx_count = rte_eth_rx_burst(ports[0],
2982                                         (uint16_t)dev->vmdq_rx_q, pkts_burst,
2983                                         MAX_PKT_BURST);
2984
2985                                 if (rx_count) {
2986                                         ret_count = virtio_dev_rx_zcp(dev,
2987                                                         pkts_burst, rx_count);
2988                                         if (enable_stats) {
2989                                                 dev_statistics[dev->device_fh].rx_total
2990                                                         += rx_count;
2991                                                 dev_statistics[dev->device_fh].rx
2992                                                         += ret_count;
2993                                         }
2994                                         while (likely(rx_count)) {
2995                                                 rx_count--;
2996                                                 pktmbuf_detach_zcp(
2997                                                         pkts_burst[rx_count]);
2998                                                 rte_ring_sp_enqueue(
2999                                                         vpool_array[index].ring,
3000                                                         (void *)pkts_burst[rx_count]);
3001                                         }
3002                                 }
3003                         }
3004
3005                         if (likely(!dev->remove))
3006                                 /* Handle guest TX */
3007                                 virtio_dev_tx_zcp(dev);
3008
3009                         /* Move to the next device in the list */
3010                         dev_ll = dev_ll->next;
3011                 }
3012         }
3013
3014         return 0;
3015 }
3016
3017
3018 /*
3019  * Add an entry to a used linked list. A free entry must first be found
3020  * in the free linked list using get_data_ll_free_entry().
3021  */
3022 static void
3023 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
3024         struct virtio_net_data_ll *ll_dev)
3025 {
3026         struct virtio_net_data_ll *ll = *ll_root_addr;
3027
3028         /* Set next as NULL and use a compiler barrier to avoid reordering. */
3029         ll_dev->next = NULL;
3030         rte_compiler_barrier();
3031
3032         /* If ll == NULL then this is the first device. */
3033         if (ll) {
3034                 /* Increment to the tail of the linked list. */
3035                 while (ll->next != NULL)
3036                         ll = ll->next;
3037
3038                 ll->next = ll_dev;
3039         } else {
3040                 *ll_root_addr = ll_dev;
3041         }
3042 }
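
/*
 * Typical usage (taken from new_device() below): pull an entry off a free
 * list, fill it in, then publish it with add_data_ll_entry(). The compiler
 * barrier above guarantees ll_dev->next is NULL before the entry becomes
 * visible to the data cores.
 *
 *     ll_dev = get_data_ll_free_entry(&ll_root_free);
 *     if (ll_dev != NULL) {
 *             ll_dev->dev = dev;
 *             add_data_ll_entry(&ll_root_used, ll_dev);
 *     }
 */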
3043
3044 /*
3045  * Remove an entry from a used linked list. The entry must then be added to
3046  * the free linked list using put_data_ll_free_entry().
3047  */
3048 static void
3049 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
3050         struct virtio_net_data_ll *ll_dev,
3051         struct virtio_net_data_ll *ll_dev_last)
3052 {
3053         struct virtio_net_data_ll *ll = *ll_root_addr;
3054
3055         if (unlikely((ll == NULL) || (ll_dev == NULL)))
3056                 return;
3057
3058         if (ll_dev == ll)
3059                 *ll_root_addr = ll_dev->next;
3060         else
3061                 if (likely(ll_dev_last != NULL))
3062                         ll_dev_last->next = ll_dev->next;
3063                 else
3064                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry form ll failed.\n");
3065 }
3066
3067 /*
3068  * Find and return an entry from the free linked list.
3069  */
3070 static struct virtio_net_data_ll *
3071 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
3072 {
3073         struct virtio_net_data_ll *ll_free = *ll_root_addr;
3074         struct virtio_net_data_ll *ll_dev;
3075
3076         if (ll_free == NULL)
3077                 return NULL;
3078
3079         ll_dev = ll_free;
3080         *ll_root_addr = ll_free->next;
3081
3082         return ll_dev;
3083 }
3084
3085 /*
3086  * Place an entry back on to the free linked list.
3087  */
3088 static void
3089 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
3090         struct virtio_net_data_ll *ll_dev)
3091 {
3092         struct virtio_net_data_ll *ll_free = *ll_root_addr;
3093
3094         if (ll_dev == NULL)
3095                 return;
3096
3097         ll_dev->next = ll_free;
3098         *ll_root_addr = ll_dev;
3099 }
3100
3101 /*
3102  * Creates a linked list of a given size.
3103  */
3104 static struct virtio_net_data_ll *
3105 alloc_data_ll(uint32_t size)
3106 {
3107         struct virtio_net_data_ll *ll_new;
3108         uint32_t i;
3109
3110         /* Malloc and then chain the linked list. */
3111         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
3112         if (ll_new == NULL) {
3113                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
3114                 return NULL;
3115         }
3116
3117         for (i = 0; i < size - 1; i++) {
3118                 ll_new[i].dev = NULL;
3119                 ll_new[i].next = &ll_new[i+1];
3120         }
3121         ll_new[i].next = NULL;
3122
3123         return ll_new;
3124 }
3125
3126 /*
3127  * Create the main linked list along with each individual core's linked list.
3128  * A used and a free list are created to manage entries.
3129  */
3130 static int
3131 init_data_ll(void)
3132 {
3133         int lcore;
3134
3135         RTE_LCORE_FOREACH_SLAVE(lcore) {
3136                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
3137                 if (lcore_info[lcore].lcore_ll == NULL) {
3138                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
3139                         return -1;
3140                 }
3141
3142                 lcore_info[lcore].lcore_ll->device_num = 0;
3143                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
3144                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
3145                 if (num_devices % num_switching_cores)
3146                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
3147                 else
3148                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
3149         }
3150
3151         /* Allocate devices up to a maximum of MAX_DEVICES. */
3152         ll_root_free = alloc_data_ll(MIN(num_devices, MAX_DEVICES));
3153
3154         return 0;
3155 }
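
/*
 * Worked example (illustrative): the per-core free list above is sized by
 * ceiling division. With num_devices == 10 and num_switching_cores == 3,
 * 10 % 3 != 0, so each core gets alloc_data_ll(10 / 3 + 1) == 4 entries,
 * enough for any distribution of the 10 devices across the 3 cores.
 */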
3156
3157 /*
3158  * Set virtqueue flags so that we do not receive interrupts.
3159  */
3160 static void
3161 set_irq_status(struct virtio_net *dev)
3162 {
3163         dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
3164         dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
3165 }
3166
3167 /*
3168  * Remove a device from the specific data core linked list and from the main
3169  * linked list. Synchronization occurs through the lcore dev_removal_flag.
3170  * The device is made volatile to avoid re-ordering of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
3171  */
3172 static void
3173 destroy_device(volatile struct virtio_net *dev)
3174 {
3175         struct virtio_net_data_ll *ll_lcore_dev_cur;
3176         struct virtio_net_data_ll *ll_main_dev_cur;
3177         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
3178         struct virtio_net_data_ll *ll_main_dev_last = NULL;
3179         int lcore;
3180
3181         dev->flags &= ~VIRTIO_DEV_RUNNING;
3182
3183         /* Set the remove flag. */
3184         dev->remove = 1;
3185
3186         while (dev->ready != DEVICE_SAFE_REMOVE) {
3187                 rte_pause();
3188         }
3189
3190         /* Search for entry to be removed from lcore ll */
3191         ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
3192         while (ll_lcore_dev_cur != NULL) {
3193                 if (ll_lcore_dev_cur->dev == dev) {
3194                         break;
3195                 } else {
3196                         ll_lcore_dev_last = ll_lcore_dev_cur;
3197                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
3198                 }
3199         }
3200
3201         if (ll_lcore_dev_cur == NULL) {
3202                 RTE_LOG(ERR, VHOST_CONFIG,
3203                         "(%"PRIu64") Failed to find the device to be destroyed.\n",
3204                         dev->device_fh);
3205                 return;
3206         }
3207
3208         /* Search for entry to be removed from main ll */
3209         ll_main_dev_cur = ll_root_used;
3210         ll_main_dev_last = NULL;
3211         while (ll_main_dev_cur != NULL) {
3212                 if (ll_main_dev_cur->dev == dev) {
3213                         break;
3214                 } else {
3215                         ll_main_dev_last = ll_main_dev_cur;
3216                         ll_main_dev_cur = ll_main_dev_cur->next;
3217                 }
3218         }
3219
3220         /* Remove entries from the lcore and main ll. */
3221         rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
3222         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
3223
3224         /* Set the dev_removal_flag on each lcore. */
3225         RTE_LCORE_FOREACH_SLAVE(lcore) {
3226                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
3227         }
3228
3229         /*
3230          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
3231          * they can no longer access the device removed from the linked lists and that the devices
3232          * are no longer in use.
3233          */
3234         RTE_LCORE_FOREACH_SLAVE(lcore) {
3235                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
3236                         rte_pause();
3237                 }
3238         }
3239
3240         /* Add the entries back to the lcore and main free ll.*/
3241         put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
3242         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
3243
3244         /* Decrement number of device on the lcore. */
3245         lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
3246
3247         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
3248
3249         if (zero_copy) {
3250                 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3251
3252                 /* Stop the RX queue. */
3253                 if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
3254                         LOG_DEBUG(VHOST_CONFIG,
3255                                 "(%"PRIu64") In destroy_device: Failed to stop "
3256                                 "rx queue:%d\n",
3257                                 dev->device_fh,
3258                                 dev->vmdq_rx_q);
3259                 }
3260
3261                 LOG_DEBUG(VHOST_CONFIG,
3262                         "(%"PRIu64") in destroy_device: Start put mbuf in "
3263                         "mempool back to ring for RX queue: %d\n",
3264                         dev->device_fh, dev->vmdq_rx_q);
3265
3266                 mbuf_destroy_zcp(vpool);
3267
3268                 /* Stop the TX queue. */
3269                 if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
3270                         LOG_DEBUG(VHOST_CONFIG,
3271                                 "(%"PRIu64") In destroy_device: Failed to "
3272                                 "stop tx queue:%d\n",
3273                                 dev->device_fh, dev->vmdq_rx_q);
3274                 }
3275
3276                 vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES];
3277
3278                 LOG_DEBUG(VHOST_CONFIG,
3279                         "(%"PRIu64") destroy_device: Start put mbuf in mempool "
3280                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
3281                         dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES),
3282                         dev->device_fh);
3283
3284                 mbuf_destroy_zcp(vpool);
3285         }
3286
3287 }
3288
3289 /*
3290  * A new device is added to a data core. First the device is added to the
3291  * main linked list and then allocated to a specific data core.
3292  */
3293 static int
3294 new_device(struct virtio_net *dev)
3295 {
3296         struct virtio_net_data_ll *ll_dev;
3297         int lcore, core_add = 0;
3298         uint32_t device_num_min = num_devices;
3299
3300         /* Add device to main ll */
3301         ll_dev = get_data_ll_free_entry(&ll_root_free);
3302         if (ll_dev == NULL) {
3303                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
3304                         "of %d devices per core has been reached\n",
3305                         dev->device_fh, num_devices);
3306                 return -1;
3307         }
3308         ll_dev->dev = dev;
3309         add_data_ll_entry(&ll_root_used, ll_dev);
3310         ll_dev->dev->vmdq_rx_q
3311                 = ll_dev->dev->device_fh * (num_queues / num_devices);
3312
3313         if (zero_copy) {
3314                 uint32_t index = ll_dev->dev->vmdq_rx_q;
3315                 uint32_t count_in_ring, i;
3316                 struct mbuf_table *tx_q;
3317
3318                 count_in_ring = rte_ring_count(vpool_array[index].ring);
3319
3320                 LOG_DEBUG(VHOST_CONFIG,
3321                         "(%"PRIu64") in new_device: mbuf count in mempool "
3322                         "before attach is: %d\n",
3323                         dev->device_fh,
3324                         rte_mempool_count(vpool_array[index].pool));
3325                 LOG_DEBUG(VHOST_CONFIG,
3326                         "(%"PRIu64") in new_device: mbuf count in ring "
3327                         "before attach is: %d\n",
3328                         dev->device_fh, count_in_ring);
3329
3330                 /*
3331                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
3332                  */
3333                 for (i = 0; i < count_in_ring; i++)
3334                         attach_rxmbuf_zcp(dev);
3335
3336                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
3337                         "mempool after attach is: %d\n",
3338                         dev->device_fh,
3339                         rte_mempool_count(vpool_array[index].pool));
3340                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
3341                         "ring after attach is: %d\n",
3342                         dev->device_fh,
3343                         rte_ring_count(vpool_array[index].ring));
3344
3345                 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
3346                 tx_q->txq_id = dev->vmdq_rx_q;
3347
3348                 if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
3349                         struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3350
3351                         LOG_DEBUG(VHOST_CONFIG,
3352                                 "(%"PRIu64") In new_device: Failed to start "
3353                                 "tx queue:%d\n",
3354                                 dev->device_fh, dev->vmdq_rx_q);
3355
3356                         mbuf_destroy_zcp(vpool);
3357                         return -1;
3358                 }
3359
3360                 if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
3361                         struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3362
3363                         LOG_DEBUG(VHOST_CONFIG,
3364                                 "(%"PRIu64") In new_device: Failed to start "
3365                                 "rx queue:%d\n",
3366                                 dev->device_fh, dev->vmdq_rx_q);
3367
3368                         /* Stop the TX queue. */
3369                         if (rte_eth_dev_tx_queue_stop(ports[0],
3370                                 dev->vmdq_rx_q) != 0) {
3371                                 LOG_DEBUG(VHOST_CONFIG,
3372                                         "(%"PRIu64") In new_device: Failed to "
3373                                         "stop tx queue:%d\n",
3374                                         dev->device_fh, dev->vmdq_rx_q);
3375                         }
3376
3377                         mbuf_destroy_zcp(vpool);
3378                         return -1;
3379                 }
3380
3381         }
3382
3383         /* Reset the ready flag. */
3384         dev->ready = DEVICE_MAC_LEARNING;
3385         dev->remove = 0;
3386
3387         /* Find a suitable lcore to add the device. */
3388         RTE_LCORE_FOREACH_SLAVE(lcore) {
3389                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
3390                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
3391                         core_add = lcore;
3392                 }
3393         }
3394         /* Add device to lcore ll */
3395         ll_dev->dev->coreid = core_add;
3396         ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
3397         if (ll_dev == NULL) {
3398                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
3399                 dev->ready = DEVICE_SAFE_REMOVE;
3400                 destroy_device(dev);
3401                 return -1;
3402         }
3403         ll_dev->dev = dev;
3404         add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
3405
3406         /* Initialize device stats */
3407         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
3408
3409         /* Disable notifications. */
3410         set_irq_status(dev);
3411         lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
3412         dev->flags |= VIRTIO_DEV_RUNNING;
3413
3414         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
3415
3416         return 0;
3417 }
3418
3419 /*
3420  * These callbacks allow devices to be added to the data core once
3421  * configuration is fully complete.
3422  */
3423 static const struct virtio_net_device_ops virtio_net_device_ops =
3424 {
3425         .new_device = new_device,
3426         .destroy_device = destroy_device,
3427 };
3428
3429 /*
3430  * This thread wakes up periodically and prints statistics if the user has
3431  * enabled them.
3432  */
3433 static void
3434 print_stats(void)
3435 {
3436         struct virtio_net_data_ll *dev_ll;
3437         uint64_t tx_dropped, rx_dropped;
3438         uint64_t tx, tx_total, rx, rx_total;
3439         uint32_t device_fh;
3440         const char clr[] = { 27, '[', '2', 'J', '\0' };
3441         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
3442
3443         while (1) {
3444                 sleep(enable_stats);
3445
3446                 /* Clear screen and move to top left */
3447                 printf("%s%s", clr, top_left);
3448
3449                 printf("\nDevice statistics ====================================");
3450
3451                 dev_ll = ll_root_used;
3452                 while (dev_ll != NULL) {
3453                         device_fh = (uint32_t)dev_ll->dev->device_fh;
3454                         tx_total = dev_statistics[device_fh].tx_total;
3455                         tx = dev_statistics[device_fh].tx;
3456                         tx_dropped = tx_total - tx;
3457                         if (zero_copy == 0) {
3458                                 rx_total = rte_atomic64_read(
3459                                         &dev_statistics[device_fh].rx_total_atomic);
3460                                 rx = rte_atomic64_read(
3461                                         &dev_statistics[device_fh].rx_atomic);
3462                         } else {
3463                                 rx_total = dev_statistics[device_fh].rx_total;
3464                                 rx = dev_statistics[device_fh].rx;
3465                         }
3466                         rx_dropped = rx_total - rx;
3467
3468                         printf("\nStatistics for device %"PRIu32" ------------------------------"
3469                                         "\nTX total:            %"PRIu64""
3470                                         "\nTX dropped:          %"PRIu64""
3471                                         "\nTX successful:       %"PRIu64""
3472                                         "\nRX total:            %"PRIu64""
3473                                         "\nRX dropped:          %"PRIu64""
3474                                         "\nRX successful:       %"PRIu64"",
3475                                         device_fh,
3476                                         tx_total,
3477                                         tx_dropped,
3478                                         tx,
3479                                         rx_total,
3480                                         rx_dropped,
3481                                         rx);
3482
3483                         dev_ll = dev_ll->next;
3484                 }
3485                 printf("\n======================================================\n");
3486         }
3487 }
3488
3489 static void
3490 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
3491         char *ring_name, uint32_t nb_mbuf)
3492 {
3493         uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
3494         vpool_array[index].pool
3495                 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
3496                 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
3497                 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
3498                 rte_pktmbuf_init, NULL, socket, 0);
3499         if (vpool_array[index].pool != NULL) {
3500                 vpool_array[index].ring
3501                         = rte_ring_create(ring_name,
3502                                 rte_align32pow2(nb_mbuf + 1),
3503                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
3504                 if (likely(vpool_array[index].ring != NULL)) {
3505                         LOG_DEBUG(VHOST_CONFIG,
3506                                 "in setup_mempool_tbl: mbuf count in "
3507                                 "mempool is: %d\n",
3508                                 rte_mempool_count(vpool_array[index].pool));
3509                         LOG_DEBUG(VHOST_CONFIG,
3510                                 "in setup_mempool_tbl: mbuf count in "
3511                                 "ring is: %d\n",
3512                                 rte_ring_count(vpool_array[index].ring));
3513                 } else {
3514                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
3515                                 ring_name);
3516                 }
3517
3518                 /* Need to consider the headroom. */
3519                 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
3520         } else {
3521                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
3522         }
3523 }
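
/*
 * Worked example (illustrative, assuming the default RTE_PKTMBUF_HEADROOM of
 * 128 bytes): roomsize == 1518 + 128 == 1646, so each mempool element
 * reserves 1646 bytes of room behind the mbuf header, and buf_size ==
 * 1646 - 128 == 1518 bytes of frame data, exactly VIRTIO_DESCRIPTOR_LEN_ZCP,
 * one maximum-sized guest descriptor.
 */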
3524
3525
3526 /*
3527  * Main function, does initialisation and calls the per-lcore functions. The CUSE
3528  * device is also registered here to handle the IOCTLs.
3529  */
3530 int
3531 MAIN(int argc, char *argv[])
3532 {
3533         struct rte_mempool *mbuf_pool = NULL;
3534         unsigned lcore_id, core_id = 0;
3535         unsigned nb_ports, valid_num_ports;
3536         int ret;
3537         uint8_t portid, queue_id = 0;
3538         static pthread_t tid;
3539
3540         /* init EAL */
3541         ret = rte_eal_init(argc, argv);
3542         if (ret < 0)
3543                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
3544         argc -= ret;
3545         argv += ret;
3546
3547         /* parse app arguments */
3548         ret = us_vhost_parse_args(argc, argv);
3549         if (ret < 0)
3550                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
3551
3552         if (rte_eal_pci_probe() != 0)
3553                 rte_exit(EXIT_FAILURE, "Error with NIC driver initialization\n");
3554
3555         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
3556                 if (rte_lcore_is_enabled(lcore_id))
3557                         lcore_ids[core_id++] = lcore_id;
3558
3559         if (rte_lcore_count() > RTE_MAX_LCORE)
3560                 rte_exit(EXIT_FAILURE,"Not enough cores\n");
3561
3562         /* Set the number of switching cores available. */
3563         num_switching_cores = rte_lcore_count()-1;
3564
3565         /* Get the number of physical ports. */
3566         nb_ports = rte_eth_dev_count();
3567         if (nb_ports > RTE_MAX_ETHPORTS)
3568                 nb_ports = RTE_MAX_ETHPORTS;
3569
3570         /*
3571          * Update the global var NUM_PORTS and the global array PORTS, and
3572          * get the value of VALID_NUM_PORTS according to the number of system ports.
3573          */
3574         valid_num_ports = check_ports_num(nb_ports);
3575
3576         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3577                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
3578                         " but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
3579                 return -1;
3580         }
3581
3582         if (zero_copy == 0) {
3583                 /* Create the mbuf pool. */
3584                 mbuf_pool = rte_mempool_create(
3585                                 "MBUF_POOL",
3586                                 NUM_MBUFS_PER_PORT
3587                                 * valid_num_ports,
3588                                 MBUF_SIZE, MBUF_CACHE_SIZE,
3589                                 sizeof(struct rte_pktmbuf_pool_private),
3590                                 rte_pktmbuf_pool_init, NULL,
3591                                 rte_pktmbuf_init, NULL,
3592                                 rte_socket_id(), 0);
3593                 if (mbuf_pool == NULL)
3594                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3595
3596                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3597                         vpool_array[queue_id].pool = mbuf_pool;
3598
3599                 if (vm2vm_mode == VM2VM_HARDWARE) {
3600                         /* Enable VT loopback so the L2 switch does the forwarding. */
3601                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3602                         LOG_DEBUG(VHOST_CONFIG,
3603                                 "Enable loop back for L2 switch in vmdq.\n");
3604                 }
3605         } else {
3606                 uint32_t nb_mbuf;
3607                 char pool_name[RTE_MEMPOOL_NAMESIZE];
3608                 char ring_name[RTE_MEMPOOL_NAMESIZE];
3609
3610                 rx_conf_default.start_rx_per_q = (uint8_t)zero_copy;
3611                 rx_conf_default.rx_drop_en = 0;
3612                 tx_conf_default.start_tx_per_q = (uint8_t)zero_copy;
3613                 nb_mbuf = num_rx_descriptor
3614                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3615                         + num_switching_cores * MAX_PKT_BURST;
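
                /*
                 * Worked example (illustrative; the numbers are assumptions):
                 * with MBUF_CACHE_SIZE_ZCP == 0, num_rx_descriptor == 128,
                 * num_switching_cores == 2 and MAX_PKT_BURST == 32, each RX
                 * vpool is sized for 128 + 0 + 64 == 192 mbufs.
                 */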
3616
3617                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3618                         snprintf(pool_name, sizeof(pool_name),
3619                                 "rxmbuf_pool_%u", queue_id);
3620                         snprintf(ring_name, sizeof(ring_name),
3621                                 "rxmbuf_ring_%u", queue_id);
3622                         setup_mempool_tbl(rte_socket_id(), queue_id,
3623                                 pool_name, ring_name, nb_mbuf);
3624                 }
3625
3626                 nb_mbuf = num_tx_descriptor
3627                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3628                                 + num_switching_cores * MAX_PKT_BURST;
3629
3630                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3631                         snprintf(pool_name, sizeof(pool_name),
3632                                 "txmbuf_pool_%u", queue_id);
3633                         snprintf(ring_name, sizeof(ring_name),
3634                                 "txmbuf_ring_%u", queue_id);
3635                         setup_mempool_tbl(rte_socket_id(),
3636                                 (queue_id + MAX_QUEUES),
3637                                 pool_name, ring_name, nb_mbuf);
3638                 }
3639
3640                 if (vm2vm_mode == VM2VM_HARDWARE) {
3641                         /* Enable VT loopback so the L2 switch does the forwarding. */
3642                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3643                         LOG_DEBUG(VHOST_CONFIG,
3644                                 "Enable loop back for L2 switch in vmdq.\n");
3645                 }
3646         }
3647         /* Set log level. */
3648         rte_set_log_level(LOG_LEVEL);
3649
3650         /* initialize all ports */
3651         for (portid = 0; portid < nb_ports; portid++) {
3652                 /* skip ports that are not enabled */
3653                 if ((enabled_port_mask & (1 << portid)) == 0) {
3654                         RTE_LOG(INFO, VHOST_PORT,
3655                                 "Skipping disabled port %d\n", portid);
3656                         continue;
3657                 }
3658                 if (port_init(portid) != 0)
3659                         rte_exit(EXIT_FAILURE,
3660                                 "Cannot initialize network ports\n");
3661         }
3662
3663         /* Initialise all linked lists. */
3664         if (init_data_ll() == -1)
3665                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3666
3667         /* Initialize device stats */
3668         memset(&dev_statistics, 0, sizeof(dev_statistics));
3669
3670         /* Enable stats if the user option is set. */
3671         if (enable_stats)
3672                 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3673
3674         /* Launch all data cores. */
3675         if (zero_copy == 0) {
3676                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3677                         rte_eal_remote_launch(switch_worker,
3678                                 mbuf_pool, lcore_id);
3679                 }
3680         } else {
3681                 uint32_t count_in_mempool, index, i;
3682                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3683                         /* For all RX and TX queues. */
3684                         count_in_mempool
3685                                 = rte_mempool_count(vpool_array[index].pool);
3686
3687                         /*
3688                          * Transfer all un-attached mbufs from vpool.pool
3689                          * to vpool.ring.
3690                          */
3691                         for (i = 0; i < count_in_mempool; i++) {
3692                                 struct rte_mbuf *mbuf
3693                                         = __rte_mbuf_raw_alloc(
3694                                                 vpool_array[index].pool);
3695                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3696                                                 (void *)mbuf);
3697                         }
3698
3699                         LOG_DEBUG(VHOST_CONFIG,
3700                                 "in MAIN: mbuf count in mempool at initial "
3701                                 "is: %d\n", count_in_mempool);
3702                         LOG_DEBUG(VHOST_CONFIG,
3703                                 "in MAIN: mbuf count in ring at initial is:"
3704                                 " %d\n",
3705                                 rte_ring_count(vpool_array[index].ring));
3706                 }
3707
3708                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3709                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3710                                 lcore_id);
3711         }
3712
3713         /* Register CUSE device to handle IOCTLs. */
3714         ret = register_cuse_device((char *)&dev_basename, dev_index, get_virtio_net_callbacks());
3715         if (ret != 0)
3716                 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3717
3718         init_virtio_net(&virtio_net_device_ops);
3719
3720         /* Start CUSE session. */
3721         start_cuse_session_loop();
3722         return 0;
3723
3724 }
3725