examples/vhost: remove unnecessary pseudo checksum calculation
[dpdk.git] / examples / vhost / main.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55
56 #include "main.h"
57
58 #ifndef MAX_QUEUES
59 #define MAX_QUEUES 128
60 #endif
61
62 /* the maximum number of external ports supported */
63 #define MAX_SUP_PORTS 1
64
65 /*
66  * Calculate the number of buffers needed per port
67  */
68 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES * RTE_TEST_RX_DESC_DEFAULT) +        \
69                             (num_switching_cores * MAX_PKT_BURST) +           \
70                             (num_switching_cores * RTE_TEST_TX_DESC_DEFAULT) + \
71                             ((num_switching_cores + 1) * MBUF_CACHE_SIZE))
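/*
 * A worked sizing example (illustrative only, assuming two switching cores):
 * 128 queues * 1024 RX descriptors = 131072 mbufs for the NIC RX rings,
 * plus 2 * 32 mbufs for in-flight bursts, 2 * 512 for the TX rings and
 * (2 + 1) * 128 parked in mempool caches, i.e. roughly 132.5K mbufs per port.
 */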
72
73 #define MBUF_CACHE_SIZE 128
74 #define MBUF_DATA_SIZE  RTE_MBUF_DEFAULT_BUF_SIZE
75
76 /*
77  * No frame data buffers allocated from the host are required for the zero
78  * copy implementation: the guest allocates the frame data buffers and
79  * vhost uses them directly.
80  */
81 #define VIRTIO_DESCRIPTOR_LEN_ZCP       RTE_MBUF_DEFAULT_DATAROOM
82 #define MBUF_DATA_SIZE_ZCP              RTE_MBUF_DEFAULT_BUF_SIZE
83 #define MBUF_CACHE_SIZE_ZCP 0
84
85 #define MAX_PKT_BURST 32                /* Max burst size for RX/TX */
86 #define BURST_TX_DRAIN_US 100   /* TX drain every ~100us */
87
88 #define BURST_RX_WAIT_US 15     /* Defines how long we wait between retries on RX */
89 #define BURST_RX_RETRIES 4              /* Number of retries on RX. */
90
91 #define JUMBO_FRAME_MAX_SIZE    0x2600  /* 9728 bytes */
92
93 /* State of virtio device. */
94 #define DEVICE_MAC_LEARNING 0
95 #define DEVICE_RX                       1
96 #define DEVICE_SAFE_REMOVE      2
97
98 /* Config_core_flag status definitions. */
99 #define REQUEST_DEV_REMOVAL 1
100 #define ACK_DEV_REMOVAL 0
101
102 /* Configurable number of RX/TX ring descriptors */
103 #define RTE_TEST_RX_DESC_DEFAULT 1024
104 #define RTE_TEST_TX_DESC_DEFAULT 512
105
106 /*
107  * These two macros need refining for the legacy and DPDK-based front ends:
108  * take the max number of available vring descriptors/entries from the
109  * guest, subtract MAX_PKT_BURST, and then round down to a power of 2.
110  */
111 /*
112  * For the legacy front end there are 128 descriptors:
113  * half for the virtio headers, the other half for the mbufs.
114  */
115 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
116 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64.  */
117
118 /* Get first 4 bytes in mbuf headroom. */
119 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
120                 + sizeof(struct rte_mbuf)))
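/*
 * The macro above yields an lvalue located right after struct rte_mbuf,
 * i.e. the start of the data room, assuming the mbufs carry no private
 * area. A minimal usage sketch, with desc_idx as a hypothetical
 * descriptor index stashed by the zero-copy path:
 *
 *     MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;    // store
 *     uint32_t idx = MBUF_HEADROOM_UINT32(mbuf);          // load
 */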
121
122 /* true if x is a power of 2 */
123 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
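/*
 * E.g. POWEROF2(64) is true and POWEROF2(48) is false. Note the macro
 * also accepts 0, so callers that must reject 0 need an extra check.
 */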
124
125 #define INVALID_PORT_ID 0xFF
126
127 /* Max number of devices. Limited by vmdq. */
128 #define MAX_DEVICES 64
129
130 /* Size of buffers used for snprintfs. */
131 #define MAX_PRINT_BUFF 6072
132
133 /* Maximum character device basename size. */
134 #define MAX_BASENAME_SZ 10
135
136 /* Maximum long option length for option parsing. */
137 #define MAX_LONG_OPT_SZ 64
138
139 /* Used to compare MAC addresses. */
140 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
141
142 /* Number of descriptors per cacheline. */
143 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
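/*
 * E.g. with 64-byte cache lines and the 16-byte struct vring_desc
 * (u64 addr + u32 len + u16 flags + u16 next) this works out to
 * 4 descriptors per cache line.
 */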
144
145 #define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
146
147 /* mask of enabled ports */
148 static uint32_t enabled_port_mask = 0;
149
150 /* Promiscuous mode */
151 static uint32_t promiscuous;
152
153 /*Number of switching cores enabled*/
154 static uint32_t num_switching_cores = 0;
155
156 /* number of devices/queues to support*/
157 static uint32_t num_queues = 0;
158 static uint32_t num_devices;
159
160 /*
161  * Enable zero copy: guest packet buffers are handed directly to the HW
162  * descriptors (DMA straight into guest memory); disabled by default.
163  */
164 static uint32_t zero_copy;
165 static int mergeable;
166
167 /* Do VLAN strip on the host, enabled by default */
168 static uint32_t vlan_strip = 1;
169
170 /* number of descriptors to apply*/
171 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
172 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
173
174 /* Max ring descriptors: ixgbe, i40e and e1000 all support up to 4096. */
175 #define MAX_RING_DESC 4096
176
177 struct vpool {
178         struct rte_mempool *pool;
179         struct rte_ring *ring;
180         uint32_t buf_size;
181 } vpool_array[MAX_QUEUES+MAX_QUEUES];
182
183 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
184 typedef enum {
185         VM2VM_DISABLED = 0,
186         VM2VM_SOFTWARE = 1,
187         VM2VM_HARDWARE = 2,
188         VM2VM_LAST
189 } vm2vm_type;
190 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
191
192 /* The type of host physical address translated from guest physical address. */
193 typedef enum {
194         PHYS_ADDR_CONTINUOUS = 0,
195         PHYS_ADDR_CROSS_SUBREG = 1,
196         PHYS_ADDR_INVALID = 2,
197         PHYS_ADDR_LAST
198 } hpa_type;
199
200 /* Enable stats. */
201 static uint32_t enable_stats = 0;
202 /* Enable retries on RX. */
203 static uint32_t enable_retry = 1;
204
205 /* Enable TX checksum offload (disabled by default) */
206 static uint32_t enable_tx_csum;
207
208 /* Enable TSO offload (disabled by default) */
209 static uint32_t enable_tso;
210
211 /* Specify timeout (in microseconds) between retries on RX. */
212 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
213 /* Specify the number of retries on RX. */
214 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
215
216 /* Character device basename. Can be set by user. */
217 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
218
219 /* Empty VMDQ configuration structure. Filled in programmatically. */
220 static struct rte_eth_conf vmdq_conf_default = {
221         .rxmode = {
222                 .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
223                 .split_hdr_size = 0,
224                 .header_split   = 0, /**< Header Split disabled */
225                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
226                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
227                 /*
228                  * VLAN strip is necessary for 1G NICs such as the I350;
229                  * it fixes a bug where IPv4 forwarding in the guest could
230                  * not forward packets from one virtio dev to another.
231                  */
232                 .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
233                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
234                 .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
235         },
236
237         .txmode = {
238                 .mq_mode = ETH_MQ_TX_NONE,
239         },
240         .rx_adv_conf = {
241                 /*
242                  * should be overridden separately in code with
243                  * appropriate values
244                  */
245                 .vmdq_rx_conf = {
246                         .nb_queue_pools = ETH_8_POOLS,
247                         .enable_default_pool = 0,
248                         .default_pool = 0,
249                         .nb_pool_maps = 0,
250                         .pool_map = {{0, 0},},
251                 },
252         },
253 };
254
255 static unsigned lcore_ids[RTE_MAX_LCORE];
256 static uint8_t ports[RTE_MAX_ETHPORTS];
257 static unsigned num_ports = 0; /**< The number of ports specified in command line */
258 static uint16_t num_pf_queues, num_vmdq_queues;
259 static uint16_t vmdq_pool_base, vmdq_queue_base;
260 static uint16_t queues_per_pool;
261
262 static const uint16_t external_pkt_default_vlan_tag = 2000;
263 const uint16_t vlan_tags[] = {
264         1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
265         1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
266         1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
267         1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
268         1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
269         1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
270         1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
271         1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
272 };
273
274 /* ethernet addresses of ports */
275 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
276
277 /* heads for the main used and free linked lists for the data path. */
278 static struct virtio_net_data_ll *ll_root_used = NULL;
279 static struct virtio_net_data_ll *ll_root_free = NULL;
280
281 /* Array of data core structures containing information on individual core linked lists. */
282 static struct lcore_info lcore_info[RTE_MAX_LCORE];
283
284 /* Used for queueing bursts of TX packets. */
285 struct mbuf_table {
286         unsigned len;
287         unsigned txq_id;
288         struct rte_mbuf *m_table[MAX_PKT_BURST];
289 };
290
291 /* TX queue for each data core. */
292 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
293
294 /* TX queue for each virtio device for zero copy. */
295 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
296
297 /* Vlan header struct used to insert vlan tags on TX. */
298 struct vlan_ethhdr {
299         unsigned char   h_dest[ETH_ALEN];
300         unsigned char   h_source[ETH_ALEN];
301         __be16          h_vlan_proto;
302         __be16          h_vlan_TCI;
303         __be16          h_vlan_encapsulated_proto;
304 };
305
306 /* Header lengths. */
307 #define VLAN_HLEN       4
308 #define VLAN_ETH_HLEN   18
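/*
 * On the wire a VLAN-tagged frame header is 18 bytes: 6 (dst MAC) +
 * 6 (src MAC) + 4 (802.1Q tag: TPID 0x8100 + TCI) + 2 (encapsulated
 * ethertype), which is what VLAN_ETH_HLEN reflects; VLAN_HLEN is the
 * 4-byte tag alone.
 */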
309
310 /* Per-device statistics struct */
311 struct device_statistics {
312         uint64_t tx_total;
313         rte_atomic64_t rx_total_atomic;
314         uint64_t rx_total;
315         uint64_t tx;
316         rte_atomic64_t rx_atomic;
317         uint64_t rx;
318 } __rte_cache_aligned;
319 struct device_statistics dev_statistics[MAX_DEVICES];
320
321 /*
322  * Builds up the correct configuration for VMDQ VLAN pool map
323  * according to the pool & queue limits.
324  */
325 static inline int
326 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
327 {
328         struct rte_eth_vmdq_rx_conf conf;
329         struct rte_eth_vmdq_rx_conf *def_conf =
330                 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
331         unsigned i;
332
333         memset(&conf, 0, sizeof(conf));
334         conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
335         conf.nb_pool_maps = num_devices;
336         conf.enable_loop_back = def_conf->enable_loop_back;
337         conf.rx_mode = def_conf->rx_mode;
338
339         for (i = 0; i < conf.nb_pool_maps; i++) {
340                 conf.pool_map[i].vlan_id = vlan_tags[i];
341                 conf.pool_map[i].pools = (1UL << i);
342         }
343
344         (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
345         (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
346                    sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
347         return 0;
348 }
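/*
 * Illustrative outcome of the mapping above (not from a live run): with
 * num_devices = 8, pool_map[i] becomes { .vlan_id = 1000 + i,
 * .pools = 1UL << i } for i = 0..7, so each VMDQ pool receives exactly
 * one of the VLANs from vlan_tags[].
 */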
349
350 /*
351  * Validate the device number against the max pool number obtained from
352  * dev_info. If the device number is invalid, print an error message and
353  * return -1. Each device must have its own pool.
354  */
355 static inline int
356 validate_num_devices(uint32_t max_nb_devices)
357 {
358         if (num_devices > max_nb_devices) {
359                 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
360                 return -1;
361         }
362         return 0;
363 }
364
365 /*
366  * Initialises a given port using global settings and with the rx buffers
367  * coming from the mbuf_pool passed as parameter
368  */
369 static inline int
370 port_init(uint8_t port)
371 {
372         struct rte_eth_dev_info dev_info;
373         struct rte_eth_conf port_conf;
374         struct rte_eth_rxconf *rxconf;
375         struct rte_eth_txconf *txconf;
376         int16_t rx_rings, tx_rings;
377         uint16_t rx_ring_size, tx_ring_size;
378         int retval;
379         uint16_t q;
380
381         /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
382         rte_eth_dev_info_get(port, &dev_info);
383
384         if (dev_info.max_rx_queues > MAX_QUEUES) {
385                 rte_exit(EXIT_FAILURE,
386                         "please define MAX_QUEUES no less than %u in %s\n",
387                         dev_info.max_rx_queues, __FILE__);
388         }
389
390         rxconf = &dev_info.default_rxconf;
391         txconf = &dev_info.default_txconf;
392         rxconf->rx_drop_en = 1;
393
394         /* Enable vlan offload */
395         txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
396
397         /*
398          * Zero copy defers queue RX/TX start to the time when guest
399          * finishes its startup and packet buffers from that guest are
400          * available.
401          */
402         if (zero_copy) {
403                 rxconf->rx_deferred_start = 1;
404                 rxconf->rx_drop_en = 0;
405                 txconf->tx_deferred_start = 1;
406         }
407
408         /* Configure the number of supported virtio devices based on VMDQ limits. */
409         num_devices = dev_info.max_vmdq_pools;
410
411         if (zero_copy) {
412                 rx_ring_size = num_rx_descriptor;
413                 tx_ring_size = num_tx_descriptor;
414                 tx_rings = dev_info.max_tx_queues;
415         } else {
416                 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
417                 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
418                 tx_rings = (uint16_t)rte_lcore_count();
419         }
420
421         retval = validate_num_devices(MAX_DEVICES);
422         if (retval < 0)
423                 return retval;
424
425         /* Get port configuration. */
426         retval = get_eth_conf(&port_conf, num_devices);
427         if (retval < 0)
428                 return retval;
429         /* NIC queues are divided into pf queues and vmdq queues.  */
430         num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
431         queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
432         num_vmdq_queues = num_devices * queues_per_pool;
433         num_queues = num_pf_queues + num_vmdq_queues;
434         vmdq_queue_base = dev_info.vmdq_queue_base;
435         vmdq_pool_base  = dev_info.vmdq_pool_base;
436         printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
437                 num_pf_queues, num_devices, queues_per_pool);
438
439         if (port >= rte_eth_dev_count()) return -1;
440
441         if (enable_tx_csum == 0)
442                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
443
444         if (enable_tso == 0) {
445                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4);
446                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO6);
447         }
448
449         rx_rings = (uint16_t)dev_info.max_rx_queues;
450         /* Configure ethernet device. */
451         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
452         if (retval != 0)
453                 return retval;
454
455         /* Setup the queues. */
456         for (q = 0; q < rx_rings; q ++) {
457                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458                                                 rte_eth_dev_socket_id(port),
459                                                 rxconf,
460                                                 vpool_array[q].pool);
461                 if (retval < 0)
462                         return retval;
463         }
464         for (q = 0; q < tx_rings; q ++) {
465                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
466                                                 rte_eth_dev_socket_id(port),
467                                                 txconf);
468                 if (retval < 0)
469                         return retval;
470         }
471
472         /* Start the device. */
473         retval  = rte_eth_dev_start(port);
474         if (retval < 0) {
475                 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
476                 return retval;
477         }
478
479         if (promiscuous)
480                 rte_eth_promiscuous_enable(port);
481
482         rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
483         RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
484         RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
485                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
486                         (unsigned)port,
487                         vmdq_ports_eth_addr[port].addr_bytes[0],
488                         vmdq_ports_eth_addr[port].addr_bytes[1],
489                         vmdq_ports_eth_addr[port].addr_bytes[2],
490                         vmdq_ports_eth_addr[port].addr_bytes[3],
491                         vmdq_ports_eth_addr[port].addr_bytes[4],
492                         vmdq_ports_eth_addr[port].addr_bytes[5]);
493
494         return 0;
495 }
496
497 /*
498  * Set character device basename.
499  */
500 static int
501 us_vhost_parse_basename(const char *q_arg)
502 {
503         /* Reject over-long basenames; strnlen() never exceeds its bound,
504          * so the comparison must be >=, not >. */
505         if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
506                 return -1;
507         else
508                 snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
509
510         return 0;
511 }
512
513 /*
514  * Parse the portmask provided at run time.
515  */
516 static int
517 parse_portmask(const char *portmask)
518 {
519         char *end = NULL;
520         unsigned long pm;
521
522         errno = 0;
523
524         /* parse hexadecimal string */
525         pm = strtoul(portmask, &end, 16);
526         if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
527                 return -1;
528
529         if (pm == 0)
530                 return -1;
531
532         return pm;
533
534 }
535
536 /*
537  * Parse num options at run time.
538  */
539 static int
540 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
541 {
542         char *end = NULL;
543         unsigned long num;
544
545         errno = 0;
546
547         /* parse unsigned int string */
548         num = strtoul(q_arg, &end, 10);
549         if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
550                 return -1;
551
552         if (num > max_valid_value)
553                 return -1;
554
555         return num;
556
557 }
558
559 /*
560  * Display usage
561  */
562 static void
563 us_vhost_usage(const char *prgname)
564 {
565         RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
566         "               --vm2vm [0|1|2]\n"
567         "               --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
568         "               --dev-basename <name>\n"
570         "               -p PORTMASK: Set mask for ports to be used by application\n"
571         "               --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
572         "               --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
573         "               --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if rx retries are enabled\n"
574         "               --rx-retry-num [0-N]: the number of retries on rx. This only takes effect if rx retries are enabled\n"
575         "               --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
576         "               --vlan-strip [0|1]: disable/enable(default) RX VLAN strip on host\n"
577         "               --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
578         "               --dev-basename: The basename to be used for the character device.\n"
579         "               --zero-copy [0|1]: disable(default)/enable rx/tx "
580                         "zero copy\n"
581         "               --rx-desc-num [0-N]: the number of descriptors on rx, "
582                         "used only when zero copy is enabled.\n"
583         "               --tx-desc-num [0-N]: the number of descriptors on tx, "
584                         "used only when zero copy is enabled.\n"
585         "               --tx-csum [0|1] disable/enable TX checksum offload.\n"
586         "               --tso [0|1] disable/enable TCP segment offload.\n",
587                prgname);
588 }
589
590 /*
591  * Parse the arguments given in the command line of the application.
592  */
593 static int
594 us_vhost_parse_args(int argc, char **argv)
595 {
596         int opt, ret;
597         int option_index;
598         unsigned i;
599         const char *prgname = argv[0];
600         static struct option long_option[] = {
601                 {"vm2vm", required_argument, NULL, 0},
602                 {"rx-retry", required_argument, NULL, 0},
603                 {"rx-retry-delay", required_argument, NULL, 0},
604                 {"rx-retry-num", required_argument, NULL, 0},
605                 {"mergeable", required_argument, NULL, 0},
606                 {"vlan-strip", required_argument, NULL, 0},
607                 {"stats", required_argument, NULL, 0},
608                 {"dev-basename", required_argument, NULL, 0},
609                 {"zero-copy", required_argument, NULL, 0},
610                 {"rx-desc-num", required_argument, NULL, 0},
611                 {"tx-desc-num", required_argument, NULL, 0},
612                 {"tx-csum", required_argument, NULL, 0},
613                 {"tso", required_argument, NULL, 0},
614                 {NULL, 0, 0, 0},
615         };
616
617         /* Parse command line */
618         while ((opt = getopt_long(argc, argv, "p:P",
619                         long_option, &option_index)) != EOF) {
620                 switch (opt) {
621                 /* Portmask */
622                 case 'p':
623                         ret = parse_portmask(optarg);
624                         if (ret == -1) {
625                                 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
626                                 us_vhost_usage(prgname);
627                                 return -1;
628                         }
629                         enabled_port_mask = (uint32_t)ret;
630                         break;
630
631                 case 'P':
632                         promiscuous = 1;
633                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
634                                 ETH_VMDQ_ACCEPT_BROADCAST |
635                                 ETH_VMDQ_ACCEPT_MULTICAST;
636                         rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
637
638                         break;
639
640                 case 0:
641                         /* Enable/disable vm2vm comms. */
642                         if (!strncmp(long_option[option_index].name, "vm2vm",
643                                 MAX_LONG_OPT_SZ)) {
644                                 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
645                                 if (ret == -1) {
646                                         RTE_LOG(INFO, VHOST_CONFIG,
647                                                 "Invalid argument for "
648                                                 "vm2vm [0|1|2]\n");
649                                         us_vhost_usage(prgname);
650                                         return -1;
651                                 } else {
652                                         vm2vm_mode = (vm2vm_type)ret;
653                                 }
654                         }
655
656                         /* Enable/disable retries on RX. */
657                         if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
658                                 ret = parse_num_opt(optarg, 1);
659                                 if (ret == -1) {
660                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
661                                         us_vhost_usage(prgname);
662                                         return -1;
663                                 } else {
664                                         enable_retry = ret;
665                                 }
666                         }
667
668                         /* Enable/disable TX checksum offload. */
669                         if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
670                                 ret = parse_num_opt(optarg, 1);
671                                 if (ret == -1) {
672                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
673                                         us_vhost_usage(prgname);
674                                         return -1;
675                                 } else
676                                         enable_tx_csum = ret;
677                         }
678
679                         /* Enable/disable TSO offload. */
680                         if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
681                                 ret = parse_num_opt(optarg, 1);
682                                 if (ret == -1) {
683                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
684                                         us_vhost_usage(prgname);
685                                         return -1;
686                                 } else
687                                         enable_tso = ret;
688                         }
689
690                         /* Specify the retries delay time (in useconds) on RX. */
691                         if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
692                                 ret = parse_num_opt(optarg, INT32_MAX);
693                                 if (ret == -1) {
694                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
695                                         us_vhost_usage(prgname);
696                                         return -1;
697                                 } else {
698                                         burst_rx_delay_time = ret;
699                                 }
700                         }
701
702                         /* Specify the retries number on RX. */
703                         if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
704                                 ret = parse_num_opt(optarg, INT32_MAX);
705                                 if (ret == -1) {
706                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
707                                         us_vhost_usage(prgname);
708                                         return -1;
709                                 } else {
710                                         burst_rx_retry_num = ret;
711                                 }
712                         }
713
714                         /* Enable/disable RX mergeable buffers. */
715                         if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
716                                 ret = parse_num_opt(optarg, 1);
717                                 if (ret == -1) {
718                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
719                                         us_vhost_usage(prgname);
720                                         return -1;
721                                 } else {
722                                         mergeable = !!ret;
723                                         if (ret) {
724                                                 vmdq_conf_default.rxmode.jumbo_frame = 1;
725                                                 vmdq_conf_default.rxmode.max_rx_pkt_len
726                                                         = JUMBO_FRAME_MAX_SIZE;
727                                         }
728                                 }
729                         }
730
731                         /* Enable/disable RX VLAN strip on host. */
732                         if (!strncmp(long_option[option_index].name,
733                                 "vlan-strip", MAX_LONG_OPT_SZ)) {
734                                 ret = parse_num_opt(optarg, 1);
735                                 if (ret == -1) {
736                                         RTE_LOG(INFO, VHOST_CONFIG,
737                                                 "Invalid argument for VLAN strip [0|1]\n");
738                                         us_vhost_usage(prgname);
739                                         return -1;
740                                 } else {
741                                         vlan_strip = !!ret;
742                                         vmdq_conf_default.rxmode.hw_vlan_strip =
743                                                 vlan_strip;
744                                 }
745                         }
746
747                         /* Enable/disable stats. */
748                         if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
749                                 ret = parse_num_opt(optarg, INT32_MAX);
750                                 if (ret == -1) {
751                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
752                                         us_vhost_usage(prgname);
753                                         return -1;
754                                 } else {
755                                         enable_stats = ret;
756                                 }
757                         }
758
759                         /* Set character device basename. */
760                         if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
761                                 if (us_vhost_parse_basename(optarg) == -1) {
762                                         RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
763                                         us_vhost_usage(prgname);
764                                         return -1;
765                                 }
766                         }
767
768                         /* Enable/disable rx/tx zero copy. */
769                         if (!strncmp(long_option[option_index].name,
770                                 "zero-copy", MAX_LONG_OPT_SZ)) {
771                                 ret = parse_num_opt(optarg, 1);
772                                 if (ret == -1) {
773                                         RTE_LOG(INFO, VHOST_CONFIG,
774                                                 "Invalid argument"
775                                                 " for zero-copy [0|1]\n");
776                                         us_vhost_usage(prgname);
777                                         return -1;
778                                 } else
779                                         zero_copy = ret;
780                         }
781
782                         /* Specify the descriptor number on RX. */
783                         if (!strncmp(long_option[option_index].name,
784                                 "rx-desc-num", MAX_LONG_OPT_SZ)) {
785                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
786                                 if ((ret == -1) || (!POWEROF2(ret))) {
787                                         RTE_LOG(INFO, VHOST_CONFIG,
788                                         "Invalid argument for rx-desc-num [0-N], "
789                                         "power of 2 required.\n");
790                                         us_vhost_usage(prgname);
791                                         return -1;
792                                 } else {
793                                         num_rx_descriptor = ret;
794                                 }
795                         }
796
797                         /* Specify the descriptor number on TX. */
798                         if (!strncmp(long_option[option_index].name,
799                                 "tx-desc-num", MAX_LONG_OPT_SZ)) {
800                                 ret = parse_num_opt(optarg, MAX_RING_DESC);
801                                 if ((ret == -1) || (!POWEROF2(ret))) {
802                                         RTE_LOG(INFO, VHOST_CONFIG,
803                                         "Invalid argument for tx-desc-num [0-N], "
804                                         "power of 2 required.\n");
805                                         us_vhost_usage(prgname);
806                                         return -1;
807                                 } else {
808                                         num_tx_descriptor = ret;
809                                 }
810                         }
811
812                         break;
813
814                         /* Invalid option - print options. */
815                 default:
816                         us_vhost_usage(prgname);
817                         return -1;
818                 }
819         }
820
821         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
822                 if (enabled_port_mask & (1 << i))
823                         ports[num_ports++] = (uint8_t)i;
824         }
825
826         if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
827                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
828                         "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
829                 return -1;
830         }
831
832         if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
833                 RTE_LOG(INFO, VHOST_PORT,
834                         "Vhost zero copy doesn't support software vm2vm, "
835                         "please specify 'vm2vm 2' to use hardware vm2vm.\n");
836                 return -1;
837         }
838
839         if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
840                 RTE_LOG(INFO, VHOST_PORT,
841                         "Vhost zero copy doesn't support jumbo frames, "
842                         "please specify '--mergeable 0' to disable the "
843                         "mergeable feature.\n");
844                 return -1;
845         }
846
847         return 0;
848 }
849
850 /*
851  * Update the global var num_ports and the ports array according to the
852  * number of system ports, and return the number of valid ports.
853  */
854 static unsigned check_ports_num(unsigned nb_ports)
855 {
856         unsigned valid_num_ports = num_ports;
857         unsigned portid;
858
859         if (num_ports > nb_ports) {
860                 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
861                         num_ports, nb_ports);
862                 num_ports = nb_ports;
863         }
864
865         for (portid = 0; portid < num_ports; portid ++) {
866                 if (ports[portid] >= nb_ports) {
867                         RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
868                                 ports[portid], (nb_ports - 1));
869                         ports[portid] = INVALID_PORT_ID;
870                         valid_num_ports--;
871                 }
872         }
873         return valid_num_ports;
874 }
875
876 /*
877  * Macro to print out packet contents. Wrapped in debug define so that the
878  * data path is not affected when debug is disabled.
879  */
880 #ifdef DEBUG
881 #define PRINT_PACKET(device, addr, size, header) do {                        \
882         char *pkt_addr = (char *)(addr);                                      \
883         unsigned int index;                                                   \
884         char packet[MAX_PRINT_BUFF];                                          \
885                                                                               \
886         if ((header))                                                         \
887                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
888         else                                                                  \
889                 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
890         for (index = 0; index < (size); index++) {                            \
891                 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
892                         "%02hhx ", pkt_addr[index]);                          \
893         }                                                                     \
894         snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
895                                                                               \
896         LOG_DEBUG(VHOST_DATA, "%s", packet);                                  \
897 } while (0)
898 #else
899 #define PRINT_PACKET(device, addr, size, header) do {} while (0)
900 #endif
901
902 /*
903  * Function to convert guest physical addresses to vhost physical addresses.
904  * This is used to convert virtio buffer addresses.
905  */
906 static inline uint64_t __attribute__((always_inline))
907 gpa_to_hpa(struct vhost_dev  *vdev, uint64_t guest_pa,
908         uint32_t buf_len, hpa_type *addr_type)
909 {
910         struct virtio_memory_regions_hpa *region;
911         uint32_t regionidx;
912         uint64_t vhost_pa = 0;
913
914         *addr_type = PHYS_ADDR_INVALID;
915
916         for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
917                 region = &vdev->regions_hpa[regionidx];
918                 if ((guest_pa >= region->guest_phys_address) &&
919                         (guest_pa <= region->guest_phys_address_end)) {
920                         vhost_pa = region->host_phys_addr_offset + guest_pa;
921                         if (likely((guest_pa + buf_len - 1)
922                                 <= region->guest_phys_address_end))
923                                 *addr_type = PHYS_ADDR_CONTINUOUS;
924                         else
925                                 *addr_type = PHYS_ADDR_CROSS_SUBREG;
926                         break;
927                 }
928         }
929
930         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
931                 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
932                 (void *)(uintptr_t)vhost_pa);
933
934         return vhost_pa;
935 }
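/*
 * Translation example (hypothetical region): if a region spans guest
 * physical 0x10000000..0x1fffffff with host_phys_addr_offset 0x20000000,
 * guest_pa 0x10001000 yields vhost_pa 0x30001000. The address type is
 * PHYS_ADDR_CONTINUOUS only when guest_pa + buf_len - 1 still falls in
 * the region; otherwise the buffer crosses a sub-region boundary and the
 * caller has to handle PHYS_ADDR_CROSS_SUBREG.
 */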
936
937 /*
938  * Compares a packet destination MAC address to a device MAC address.
939  */
940 static inline int __attribute__((always_inline))
941 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
942 {
943         return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
944 }
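/*
 * Note on ether_addr_cmp(): it loads 8 bytes from each 6-byte address and
 * masks the XOR down to the low 48 bits (MAC_ADDR_CMP). This assumes a
 * little-endian CPU (low 48 bits == first 6 bytes in memory) and that the
 * two bytes past each struct ether_addr are readable, which holds here
 * because the addresses compared live inside larger structures.
 */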
945
946 /*
947  * This function learns the MAC address of the device and registers this along with a
948  * vlan tag to a VMDQ.
949  */
950 static int
951 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
952 {
953         struct ether_hdr *pkt_hdr;
954         struct virtio_net_data_ll *dev_ll;
955         struct virtio_net *dev = vdev->dev;
956         int i, ret;
957
958         /* Learn MAC address of guest device from packet */
959         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
960
961         dev_ll = ll_root_used;
962
963         while (dev_ll != NULL) {
964                 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
965                         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
966                         return -1;
967                 }
968                 dev_ll = dev_ll->next;
969         }
970
971         for (i = 0; i < ETHER_ADDR_LEN; i++)
972                 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
973
974         /* vlan_tag currently uses the device_id. */
975         vdev->vlan_tag = vlan_tags[dev->device_fh];
976
977         /* Print out VMDQ registration info. */
978         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
979                 dev->device_fh,
980                 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
981                 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
982                 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
983                 vdev->vlan_tag);
984
985         /* Register the MAC address. */
986         ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
987                                 (uint32_t)dev->device_fh + vmdq_pool_base);
988         if (ret)
989                 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
990                                         dev->device_fh);
991
992         /* Enable stripping of the vlan tag as we handle routing. */
993         if (vlan_strip)
994                 rte_eth_dev_set_vlan_strip_on_queue(ports[0],
995                         (uint16_t)vdev->vmdq_rx_q, 1);
996
997         /* Set device as ready for RX. */
998         vdev->ready = DEVICE_RX;
999
1000         return 0;
1001 }
1002
1003 /*
1004  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1005  * queue before disabling RX on the device.
1006  */
1007 static inline void
1008 unlink_vmdq(struct vhost_dev *vdev)
1009 {
1010         unsigned i = 0;
1011         unsigned rx_count;
1012         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1013
1014         if (vdev->ready == DEVICE_RX) {
1015                 /*clear MAC and VLAN settings*/
1016                 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
1017                 for (i = 0; i < 6; i++)
1018                         vdev->mac_address.addr_bytes[i] = 0;
1019
1020                 vdev->vlan_tag = 0;
1021
1022                 /*Clear out the receive buffers*/
1023                 rx_count = rte_eth_rx_burst(ports[0],
1024                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1025
1026                 while (rx_count) {
1027                         for (i = 0; i < rx_count; i++)
1028                                 rte_pktmbuf_free(pkts_burst[i]);
1029
1030                         rx_count = rte_eth_rx_burst(ports[0],
1031                                         (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1032                 }
1033
1034                 vdev->ready = DEVICE_MAC_LEARNING;
1035         }
1036 }
1037
1038 /*
1039  * Check if the packet destination MAC address is for a local device. If so then put
1040  * the packet on that device's RX queue. If not then return.
1041  */
1042 static inline int __attribute__((always_inline))
1043 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1044 {
1045         struct virtio_net_data_ll *dev_ll;
1046         struct ether_hdr *pkt_hdr;
1047         uint64_t ret = 0;
1048         struct virtio_net *dev = vdev->dev;
1049         struct virtio_net *tdev; /* destination virtio device */
1050
1051         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1052
1053         /*get the used devices list*/
1054         dev_ll = ll_root_used;
1055
1056         while (dev_ll != NULL) {
1057                 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1058                                           &dev_ll->vdev->mac_address)) {
1059
1060                         /* Drop the packet if the TX packet is destined for the TX device. */
1061                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1062                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1063                                                         dev->device_fh);
1064                                 return 0;
1065                         }
1066                         tdev = dev_ll->vdev->dev;
1067
1068
1069                         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1070
1071                         if (unlikely(dev_ll->vdev->remove)) {
1072                                 /*drop the packet if the device is marked for removal*/
1073                                 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1074                         } else {
1075                                 /*send the packet to the local virtio device*/
1076                                 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1077                                 if (enable_stats) {
1078                                         rte_atomic64_add(
1079                                         &dev_statistics[tdev->device_fh].rx_total_atomic,
1080                                         1);
1081                                         rte_atomic64_add(
1082                                         &dev_statistics[tdev->device_fh].rx_atomic,
1083                                         ret);
1084                                         dev_statistics[dev->device_fh].tx_total++;
1085                                         dev_statistics[dev->device_fh].tx += ret;
1086                                 }
1087                         }
1088
1089                         return 0;
1090                 }
1091                 dev_ll = dev_ll->next;
1092         }
1093
1094         return -1;
1095 }
1096
1097 /*
1098  * Check if the destination MAC of a packet belongs to a local VM,
1099  * and if so, get its VLAN tag and the required length offset.
1100  */
1101 static inline int __attribute__((always_inline))
1102 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1103         uint32_t *offset, uint16_t *vlan_tag)
1104 {
1105         struct virtio_net_data_ll *dev_ll = ll_root_used;
1106         struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1107
1108         while (dev_ll != NULL) {
1109                 if ((dev_ll->vdev->ready == DEVICE_RX)
1110                         && ether_addr_cmp(&(pkt_hdr->d_addr),
1111                 &dev_ll->vdev->mac_address)) {
1112                         /*
1113                          * Drop the packet if the TX packet is
1114                          * destined for the TX device.
1115                          */
1116                         if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1117                                 LOG_DEBUG(VHOST_DATA,
1118                                 "(%"PRIu64") TX: Source and destination"
1119                                 " MAC addresses are the same. Dropping "
1120                                 "packet.\n",
1121                                 dev_ll->vdev->dev->device_fh);
1122                                 return -1;
1123                         }
1124
1125                         /*
1126                          * HW VLAN strip reduces the packet length by the
1127                          * length of the VLAN tag, so the packet length
1128                          * must be restored by adding it back.
1129                          */
1130                         *offset = VLAN_HLEN;
1131                         *vlan_tag =
1132                         (uint16_t)
1133                         vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1134
1135                         LOG_DEBUG(VHOST_DATA,
1136                         "(%"PRIu64") TX: pkt to local VM device id:"
1137                         "(%"PRIu64") vlan tag: %d.\n",
1138                         dev->device_fh, dev_ll->vdev->dev->device_fh,
1139                         (int)*vlan_tag);
1140
1141                         break;
1142                 }
1143                 dev_ll = dev_ll->next;
1144         }
1145         return 0;
1146 }
1147
1148 static uint16_t
1149 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1150 {
1151         if (ol_flags & PKT_TX_IPV4)
1152                 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1153         else /* assume ethertype == ETHER_TYPE_IPv6 */
1154                 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
1155 }
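/*
 * Background (general checksum-offload behaviour, not specific to this
 * file): NICs doing TCP checksum/TSO offload expect tcp_hdr->cksum to be
 * pre-seeded with just the pseudo-header checksum. rte_ipv4_phdr_cksum()
 * and rte_ipv6_phdr_cksum() compute exactly that and, when PKT_TX_TCP_SEG
 * is set, leave the length out since TSO rewrites it per segment. That is
 * why the calculation survives only on the TSO path (the PKT_TX_TCP_SEG
 * check in virtio_tx_route), consistent with this commit dropping the
 * unnecessary calculation for non-TSO packets.
 */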
1156
1157 static void virtio_tx_offload(struct rte_mbuf *m)
1158 {
1159         void *l3_hdr;
1160         struct ipv4_hdr *ipv4_hdr = NULL;
1161         struct tcp_hdr *tcp_hdr = NULL;
1162         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1163
1164         l3_hdr = (char *)eth_hdr + m->l2_len;
1165         tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
1166
1167         /* IP header checksum offload applies to IPv4 only; get_psd_sum()
1168          * already covers both the IPv4 and IPv6 pseudo-header cases. */
1169         if (m->ol_flags & PKT_TX_IPV4) {
1170                 ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
1171                 ipv4_hdr->hdr_checksum = 0;
1172                 m->ol_flags |= PKT_TX_IP_CKSUM;
1173         }
1174         tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1175 }
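/*
 * A minimal sketch of the state virtio_tx_offload() relies on (assumed
 * here, normally set by the vhost library from the guest's virtio-net
 * header rather than by this file):
 *
 *     m->l2_len = sizeof(struct ether_hdr);
 *     m->l3_len = sizeof(struct ipv4_hdr);
 *     m->ol_flags |= PKT_TX_IPV4 | PKT_TX_TCP_SEG;
 *     m->tso_segsz = gso_size;       // hypothetical segment size
 *     virtio_tx_offload(m);
 */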
1172
1173 /*
1174  * This function routes the TX packet to the correct interface. This may be a local device
1175  * or the physical port.
1176  */
1177 static inline void __attribute__((always_inline))
1178 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1179 {
1180         struct mbuf_table *tx_q;
1181         struct rte_mbuf **m_table;
1182         unsigned len, ret, offset = 0;
1183         const uint16_t lcore_id = rte_lcore_id();
1184         struct virtio_net *dev = vdev->dev;
1185         struct ether_hdr *nh;
1186
1187         /*check if destination is local VM*/
1188         if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1189                 rte_pktmbuf_free(m);
1190                 return;
1191         }
1192
1193         if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1194                 if (unlikely(find_local_dest(dev, m, &offset, &vlan_tag) != 0)) {
1195                         rte_pktmbuf_free(m);
1196                         return;
1197                 }
1198         }
1199
1200         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1201
1202         /*Add packet to the port tx queue*/
1203         tx_q = &lcore_tx_queue[lcore_id];
1204         len = tx_q->len;
1205
1206         nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
1207         if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
1208                 /* Guest has inserted the vlan tag. */
1209                 struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
1210                 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1211                 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1212                         (vh->vlan_tci != vlan_tag_be))
1213                         vh->vlan_tci = vlan_tag_be;
1214         } else {
1215                 m->ol_flags |= PKT_TX_VLAN_PKT;
1216
1217                 /*
1218                  * Find the right seg to adjust the data len when offset is
1219                  * bigger than tail room size.
1220                  */
1221                 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1222                         if (likely(offset <= rte_pktmbuf_tailroom(m)))
1223                                 m->data_len += offset;
1224                         else {
1225                                 struct rte_mbuf *seg = m;
1226
1227                                 while ((seg->next != NULL) &&
1228                                         (offset > rte_pktmbuf_tailroom(seg)))
1229                                         seg = seg->next;
1230
1231                                 seg->data_len += offset;
1232                         }
1233                         m->pkt_len += offset;
1234                 }
1235
1236                 m->vlan_tci = vlan_tag;
1237         }
1238
1239         if (m->ol_flags & PKT_TX_TCP_SEG)
1240                 virtio_tx_offload(m);
1241
1242         tx_q->m_table[len] = m;
1243         len++;
1244         if (enable_stats) {
1245                 dev_statistics[dev->device_fh].tx_total++;
1246                 dev_statistics[dev->device_fh].tx++;
1247         }
1248
1249         if (unlikely(len == MAX_PKT_BURST)) {
1250                 m_table = (struct rte_mbuf **)tx_q->m_table;
1251                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1252                 /* Free any buffers not handled by TX and update the port stats. */
1253                 if (unlikely(ret < len)) {
1254                         do {
1255                                 rte_pktmbuf_free(m_table[ret]);
1256                         } while (++ret < len);
1257                 }
1258
1259                 len = 0;
1260         }
1261
1262         tx_q->len = len;
1263         return;
1264 }
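/*
 * A minimal sketch (illustrative only, mirroring the burst logic above):
 * rte_eth_tx_burst() may transmit fewer packets than requested, so the
 * caller still owns, and must free, the untransmitted tail:
 *
 *     uint16_t sent = rte_eth_tx_burst(port_id, queue_id, m_table, len);
 *
 *     while (sent < len)
 *             rte_pktmbuf_free(m_table[sent++]);
 */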
1265 /*
1266  * This function is called by each data core. It handles all RX/TX registered with the
1267  * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1268  * with all devices in the main linked list.
1269  */
1270 static int
1271 switch_worker(void *arg)
1272 {
1273         struct rte_mempool *mbuf_pool = arg;
1274         struct virtio_net *dev = NULL;
1275         struct vhost_dev *vdev = NULL;
1276         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1277         struct virtio_net_data_ll *dev_ll;
1278         struct mbuf_table *tx_q;
1279         volatile struct lcore_ll_info *lcore_ll;
1280         const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1281         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1282         unsigned ret, i;
1283         const uint16_t lcore_id = rte_lcore_id();
1284         const uint16_t num_cores = (uint16_t)rte_lcore_count();
1285         uint16_t rx_count = 0;
1286         uint16_t tx_count;
1287         uint32_t retry = 0;
1288
1289         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1290         lcore_ll = lcore_info[lcore_id].lcore_ll;
1291         prev_tsc = 0;
1292
1293         tx_q = &lcore_tx_queue[lcore_id];
1294         for (i = 0; i < num_cores; i++) {
1295                 if (lcore_ids[i] == lcore_id) {
1296                         tx_q->txq_id = i;
1297                         break;
1298                 }
1299         }
1300
1301         while (1) {
1302                 cur_tsc = rte_rdtsc();
1303                 /*
1304                  * TX burst queue drain
1305                  */
1306                 diff_tsc = cur_tsc - prev_tsc;
1307                 if (unlikely(diff_tsc > drain_tsc)) {
1308
1309                         if (tx_q->len) {
1310                                 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);
1311
1312                                 /* TX any packets in the queue. */
1313                                 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1314                                                                            (struct rte_mbuf **)tx_q->m_table,
1315                                                                            (uint16_t)tx_q->len);
1316                                 if (unlikely(ret < tx_q->len)) {
1317                                         do {
1318                                                 rte_pktmbuf_free(tx_q->m_table[ret]);
1319                                         } while (++ret < tx_q->len);
1320                                 }
1321
1322                                 tx_q->len = 0;
1323                         }
1324
1325                         prev_tsc = cur_tsc;
1326
1327                 }
1328
1329                 rte_prefetch0(lcore_ll->ll_root_used);
1330                 /*
1331                  * Inform the configuration core that we have exited the linked list and that no devices are
1332                  * in use if requested.
1333                  */
1334                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1335                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1336
1337                 /*
1338                  * Process devices
1339                  */
1340                 dev_ll = lcore_ll->ll_root_used;
1341
1342                 while (dev_ll != NULL) {
1343                         /* Get virtio device ID. */
1344                         vdev = dev_ll->vdev;
1345                         dev = vdev->dev;
1346
1347                         if (unlikely(vdev->remove)) {
1348                                 dev_ll = dev_ll->next;
1349                                 unlink_vmdq(vdev);
1350                                 vdev->ready = DEVICE_SAFE_REMOVE;
1351                                 continue;
1352                         }
1353                         if (likely(vdev->ready == DEVICE_RX)) {
1354                                 /*Handle guest RX*/
1355                                 rx_count = rte_eth_rx_burst(ports[0],
1356                                         vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1357
1358                                 if (rx_count) {
1359                                         /*
1360                                          * If retry is enabled and the queue is full, we wait and retry to avoid packet loss.
1361                                          * Note that MAX_PKT_BURST must be less than the virtio queue size.
1362                                          */
1363                                         if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1364                                                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1365                                                         rte_delay_us(burst_rx_delay_time);
1366                                                         if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1367                                                                 break;
1368                                                 }
1369                                         }
1370                                         ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1371                                         if (enable_stats) {
1372                                                 rte_atomic64_add(
1373                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1374                                                 rx_count);
1375                                                 rte_atomic64_add(
1376                                                 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1377                                         }
1378                                         while (likely(rx_count)) {
1379                                                 rx_count--;
1380                                                 rte_pktmbuf_free(pkts_burst[rx_count]);
1381                                         }
1382
1383                                 }
1384                         }
1385
1386                         if (likely(!vdev->remove)) {
1387                                 /* Handle guest TX*/
1388                                 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1389                                 /* If this is the first received packet we need to learn the MAC and set up VMDQ. */
1390                                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1391                                         if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1392                                                 while (tx_count)
1393                                                         rte_pktmbuf_free(pkts_burst[--tx_count]);
1394                                         }
1395                                 }
1396                                 for (i = 0; i < tx_count; ++i)
1397                                         virtio_tx_route(vdev, pkts_burst[i], (uint16_t)dev->device_fh);
1398                         }
1399
1400                         /* Move to the next device in the list. */
1401                         dev_ll = dev_ll->next;
1402                 }
1403         }
1404
1405         return 0;
1406 }
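/*
 * Worked example for the drain interval used above, assuming a
 * hypothetical 2 GHz TSC and a BURST_TX_DRAIN_US of 100:
 *
 *     cycles_per_us = (2000000000 + US_PER_S - 1) / US_PER_S;   -> 2000
 *     drain_tsc     = cycles_per_us * BURST_TX_DRAIN_US;        -> 200000
 *
 * so a partially filled TX queue is flushed roughly every 100us even if
 * it never reaches MAX_PKT_BURST packets.
 */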
1407
1408 /*
1409  * This function gets available ring number for zero copy rx.
1410  * Only one thread will call this function for a particular virtio device,
1411  * so it is designed as a non-thread-safe function.
1412  */
1413 static inline uint32_t __attribute__((always_inline))
1414 get_available_ring_num_zcp(struct virtio_net *dev)
1415 {
1416         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1417         uint16_t avail_idx;
1418
1419         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1420         return (uint32_t)(avail_idx - vq->last_used_idx_res);
1421 }
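/*
 * Note on the subtraction above: avail->idx and last_used_idx_res are
 * free-running uint16_t counters, so the unsigned difference remains
 * correct across wrap-around. A hypothetical example:
 *
 *     uint16_t avail_idx = 5;       (wrapped past 65535)
 *     uint16_t last_res  = 65533;
 *     uint16_t entries   = avail_idx - last_res;   (8, as expected)
 */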
1422
1423 /*
1424  * This function gets available ring index for zero copy rx,
1425  * retrying up to 'burst_rx_retry_num' times until it gets enough ring entries.
1426  * Only one thread will call this function for a particular virtio device,
1427  * so it is designed as a non-thread-safe function.
1428  */
1429 static inline uint32_t __attribute__((always_inline))
1430 get_available_ring_index_zcp(struct virtio_net *dev,
1431         uint16_t *res_base_idx, uint32_t count)
1432 {
1433         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1434         uint16_t avail_idx;
1435         uint32_t retry = 0;
1436         uint16_t free_entries;
1437
1438         *res_base_idx = vq->last_used_idx_res;
1439         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1440         free_entries = (avail_idx - *res_base_idx);
1441
1442         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1443                         "avail idx: %d, "
1444                         "res base idx:%d, free entries:%d\n",
1445                         dev->device_fh, avail_idx, *res_base_idx,
1446                         free_entries);
1447
1448         /*
1449          * If retry is enabled and the queue is full then we wait
1450          * and retry to avoid packet loss.
1451          */
1452         if (enable_retry && unlikely(count > free_entries)) {
1453                 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1454                         rte_delay_us(burst_rx_delay_time);
1455                         avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1456                         free_entries = (avail_idx - *res_base_idx);
1457                         if (count <= free_entries)
1458                                 break;
1459                 }
1460         }
1461
1462         /* Check that we have enough buffers. */
1463         if (unlikely(count > free_entries))
1464                 count = free_entries;
1465
1466         if (unlikely(count == 0)) {
1467                 LOG_DEBUG(VHOST_DATA,
1468                         "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1469                         "avail idx: %d, res base idx:%d, free entries:%d\n",
1470                         dev->device_fh, avail_idx,
1471                         *res_base_idx, free_entries);
1472                 return 0;
1473         }
1474
1475         vq->last_used_idx_res = *res_base_idx + count;
1476
1477         return count;
1478 }
1479
1480 /*
1481  * This function puts a descriptor back on the used list.
1482  */
1483 static inline void __attribute__((always_inline))
1484 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1485 {
1486         uint16_t res_cur_idx = vq->last_used_idx;
1487         vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1488         vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1489         rte_compiler_barrier();
1490         *(volatile uint16_t *)&vq->used->idx += 1;
1491         vq->last_used_idx += 1;
1492
1493         /* Kick the guest if necessary. */
1494         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1495                 eventfd_write(vq->callfd, (eventfd_t)1);
1496 }
1497
1498 /*
1499  * This function gets an available descriptor from the virtio vring and an
1500  * un-attached mbuf from vpool->ring, and then attaches them together. It must
1501  * adjust the offset for buff_addr and phys_addr according to the PMD
1502  * implementation, otherwise the frame data may land at the wrong mbuf location.
1503  */
1504 static inline void __attribute__((always_inline))
1505 attach_rxmbuf_zcp(struct virtio_net *dev)
1506 {
1507         uint16_t res_base_idx, desc_idx;
1508         uint64_t buff_addr, phys_addr;
1509         struct vhost_virtqueue *vq;
1510         struct vring_desc *desc;
1511         void *obj = NULL;
1512         struct rte_mbuf *mbuf;
1513         struct vpool *vpool;
1514         hpa_type addr_type;
1515         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1516
1517         vpool = &vpool_array[vdev->vmdq_rx_q];
1518         vq = dev->virtqueue[VIRTIO_RXQ];
1519
1520         do {
1521                 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1522                                 1) != 1))
1523                         return;
1524                 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1525
1526                 desc = &vq->desc[desc_idx];
1527                 if (desc->flags & VRING_DESC_F_NEXT) {
1528                         desc = &vq->desc[desc->next];
1529                         buff_addr = gpa_to_vva(dev, desc->addr);
1530                         phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1531                                         &addr_type);
1532                 } else {
1533                         buff_addr = gpa_to_vva(dev,
1534                                         desc->addr + vq->vhost_hlen);
1535                         phys_addr = gpa_to_hpa(vdev,
1536                                         desc->addr + vq->vhost_hlen,
1537                                         desc->len, &addr_type);
1538                 }
1539
1540                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1541                         RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1542                                 " address found when attaching RX frame buffer"
1543                                 " address!\n", dev->device_fh);
1544                         put_desc_to_used_list_zcp(vq, desc_idx);
1545                         continue;
1546                 }
1547
1548                 /*
1549                  * Check if the frame buffer address from guest crosses
1550                  * sub-region or not.
1551                  */
1552                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1553                         RTE_LOG(ERR, VHOST_DATA,
1554                                 "(%"PRIu64") Frame buffer address crossing "
1555                                 "sub-region found when attaching RX frame "
1556                                 "buffer address!\n",
1557                                 dev->device_fh);
1558                         put_desc_to_used_list_zcp(vq, desc_idx);
1559                         continue;
1560                 }
1561         } while (unlikely(phys_addr == 0));
1562
1563         rte_ring_sc_dequeue(vpool->ring, &obj);
1564         mbuf = obj;
1565         if (unlikely(mbuf == NULL)) {
1566                 LOG_DEBUG(VHOST_DATA,
1567                         "(%"PRIu64") in attach_rxmbuf_zcp: "
1568                         "ring_sc_dequeue fail.\n",
1569                         dev->device_fh);
1570                 put_desc_to_used_list_zcp(vq, desc_idx);
1571                 return;
1572         }
1573
1574         if (unlikely(vpool->buf_size > desc->len)) {
1575                 LOG_DEBUG(VHOST_DATA,
1576                         "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1577                         "length(%d) of descriptor idx: %d less than room "
1578                         "size required: %d\n",
1579                         dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1580                 put_desc_to_used_list_zcp(vq, desc_idx);
1581                 rte_ring_sp_enqueue(vpool->ring, obj);
1582                 return;
1583         }
1584
1585         mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1586         mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1587         mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1588         mbuf->data_len = desc->len;
1589         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1590
1591         LOG_DEBUG(VHOST_DATA,
1592                 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1593                 "descriptor idx:%d\n",
1594                 dev->device_fh, res_base_idx, desc_idx);
1595
1596         __rte_mbuf_raw_free(mbuf);
1597
1598         return;
1599 }
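/*
 * Illustrative view of the attach above: the guest buffer is presented
 * to the PMD as if it carried standard headroom, so both base addresses
 * are rewound by RTE_PKTMBUF_HEADROOM and the data offset restores it:
 *
 *     mbuf->buf_addr     = guest_vva - RTE_PKTMBUF_HEADROOM;
 *     mbuf->buf_physaddr = guest_hpa - RTE_PKTMBUF_HEADROOM;
 *     mbuf->data_off     = RTE_PKTMBUF_HEADROOM;
 *
 * so that rte_pktmbuf_mtod(mbuf, void *) again yields guest_vva.
 */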
1600
1601 /*
1602  * Detach an attached packet mbuf -
1603  *  - restore original mbuf address and length values.
1604  *  - reset pktmbuf data and data_len to their default values.
1605  *  All other fields of the given packet mbuf will be left intact.
1606  *
1607  * @param m
1608  *   The attached packet mbuf.
1609  */
1610 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1611 {
1612         const struct rte_mempool *mp = m->pool;
1613         void *buf = rte_mbuf_to_baddr(m);
1614         uint32_t buf_ofs;
1615         uint32_t buf_len = mp->elt_size - sizeof(*m);
1616         m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1617
1618         m->buf_addr = buf;
1619         m->buf_len = (uint16_t)buf_len;
1620
1621         buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1622                         RTE_PKTMBUF_HEADROOM : m->buf_len;
1623         m->data_off = buf_ofs;
1624
1625         m->data_len = 0;
1626 }
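/*
 * Typical recycling sequence (see txmbuf_clean_zcp() below): once the
 * PMD has finished with an attached mbuf, detach it so buf_addr and
 * buf_physaddr point back into its own mempool element, then return it
 * to the ring:
 *
 *     mbuf = __rte_mbuf_raw_alloc(vpool->pool);
 *     pktmbuf_detach_zcp(mbuf);
 *     rte_ring_sp_enqueue(vpool->ring, mbuf);
 */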
1627
1628 /*
1629  * This function is called after packets have been transmitted. It fetches
1630  * mbufs from vpool->pool, detaches them and puts them back into vpool->ring.
1631  * It also updates the used index and kicks the guest if necessary.
1632  */
1633 static inline uint32_t __attribute__((always_inline))
1634 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1635 {
1636         struct rte_mbuf *mbuf;
1637         struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1638         uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1639         uint32_t index = 0;
1640         uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1641
1642         LOG_DEBUG(VHOST_DATA,
1643                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1644                 "clean is: %d\n",
1645                 dev->device_fh, mbuf_count);
1646         LOG_DEBUG(VHOST_DATA,
1647                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring before "
1648                 "clean  is : %d\n",
1649                 dev->device_fh, rte_ring_count(vpool->ring));
1650
1651         for (index = 0; index < mbuf_count; index++) {
1652                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1653                 if (likely(MBUF_EXT_MEM(mbuf)))
1654                         pktmbuf_detach_zcp(mbuf);
1655                 rte_ring_sp_enqueue(vpool->ring, mbuf);
1656
1657                 /* Update used index buffer information. */
1658                 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1659                 vq->used->ring[used_idx].len = 0;
1660
1661                 used_idx = (used_idx + 1) & (vq->size - 1);
1662         }
1663
1664         LOG_DEBUG(VHOST_DATA,
1665                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1666                 "clean is: %d\n",
1667                 dev->device_fh, rte_mempool_count(vpool->pool));
1668         LOG_DEBUG(VHOST_DATA,
1669                 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in  ring after "
1670                 "clean  is : %d\n",
1671                 dev->device_fh, rte_ring_count(vpool->ring));
1672         LOG_DEBUG(VHOST_DATA,
1673                 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1674                 "vq->last_used_idx:%d\n",
1675                 dev->device_fh, vq->last_used_idx);
1676
1677         vq->last_used_idx += mbuf_count;
1678
1679         LOG_DEBUG(VHOST_DATA,
1680                 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1681                 "vq->last_used_idx:%d\n",
1682                 dev->device_fh, vq->last_used_idx);
1683
1684         rte_compiler_barrier();
1685
1686         *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1687
1688         /* Kick guest if required. */
1689         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1690                 eventfd_write(vq->callfd, (eventfd_t)1);
1691
1692         return 0;
1693 }
1694
1695 /*
1696  * This function is called when a virtio device is destroyed.
1697  * It fetches mbufs from vpool->pool, detaches them and puts them back into vpool->ring.
1698  */
1699 static void mbuf_destroy_zcp(struct vpool *vpool)
1700 {
1701         struct rte_mbuf *mbuf = NULL;
1702         uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1703
1704         LOG_DEBUG(VHOST_CONFIG,
1705                 "in mbuf_destroy_zcp: mbuf count in mempool before "
1706                 "mbuf_destroy_zcp is: %d\n",
1707                 mbuf_count);
1708         LOG_DEBUG(VHOST_CONFIG,
1709                 "in mbuf_destroy_zcp: mbuf count in  ring before "
1710                 "mbuf_destroy_zcp  is : %d\n",
1711                 rte_ring_count(vpool->ring));
1712
1713         for (index = 0; index < mbuf_count; index++) {
1714                 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1715                 if (likely(mbuf != NULL)) {
1716                         if (likely(MBUF_EXT_MEM(mbuf)))
1717                                 pktmbuf_detach_zcp(mbuf);
1718                         rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1719                 }
1720         }
1721
1722         LOG_DEBUG(VHOST_CONFIG,
1723                 "in mbuf_destroy_zcp: mbuf count in mempool after "
1724                 "mbuf_destroy_zcp is: %d\n",
1725                 rte_mempool_count(vpool->pool));
1726         LOG_DEBUG(VHOST_CONFIG,
1727                 "in mbuf_destroy_zcp: mbuf count in ring after "
1728                 "mbuf_destroy_zcp is : %d\n",
1729                 rte_ring_count(vpool->ring));
1730 }
1731
1732 /*
1733  * This function updates the used ring and copies the virtio header for each packet received on the zero copy RX path.
1734  */
1735 static inline uint32_t __attribute__((always_inline))
1736 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1737         uint32_t count)
1738 {
1739         struct vhost_virtqueue *vq;
1740         struct vring_desc *desc;
1741         struct rte_mbuf *buff;
1742         /* The virtio_hdr is initialised to 0. */
1743         struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1744                 = {{0, 0, 0, 0, 0, 0}, 0};
1745         uint64_t buff_hdr_addr = 0;
1746         uint32_t head[MAX_PKT_BURST], packet_len = 0;
1747         uint32_t head_idx, packet_success = 0;
1748         uint16_t res_cur_idx;
1749
1750         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1751
1752         if (count == 0)
1753                 return 0;
1754
1755         vq = dev->virtqueue[VIRTIO_RXQ];
1756         count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1757
1758         res_cur_idx = vq->last_used_idx;
1759         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1760                 dev->device_fh, res_cur_idx, res_cur_idx + count);
1761
1762         /* Retrieve all of the head indexes first to avoid caching issues. */
1763         for (head_idx = 0; head_idx < count; head_idx++)
1764                 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1765
1766         /* Prefetch descriptor index. */
1767         rte_prefetch0(&vq->desc[head[packet_success]]);
1768
1769         while (packet_success != count) {
1770                 /* Get descriptor from available ring */
1771                 desc = &vq->desc[head[packet_success]];
1772
1773                 buff = pkts[packet_success];
1774                 LOG_DEBUG(VHOST_DATA,
1775                         "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1776                         "pkt[%d] descriptor idx: %d\n",
1777                         dev->device_fh, packet_success,
1778                         MBUF_HEADROOM_UINT32(buff));
1779
1780                 PRINT_PACKET(dev,
1781                         (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1782                         + RTE_PKTMBUF_HEADROOM),
1783                         rte_pktmbuf_data_len(buff), 0);
1784
1785                 /* Buffer address translation for virtio header. */
1786                 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1787                 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1788
1789                 /*
1790                  * If the descriptors are chained the header and data are
1791                  * placed in separate buffers.
1792                  */
1793                 if (desc->flags & VRING_DESC_F_NEXT) {
1794                         desc->len = vq->vhost_hlen;
1795                         desc = &vq->desc[desc->next];
1796                         desc->len = rte_pktmbuf_data_len(buff);
1797                 } else {
1798                         desc->len = packet_len;
1799                 }
1800
1801                 /* Update used ring with desc information */
1802                 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1803                         = head[packet_success];
1804                 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1805                         = packet_len;
1806                 res_cur_idx++;
1807                 packet_success++;
1808
1809                 /* A header is required per buffer. */
1810                 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1811                         (const void *)&virtio_hdr, vq->vhost_hlen);
1812
1813                 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1814
1815                 if (likely(packet_success < count)) {
1816                         /* Prefetch descriptor index. */
1817                         rte_prefetch0(&vq->desc[head[packet_success]]);
1818                 }
1819         }
1820
1821         rte_compiler_barrier();
1822
1823         LOG_DEBUG(VHOST_DATA,
1824                 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1825                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1826                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1827
1828         *(volatile uint16_t *)&vq->used->idx += count;
1829         vq->last_used_idx += count;
1830
1831         LOG_DEBUG(VHOST_DATA,
1832                 "(%"PRIu64") in dev_rx_zcp: after  update used idx: "
1833                 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1834                 dev->device_fh, vq->last_used_idx, vq->used->idx);
1835
1836         /* Kick the guest if necessary. */
1837         if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1838                 eventfd_write(vq->callfd, (eventfd_t)1);
1839
1840         return count;
1841 }
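/*
 * Buffer layouts handled above, shown schematically: with chained
 * descriptors the virtio header and the frame occupy separate buffers,
 * otherwise a single buffer holds both:
 *
 *     chained: [hdr, vhost_hlen bytes] -> [frame data]
 *     single:  [hdr | frame data]  (desc->len = vhost_hlen + data_len)
 */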
1842
1843 /*
1844  * This function routes the TX packet to the correct interface.
1845  * This may be a local device or the physical port.
1846  */
1847 static inline void __attribute__((always_inline))
1848 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1849         uint32_t desc_idx, uint8_t need_copy)
1850 {
1851         struct mbuf_table *tx_q;
1852         struct rte_mbuf **m_table;
1853         void *obj = NULL;
1854         struct rte_mbuf *mbuf;
1855         unsigned len, ret, offset = 0;
1856         struct vpool *vpool;
1857         uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1858         uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1859
1860         /*Add packet to the port tx queue*/
1861         tx_q = &tx_queue_zcp[vmdq_rx_q];
1862         len = tx_q->len;
1863
1864         /* Allocate an mbuf and populate the structure. */
1865         vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1866         rte_ring_sc_dequeue(vpool->ring, &obj);
1867         mbuf = obj;
1868         if (unlikely(mbuf == NULL)) {
1869                 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1870                 RTE_LOG(ERR, VHOST_DATA,
1871                         "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1872                         dev->device_fh);
1873                 put_desc_to_used_list_zcp(vq, desc_idx);
1874                 return;
1875         }
1876
1877         if (vm2vm_mode == VM2VM_HARDWARE) {
1878                 /* Avoid using a vlan tag from any vm for an external pkt, such as
1879                  * vlan_tags[dev->device_fh]; otherwise it conflicts with pool
1880                  * selection: the MAC address identifies it as an external pkt
1881                  * that should go to the network, while the vlan tag identifies
1882                  * it as a vm2vm pkt that should be forwarded to another vm. The
1883                  * hardware cannot resolve such an ambiguity, so the pkt is lost.
1884                  */
1885                 vlan_tag = external_pkt_default_vlan_tag;
1886                 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1887                         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1888                         __rte_mbuf_raw_free(mbuf);
1889                         return;
1890                 }
1891         }
1892
1893         mbuf->nb_segs = m->nb_segs;
1894         mbuf->next = m->next;
1895         mbuf->data_len = m->data_len + offset;
1896         mbuf->pkt_len = mbuf->data_len;
1897         if (unlikely(need_copy)) {
1898                 /* Copy the packet contents to the mbuf. */
1899                 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1900                         rte_pktmbuf_mtod(m, void *),
1901                         m->data_len);
1902         } else {
1903                 mbuf->data_off = m->data_off;
1904                 mbuf->buf_physaddr = m->buf_physaddr;
1905                 mbuf->buf_addr = m->buf_addr;
1906         }
1907         mbuf->ol_flags |= PKT_TX_VLAN_PKT;
1908         mbuf->vlan_tci = vlan_tag;
1909         mbuf->l2_len = sizeof(struct ether_hdr);
1910         mbuf->l3_len = sizeof(struct ipv4_hdr);
1911         MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1912
1913         tx_q->m_table[len] = mbuf;
1914         len++;
1915
1916         LOG_DEBUG(VHOST_DATA,
1917                 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1918                 dev->device_fh,
1919                 mbuf->nb_segs,
1920                 (mbuf->next == NULL) ? "null" : "non-null");
1921
1922         if (enable_stats) {
1923                 dev_statistics[dev->device_fh].tx_total++;
1924                 dev_statistics[dev->device_fh].tx++;
1925         }
1926
1927         if (unlikely(len == MAX_PKT_BURST)) {
1928                 m_table = (struct rte_mbuf **)tx_q->m_table;
1929                 ret = rte_eth_tx_burst(ports[0],
1930                         (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1931
1932                 /*
1933                  * Free any buffers not handled by TX and update
1934                  * the port stats.
1935                  */
1936                 if (unlikely(ret < len)) {
1937                         do {
1938                                 rte_pktmbuf_free(m_table[ret]);
1939                         } while (++ret < len);
1940                 }
1941
1942                 len = 0;
1943                 txmbuf_clean_zcp(dev, vpool);
1944         }
1945
1946         tx_q->len = len;
1947
1948         return;
1949 }
1950
1951 /*
1952  * This function transmits all available packets in the virtio TX queue for
1953  * one virtio-net device. If it is the first packet, it learns the MAC
1954  * address and sets up VMDQ.
1955  */
1956 static inline void __attribute__((always_inline))
1957 virtio_dev_tx_zcp(struct virtio_net *dev)
1958 {
1959         struct rte_mbuf m;
1960         struct vhost_virtqueue *vq;
1961         struct vring_desc *desc;
1962         uint64_t buff_addr = 0, phys_addr;
1963         uint32_t head[MAX_PKT_BURST];
1964         uint32_t i;
1965         uint16_t free_entries, packet_success = 0;
1966         uint16_t avail_idx;
1967         uint8_t need_copy = 0;
1968         hpa_type addr_type;
1969         struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1970
1971         vq = dev->virtqueue[VIRTIO_TXQ];
1972         avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
1973
1974         /* If there are no available buffers then return. */
1975         if (vq->last_used_idx_res == avail_idx)
1976                 return;
1977
1978         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1979
1980         /* Prefetch available ring to retrieve head indexes. */
1981         rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1982
1983         /* Get the number of free entries in the ring */
1984         free_entries = (avail_idx - vq->last_used_idx_res);
1985
1986         /* Limit to MAX_PKT_BURST. */
1987         free_entries
1988                 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1989
1990         LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1991                 dev->device_fh, free_entries);
1992
1993         /* Retrieve all of the head indexes first to avoid caching issues. */
1994         for (i = 0; i < free_entries; i++)
1995                 head[i]
1996                         = vq->avail->ring[(vq->last_used_idx_res + i)
1997                         & (vq->size - 1)];
1998
1999         vq->last_used_idx_res += free_entries;
2000
2001         /* Prefetch descriptor index. */
2002         rte_prefetch0(&vq->desc[head[packet_success]]);
2003         rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2004
2005         while (packet_success < free_entries) {
2006                 desc = &vq->desc[head[packet_success]];
2007
2008                 /* Discard first buffer as it is the virtio header */
2009                 desc = &vq->desc[desc->next];
2010
2011                 /* Buffer address translation. */
2012                 buff_addr = gpa_to_vva(dev, desc->addr);
2013                 /* Need to check extra VLAN_HLEN bytes for inserting the VLAN tag. */
2014                 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
2015                         &addr_type);
2016
2017                 if (likely(packet_success < (free_entries - 1)))
2018                         /* Prefetch descriptor index. */
2019                         rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2020
2021                 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2022                         RTE_LOG(ERR, VHOST_DATA,
2023                                 "(%"PRIu64") Invalid frame buffer address found "
2024                                 "when transmitting packets!\n",
2025                                 dev->device_fh);
2026                         packet_success++;
2027                         continue;
2028                 }
2029
2030                 /* Prefetch buffer address. */
2031                 rte_prefetch0((void *)(uintptr_t)buff_addr);
2032
2033                 /*
2034                  * Setup dummy mbuf. This is copied to a real mbuf if
2035                  * transmitted out the physical port.
2036                  */
2037                 m.data_len = desc->len;
2038                 m.nb_segs = 1;
2039                 m.next = NULL;
2040                 m.data_off = 0;
2041                 m.buf_addr = (void *)(uintptr_t)buff_addr;
2042                 m.buf_physaddr = phys_addr;
2043
2044                 /*
2045                  * Check if the frame buffer address from guest crosses
2046                  * sub-region or not.
2047                  */
2048                 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2049                         RTE_LOG(ERR, VHOST_DATA,
2050                                 "(%"PRIu64") Frame buffer address crossing "
2051                                 "sub-region found when attaching TX frame "
2052                                 "buffer address!\n",
2053                                 dev->device_fh);
2054                         need_copy = 1;
2055                 } else
2056                         need_copy = 0;
2057
2058                 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2059
2060                 /*
2061                  * If this is the first received packet we need to learn
2062                  * the MAC and set up VMDQ.
2063                  */
2064                 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
2065                         if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
2066                                 /*
2067                                  * Discard frame if device is scheduled for
2068                                  * removal or a duplicate MAC address is found.
2069                                  */
2070                                 packet_success += free_entries;
2071                                 vq->last_used_idx += packet_success;
2072                                 break;
2073                         }
2074                 }
2075
2076                 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2077                 packet_success++;
2078         }
2079 }
2080
2081 /*
2082  * This function is called by each data core. It handles all RX/TX registered
2083  * with the core. For TX the specific lcore linked list is used. For RX, MAC
2084  * addresses are compared with all devices in the main linked list.
2085  */
2086 static int
2087 switch_worker_zcp(__attribute__((unused)) void *arg)
2088 {
2089         struct virtio_net *dev = NULL;
2090         struct vhost_dev  *vdev = NULL;
2091         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2092         struct virtio_net_data_ll *dev_ll;
2093         struct mbuf_table *tx_q;
2094         volatile struct lcore_ll_info *lcore_ll;
2095         const uint64_t drain_tsc
2096                 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2097                 * BURST_TX_DRAIN_US;
2098         uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2099         unsigned ret;
2100         const uint16_t lcore_id = rte_lcore_id();
2101         uint16_t count_in_ring, rx_count = 0;
2102
2103         RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2104
2105         lcore_ll = lcore_info[lcore_id].lcore_ll;
2106         prev_tsc = 0;
2107
2108         while (1) {
2109                 cur_tsc = rte_rdtsc();
2110
2111                 /* TX burst queue drain */
2112                 diff_tsc = cur_tsc - prev_tsc;
2113                 if (unlikely(diff_tsc > drain_tsc)) {
2114                         /*
2115                          * Get mbuf from vpool.pool and detach mbuf and
2116                          * put back into vpool.ring.
2117                          */
2118                         dev_ll = lcore_ll->ll_root_used;
2119                         while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2120                                 /* Get virtio device ID */
2121                                 vdev = dev_ll->vdev;
2122                                 dev = vdev->dev;
2123
2124                                 if (likely(!vdev->remove)) {
2125                                         tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2126                                         if (tx_q->len) {
2127                                                 LOG_DEBUG(VHOST_DATA,
2128                                                 "TX queue drained after timeout"
2129                                                 " with burst size %u\n",
2130                                                 tx_q->len);
2131
2132                                                 /*
2133                                                  * Tx any packets in the queue
2134                                                  */
2135                                                 ret = rte_eth_tx_burst(
2136                                                         ports[0],
2137                                                         (uint16_t)tx_q->txq_id,
2138                                                         (struct rte_mbuf **)
2139                                                         tx_q->m_table,
2140                                                         (uint16_t)tx_q->len);
2141                                                 if (unlikely(ret < tx_q->len)) {
2142                                                         do {
2143                                                                 rte_pktmbuf_free(
2144                                                                         tx_q->m_table[ret]);
2145                                                         } while (++ret < tx_q->len);
2146                                                 }
2147                                                 tx_q->len = 0;
2148
2149                                                 txmbuf_clean_zcp(dev,
2150                                                         &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2151                                         }
2152                                 }
2153                                 dev_ll = dev_ll->next;
2154                         }
2155                         prev_tsc = cur_tsc;
2156                 }
2157
2158                 rte_prefetch0(lcore_ll->ll_root_used);
2159
2160                 /*
2161                  * Inform the configuration core that we have exited the linked
2162                  * list and that no devices are in use if requested.
2163                  */
2164                 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2165                         lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2166
2167                 /* Process devices */
2168                 dev_ll = lcore_ll->ll_root_used;
2169
2170                 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2171                         vdev = dev_ll->vdev;
2172                         dev  = vdev->dev;
2173                         if (unlikely(vdev->remove)) {
2174                                 dev_ll = dev_ll->next;
2175                                 unlink_vmdq(vdev);
2176                                 vdev->ready = DEVICE_SAFE_REMOVE;
2177                                 continue;
2178                         }
2179
2180                         if (likely(vdev->ready == DEVICE_RX)) {
2181                                 uint32_t index = vdev->vmdq_rx_q;
2182                                 uint16_t i;
2183                                 count_in_ring
2184                                 = rte_ring_count(vpool_array[index].ring);
2185                                 uint16_t free_entries
2186                                 = (uint16_t)get_available_ring_num_zcp(dev);
2187
2188                                 /*
2189                                  * Attach all mbufs in vpool.ring and put back
2190                                  * into vpool.pool.
2191                                  */
2192                                 for (i = 0;
2193                                      i < RTE_MIN(free_entries,
2194                                              RTE_MIN(count_in_ring, MAX_PKT_BURST));
2195                                      i++)
2196                                         attach_rxmbuf_zcp(dev);
2197
2198                                 /* Handle guest RX */
2199                                 rx_count = rte_eth_rx_burst(ports[0],
2200                                         vdev->vmdq_rx_q, pkts_burst,
2201                                         MAX_PKT_BURST);
2202
2203                                 if (rx_count) {
2204                                         ret_count = virtio_dev_rx_zcp(dev,
2205                                                         pkts_burst, rx_count);
2206                                         if (enable_stats) {
2207                                                 dev_statistics[dev->device_fh].rx_total
2208                                                         += rx_count;
2209                                                 dev_statistics[dev->device_fh].rx
2210                                                         += ret_count;
2211                                         }
2212                                         while (likely(rx_count)) {
2213                                                 rx_count--;
2214                                                 pktmbuf_detach_zcp(
2215                                                         pkts_burst[rx_count]);
2216                                                 rte_ring_sp_enqueue(
2217                                                         vpool_array[index].ring,
2218                                                         (void *)pkts_burst[rx_count]);
2219                                         }
2220                                 }
2221                         }
2222
2223                         if (likely(!vdev->remove))
2224                                 /* Handle guest TX */
2225                                 virtio_dev_tx_zcp(dev);
2226
2227                         /* Move to the next device in the list */
2228                         dev_ll = dev_ll->next;
2229                 }
2230         }
2231
2232         return 0;
2233 }
2234
2235
2236 /*
2237  * Add an entry to a used linked list. A free entry must first be found
2238  * in the free linked list using get_data_ll_free_entry();
2239  */
2240 static void
2241 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2242         struct virtio_net_data_ll *ll_dev)
2243 {
2244         struct virtio_net_data_ll *ll = *ll_root_addr;
2245
2246         /* Set next as NULL and use a compiler barrier to avoid reordering. */
2247         ll_dev->next = NULL;
2248         rte_compiler_barrier();
2249
2250         /* If ll == NULL then this is the first device. */
2251         if (ll) {
2252                 /* Increment to the tail of the linked list. */
2253                 while (ll->next != NULL)
2254                         ll = ll->next;
2255
2256                 ll->next = ll_dev;
2257         } else {
2258                 *ll_root_addr = ll_dev;
2259         }
2260 }
2261
2262 /*
2263  * Remove an entry from a used linked list. The entry must then be added to
2264  * the free linked list using put_data_ll_free_entry().
2265  */
2266 static void
2267 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2268         struct virtio_net_data_ll *ll_dev,
2269         struct virtio_net_data_ll *ll_dev_last)
2270 {
2271         struct virtio_net_data_ll *ll = *ll_root_addr;
2272
2273         if (unlikely((ll == NULL) || (ll_dev == NULL)))
2274                 return;
2275
2276         if (ll_dev == ll)
2277                 *ll_root_addr = ll_dev->next;
2278         else
2279                 if (likely(ll_dev_last != NULL))
2280                         ll_dev_last->next = ll_dev->next;
2281                 else
2282                         RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2283 }
2284
2285 /*
2286  * Find and return an entry from the free linked list.
2287  */
2288 static struct virtio_net_data_ll *
2289 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2290 {
2291         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2292         struct virtio_net_data_ll *ll_dev;
2293
2294         if (ll_free == NULL)
2295                 return NULL;
2296
2297         ll_dev = ll_free;
2298         *ll_root_addr = ll_free->next;
2299
2300         return ll_dev;
2301 }
2302
2303 /*
2304  * Place an entry back onto the free linked list.
2305  */
2306 static void
2307 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2308         struct virtio_net_data_ll *ll_dev)
2309 {
2310         struct virtio_net_data_ll *ll_free = *ll_root_addr;
2311
2312         if (ll_dev == NULL)
2313                 return;
2314
2315         ll_dev->next = ll_free;
2316         *ll_root_addr = ll_dev;
2317 }
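/*
 * A hypothetical end-to-end use of the four list helpers above: take an
 * entry from a free list, publish it on a used list, and later return it
 * (destroy_device() below uses the removal half of this pattern):
 *
 *     struct virtio_net_data_ll *e = get_data_ll_free_entry(&ll_root_free);
 *
 *     if (e != NULL) {
 *             e->vdev = vdev;
 *             add_data_ll_entry(&ll_root_used, e);
 *             ...
 *             rm_data_ll_entry(&ll_root_used, e, e_prev);
 *             put_data_ll_free_entry(&ll_root_free, e);
 *     }
 */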
2318
2319 /*
2320  * Creates a linked list of a given size.
2321  */
2322 static struct virtio_net_data_ll *
2323 alloc_data_ll(uint32_t size)
2324 {
2325         struct virtio_net_data_ll *ll_new;
2326         uint32_t i;
2327
2328         /* Malloc and then chain the linked list. */
2329         ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2330         if (ll_new == NULL) {
2331                 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2332                 return NULL;
2333         }
2334
2335         for (i = 0; i < size; i++) {
2336                 ll_new[i].vdev = NULL;
2337                 ll_new[i].next = (i == size - 1) ? NULL : &ll_new[i + 1];
2338         }
2340
2341         return ll_new;
2342 }
2343
2344 /*
2345  * Create the main linked list along with each individual core's linked list. A used and a free list
2346  * are created to manage entries.
2347  */
2348 static int
2349 init_data_ll(void)
2350 {
2351         int lcore;
2352
2353         RTE_LCORE_FOREACH_SLAVE(lcore) {
2354                 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2355                 if (lcore_info[lcore].lcore_ll == NULL) {
2356                         RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2357                         return -1;
2358                 }
2359
2360                 lcore_info[lcore].lcore_ll->device_num = 0;
2361                 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2362                 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2363                 if (num_devices % num_switching_cores)
2364                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2365                 else
2366                         lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2367         }
2368
2369         /* Allocate devices up to a maximum of MAX_DEVICES. */
2370         ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2371
2372         return 0;
2373 }
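/*
 * Worked example for the per-core sizing above, assuming a hypothetical
 * 10 devices over 4 switching cores: 10 % 4 != 0, so each core gets
 * alloc_data_ll(10 / 4 + 1) == alloc_data_ll(3) entries, enough for any
 * distribution of devices the configuration core may choose.
 */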
2374
2375 /*
2376  * Remove a device from the specific data core linked list and from the main linked list. Synchronization
2377  * occurs through the use of the lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
2378  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
2379  */
2380 static void
2381 destroy_device(volatile struct virtio_net *dev)
2382 {
2383         struct virtio_net_data_ll *ll_lcore_dev_cur;
2384         struct virtio_net_data_ll *ll_main_dev_cur;
2385         struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2386         struct virtio_net_data_ll *ll_main_dev_last = NULL;
2387         struct vhost_dev *vdev;
2388         int lcore;
2389
2390         dev->flags &= ~VIRTIO_DEV_RUNNING;
2391
2392         vdev = (struct vhost_dev *)dev->priv;
2393         /* Set the remove flag. */
2394         vdev->remove = 1;
2395         while (vdev->ready != DEVICE_SAFE_REMOVE) {
2396                 rte_pause();
2397         }
2398
2399         /* Search for entry to be removed from lcore ll */
2400         ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2401         while (ll_lcore_dev_cur != NULL) {
2402                 if (ll_lcore_dev_cur->vdev == vdev) {
2403                         break;
2404                 } else {
2405                         ll_lcore_dev_last = ll_lcore_dev_cur;
2406                         ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2407                 }
2408         }
2409
2410         if (ll_lcore_dev_cur == NULL) {
2411                 RTE_LOG(ERR, VHOST_CONFIG,
2412                         "(%"PRIu64") Failed to find the dev to be destroyed.\n",
2413                         dev->device_fh);
2414                 return;
2415         }
2416
2417         /* Search for entry to be removed from main ll */
2418         ll_main_dev_cur = ll_root_used;
2419         ll_main_dev_last = NULL;
2420         while (ll_main_dev_cur != NULL) {
2421                 if (ll_main_dev_cur->vdev == vdev) {
2422                         break;
2423                 } else {
2424                         ll_main_dev_last = ll_main_dev_cur;
2425                         ll_main_dev_cur = ll_main_dev_cur->next;
2426                 }
2427         }
2428
2429         /* Remove entries from the lcore and main ll. */
2430         rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2431         rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2432
2433         /* Set the dev_removal_flag on each lcore. */
2434         RTE_LCORE_FOREACH_SLAVE(lcore) {
2435                 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2436         }
2437
2438         /*
2439          * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2440          * they can no longer access the device removed from the linked lists and that the devices
2441          * are no longer in use.
2442          */
2443         RTE_LCORE_FOREACH_SLAVE(lcore) {
2444                 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2445                         rte_pause();
2446                 }
2447         }
2448
2449         /* Add the entries back to the lcore and main free ll.*/
2450         put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2451         put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2452
2453         /* Decrement number of device on the lcore. */
2454         lcore_info[vdev->coreid].lcore_ll->device_num--;
2455
2456         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2457
2458         if (zero_copy) {
2459                 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2460
2461                 /* Stop the RX queue. */
2462                 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2463                         LOG_DEBUG(VHOST_CONFIG,
2464                                 "(%"PRIu64") In destroy_device: Failed to stop "
2465                                 "rx queue:%d\n",
2466                                 dev->device_fh,
2467                                 vdev->vmdq_rx_q);
2468                 }
2469
2470                 LOG_DEBUG(VHOST_CONFIG,
2471                         "(%"PRIu64") in destroy_device: Start putting mbufs in "
2472                         "mempool back to ring for RX queue: %d\n",
2473                         dev->device_fh, vdev->vmdq_rx_q);
2474
2475                 mbuf_destroy_zcp(vpool);
2476
2477                 /* Stop the TX queue. */
2478                 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2479                         LOG_DEBUG(VHOST_CONFIG,
2480                                 "(%"PRIu64") In destroy_device: Failed to "
2481                                 "stop tx queue:%d\n",
2482                                 dev->device_fh, vdev->vmdq_rx_q);
2483                 }
2484
2485                 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2486
2487                 LOG_DEBUG(VHOST_CONFIG,
2488                         "(%"PRIu64") destroy_device: Start putting mbufs in mempool "
2489                         "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2490                         dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2491                         dev->device_fh);
2492
2493                 mbuf_destroy_zcp(vpool);
2494                 rte_free(vdev->regions_hpa);
2495         }
2496         rte_free(vdev);
2497
2498 }
2499
2500 /*
2501  * Calculate the number of physically contiguous regions within one particular
2502  * region whose vhost virtual address range is contiguous. The particular region
2503  * starts from vva_start, with the size given by the 'size' argument.
2504  */
2505 static uint32_t
2506 check_hpa_regions(uint64_t vva_start, uint64_t size)
2507 {
2508         uint32_t i, nregions = 0, page_size = getpagesize();
2509         uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2510         if (vva_start % page_size) {
2511                 LOG_DEBUG(VHOST_CONFIG,
2512                         "in check_hpa_regions: vva start(%p) mod page_size(%d) "
2513                         "has remainder\n",
2514                         (void *)(uintptr_t)vva_start, page_size);
2515                 return 0;
2516         }
2517         if (size % page_size) {
2518                 LOG_DEBUG(VHOST_CONFIG,
2519                         "in check_hpa_regions: "
2520                         "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2521                         size, page_size);
2522                 return 0;
2523         }
2524         for (i = 0; i < size - page_size; i = i + page_size) {
2525                 cur_phys_addr
2526                         = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2527                 next_phys_addr = rte_mem_virt2phy(
2528                         (void *)(uintptr_t)(vva_start + i + page_size));
2529                 if ((cur_phys_addr + page_size) != next_phys_addr) {
2530                         ++nregions;
2531                         LOG_DEBUG(VHOST_CONFIG,
2532                                 "in check_hpa_regions: hva addr:(%p) is not "
2533                                 "continuous with hva addr:(%p), diff:%d\n",
2534                                 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2535                                 (void *)(uintptr_t)(vva_start + (uint64_t)i
2536                                 + page_size), page_size);
2537                         LOG_DEBUG(VHOST_CONFIG,
2538                                 "in check_hpa_regions: hpa addr:(%p) is not "
2539                                 "continuous with hpa addr:(%p), "
2540                                 "diff:(%"PRIu64")\n",
2541                                 (void *)(uintptr_t)cur_phys_addr,
2542                                 (void *)(uintptr_t)next_phys_addr,
2543                                 (next_phys_addr-cur_phys_addr));
2544                 }
2545         }
2546         return nregions;
2547 }
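
/*
 * A minimal usage sketch (not called by the application): given the guest
 * memory table of a device, compute how many physically contiguous
 * sub-regions a later split would need. It mirrors the sizing loop in
 * new_device() below.
 */
static inline uint32_t
count_hpa_subregions(struct virtio_memory *mem)
{
        uint32_t idx, nregions = mem->nregions;

        /* One base region per entry, plus one per HPA discontinuity. */
        for (idx = 0; idx < mem->nregions; idx++)
                nregions += check_hpa_regions(
                        mem->regions[idx].guest_phys_address
                        + mem->regions[idx].address_offset,
                        mem->regions[idx].memory_size);
        return nregions;
}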
2548
2549 /*
2550  * Divide each region whose vhost virtual address range is contiguous into
2551  * sub-regions such that the physical addresses within each sub-region are
2552  * also contiguous, and fill the offset (to GPA), size and other information
2553  * of each sub-region into regions_hpa.
2554  */
2555 static uint32_t
2556 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2557 {
2558         uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2559         uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2560
2561         if (mem_region_hpa == NULL)
2562                 return 0;
2563
2564         for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2565                 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2566                         virtio_memory->regions[regionidx].address_offset;
2567                 mem_region_hpa[regionidx_hpa].guest_phys_address
2568                         = virtio_memory->regions[regionidx].guest_phys_address;
2569                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2570                         rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2571                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2572                 LOG_DEBUG(VHOST_CONFIG,
2573                         "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2574                         regionidx_hpa,
2575                         (void *)(uintptr_t)
2576                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2577                 LOG_DEBUG(VHOST_CONFIG,
2578                         "in fill_hpa_regions: host  phys addr start[%d]:(%p)\n",
2579                         regionidx_hpa,
2580                         (void *)(uintptr_t)
2581                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2582                 for (i = 0, k = 0;
2583                         i < virtio_memory->regions[regionidx].memory_size -
2584                                 page_size;
2585                         i += page_size) {
2586                         cur_phys_addr = rte_mem_virt2phy(
2587                                         (void *)(uintptr_t)(vva_start + i));
2588                         next_phys_addr = rte_mem_virt2phy(
2589                                         (void *)(uintptr_t)(vva_start +
2590                                         i + page_size));
2591                         if ((cur_phys_addr + page_size) != next_phys_addr) {
2592                                 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2593                                         mem_region_hpa[regionidx_hpa].guest_phys_address +
2594                                         k + page_size;
2595                                 mem_region_hpa[regionidx_hpa].memory_size
2596                                         = k + page_size;
2597                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2598                                         "phys addr end  [%d]:(%p)\n",
2599                                         regionidx_hpa,
2600                                         (void *)(uintptr_t)
2601                                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2602                                 LOG_DEBUG(VHOST_CONFIG,
2603                                         "in fill_hpa_regions: guest phys addr "
2604                                         "size [%d]:(%p)\n",
2605                                         regionidx_hpa,
2606                                         (void *)(uintptr_t)
2607                                         (mem_region_hpa[regionidx_hpa].memory_size));
2608                                 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2609                                         = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2610                                 ++regionidx_hpa;
2611                                 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2612                                         next_phys_addr -
2613                                         mem_region_hpa[regionidx_hpa].guest_phys_address;
2614                                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2615                                         " phys addr start[%d]:(%p)\n",
2616                                         regionidx_hpa,
2617                                         (void *)(uintptr_t)
2618                                         (mem_region_hpa[regionidx_hpa].guest_phys_address));
2619                                 LOG_DEBUG(VHOST_CONFIG,
2620                                         "in fill_hpa_regions: host  phys addr "
2621                                         "start[%d]:(%p)\n",
2622                                         regionidx_hpa,
2623                                         (void *)(uintptr_t)
2624                                         (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2625                                 k = 0;
2626                         } else {
2627                                 k += page_size;
2628                         }
2629                 }
2630                 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2631                         = mem_region_hpa[regionidx_hpa].guest_phys_address
2632                         + k + page_size;
2633                 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2634                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end  "
2635                         "[%d]:(%p)\n", regionidx_hpa,
2636                         (void *)(uintptr_t)
2637                         (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2638                 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2639                         "[%d]:(%p)\n", regionidx_hpa,
2640                         (void *)(uintptr_t)
2641                         (mem_region_hpa[regionidx_hpa].memory_size));
2642                 ++regionidx_hpa;
2643         }
2644         return regionidx_hpa;
2645 }
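
/*
 * Worked example, assuming a 4 KB page size and illustrative addresses:
 * a single GPA-contiguous region of three pages whose host physical
 * mapping breaks between the second and third page is split into two
 * sub-regions:
 *
 *   sub-region 0: guest_phys_address .. +8 KB  (memory_size = 8 KB)
 *   sub-region 1: +8 KB .. +12 KB              (memory_size = 4 KB)
 *
 * check_hpa_regions() reports one discontinuity for the same span, which
 * is why new_device() sizes regions_hpa as dev->mem->nregions plus the
 * sum of those counts.
 */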
2646
2647 /*
2648  * A new device is added to a data core. First the device is added to the main
2649  * linked list and then allocated to a specific data core.
2650  */
2651 static int
2652 new_device(struct virtio_net *dev)
2653 {
2654         struct virtio_net_data_ll *ll_dev;
2655         int lcore, core_add = 0;
2656         uint32_t device_num_min = num_devices;
2657         struct vhost_dev *vdev;
2658         uint32_t regionidx;
2659
2660         vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
2661         if (vdev == NULL) {
2662                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2663                         dev->device_fh);
2664                 return -1;
2665         }
2666         vdev->dev = dev;
2667         dev->priv = vdev;
2668
2669         if (zero_copy) {
2670                 vdev->nregions_hpa = dev->mem->nregions;
2671                 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2672                         vdev->nregions_hpa
2673                                 += check_hpa_regions(
2674                                         dev->mem->regions[regionidx].guest_phys_address
2675                                         + dev->mem->regions[regionidx].address_offset,
2676                                         dev->mem->regions[regionidx].memory_size);
2677
2678                 }
2679
2680                 vdev->regions_hpa = rte_calloc("vhost hpa region",
2681                                                vdev->nregions_hpa,
2682                                                sizeof(struct virtio_memory_regions_hpa),
2683                                                RTE_CACHE_LINE_SIZE);
2684                 if (vdev->regions_hpa == NULL) {
2685                         RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2686                         rte_free(vdev);
2687                         return -1;
2688                 }
2689
2691                 if (fill_hpa_memory_regions(vdev->regions_hpa, dev->mem)
2692                         != vdev->nregions_hpa) {
2695                         RTE_LOG(ERR, VHOST_CONFIG,
2696                                 "hpa memory regions number mismatch: "
2697                                 "[%d]\n", vdev->nregions_hpa);
2698                         rte_free(vdev->regions_hpa);
2699                         rte_free(vdev);
2700                         return -1;
2701                 }
2702         }
2703
2705         /* Add device to main ll */
2706         ll_dev = get_data_ll_free_entry(&ll_root_free);
2707         if (ll_dev == NULL) {
2708                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2709                         "of %d devices per core has been reached\n",
2710                         dev->device_fh, num_devices);
2711                 if (vdev->regions_hpa)
2712                         rte_free(vdev->regions_hpa);
2713                 rte_free(vdev);
2714                 return -1;
2715         }
2716         ll_dev->vdev = vdev;
2717         add_data_ll_entry(&ll_root_used, ll_dev);
2718         vdev->vmdq_rx_q
2719                 = dev->device_fh * queues_per_pool + vmdq_queue_base;
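        /*
         * One VMDq RX queue per device; e.g. with queues_per_pool = 4 and
         * vmdq_queue_base = 8 (illustrative values), device_fh 2 maps to
         * queue 2 * 4 + 8 = 16.
         */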
2720
2721         if (zero_copy) {
2722                 uint32_t index = vdev->vmdq_rx_q;
2723                 uint32_t count_in_ring, i;
2724                 struct mbuf_table *tx_q;
2725
2726                 count_in_ring = rte_ring_count(vpool_array[index].ring);
2727
2728                 LOG_DEBUG(VHOST_CONFIG,
2729                         "(%"PRIu64") in new_device: mbuf count in mempool "
2730                         "before attach is: %d\n",
2731                         dev->device_fh,
2732                         rte_mempool_count(vpool_array[index].pool));
2733                 LOG_DEBUG(VHOST_CONFIG,
2734                         "(%"PRIu64") in new_device: mbuf count in ring "
2735                         "before attach is: %d\n",
2736                         dev->device_fh, count_in_ring);
2737
2738                 /*
2739                  * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2740                  */
2741                 for (i = 0; i < count_in_ring; i++)
2742                         attach_rxmbuf_zcp(dev);
2743
2744                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2745                         "mempool after attach is: %d\n",
2746                         dev->device_fh,
2747                         rte_mempool_count(vpool_array[index].pool));
2748                 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2749                         "ring after attach is: %d\n",
2750                         dev->device_fh,
2751                         rte_ring_count(vpool_array[index].ring));
2752
2753                 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2754                 tx_q->txq_id = vdev->vmdq_rx_q;
2755
2756                 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2757                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2758
2759                         LOG_DEBUG(VHOST_CONFIG,
2760                                 "(%"PRIu64") In new_device: Failed to start "
2761                                 "tx queue:%d\n",
2762                                 dev->device_fh, vdev->vmdq_rx_q);
2763
2764                         mbuf_destroy_zcp(vpool);
2765                         rte_free(vdev->regions_hpa);
2766                         rte_free(vdev);
2767                         return -1;
2768                 }
2769
2770                 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2771                         struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2772
2773                         LOG_DEBUG(VHOST_CONFIG,
2774                                 "(%"PRIu64") In new_device: Failed to start "
2775                                 "rx queue:%d\n",
2776                                 dev->device_fh, vdev->vmdq_rx_q);
2777
2778                         /* Stop the TX queue. */
2779                         if (rte_eth_dev_tx_queue_stop(ports[0],
2780                                 vdev->vmdq_rx_q) != 0) {
2781                                 LOG_DEBUG(VHOST_CONFIG,
2782                                         "(%"PRIu64") In new_device: Failed to "
2783                                         "stop tx queue:%d\n",
2784                                         dev->device_fh, vdev->vmdq_rx_q);
2785                         }
2786
2787                         mbuf_destroy_zcp(vpool);
2788                         rte_free(vdev->regions_hpa);
2789                         rte_free(vdev);
2790                         return -1;
2791                 }
2792
2793         }
2794
2795         /*reset ready flag*/
2796         vdev->ready = DEVICE_MAC_LEARNING;
2797         vdev->remove = 0;
2798
2799         /* Find a suitable lcore to add the device. */
2800         RTE_LCORE_FOREACH_SLAVE(lcore) {
2801                 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2802                         device_num_min = lcore_info[lcore].lcore_ll->device_num;
2803                         core_add = lcore;
2804                 }
2805         }
2806         /* Add device to lcore ll */
2807         ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2808         if (ll_dev == NULL) {
2809                 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2810                 vdev->ready = DEVICE_SAFE_REMOVE;
2811                 destroy_device(dev);
2812                 /* destroy_device() frees vdev (and regions_hpa); freeing them
2813                  * again here would be a double free. */
2814                 return -1;
2815         }
2816         ll_dev->vdev = vdev;
2817         vdev->coreid = core_add;
2818
2819         add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2820
2821         /* Initialize device stats */
2822         memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2823
2824         /* Disable notifications. */
2825         rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2826         rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2827         lcore_info[vdev->coreid].lcore_ll->device_num++;
2828         dev->flags |= VIRTIO_DEV_RUNNING;
2829
2830         RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2831
2832         return 0;
2833 }
2834
2835 /*
2836  * These callbacks allow devices to be added to the data core when configuration
2837  * has fully completed.
2838  */
2839 static const struct virtio_net_device_ops virtio_net_device_ops =
2840 {
2841         .new_device = new_device,
2842         .destroy_device = destroy_device,
2843 };
2844
2845 /*
2846  * This thread wakes up every 'enable_stats' seconds to print statistics,
2847  * if the user has enabled them.
2848  */
2849 static void
2850 print_stats(void)
2851 {
2852         struct virtio_net_data_ll *dev_ll;
2853         uint64_t tx_dropped, rx_dropped;
2854         uint64_t tx, tx_total, rx, rx_total;
2855         uint32_t device_fh;
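        /* VT100 escapes: ESC[2J clears the screen, ESC[1;1H homes the cursor. */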
2856         const char clr[] = { 27, '[', '2', 'J', '\0' };
2857         const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2858
2859         while(1) {
2860                 sleep(enable_stats);
2861
2862                 /* Clear screen and move to top left */
2863                 printf("%s%s", clr, top_left);
2864
2865                 printf("\nDevice statistics ====================================");
2866
2867                 dev_ll = ll_root_used;
2868                 while (dev_ll != NULL) {
2869                         device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2870                         tx_total = dev_statistics[device_fh].tx_total;
2871                         tx = dev_statistics[device_fh].tx;
2872                         tx_dropped = tx_total - tx;
2873                         if (zero_copy == 0) {
2874                                 rx_total = rte_atomic64_read(
2875                                         &dev_statistics[device_fh].rx_total_atomic);
2876                                 rx = rte_atomic64_read(
2877                                         &dev_statistics[device_fh].rx_atomic);
2878                         } else {
2879                                 rx_total = dev_statistics[device_fh].rx_total;
2880                                 rx = dev_statistics[device_fh].rx;
2881                         }
2882                         rx_dropped = rx_total - rx;
2883
2884                         printf("\nStatistics for device %"PRIu32" ------------------------------"
2885                                         "\nTX total:            %"PRIu64""
2886                                         "\nTX dropped:          %"PRIu64""
2887                                         "\nTX successful:               %"PRIu64""
2888                                         "\nRX total:            %"PRIu64""
2889                                         "\nRX dropped:          %"PRIu64""
2890                                         "\nRX successful:               %"PRIu64"",
2891                                         device_fh,
2892                                         tx_total,
2893                                         tx_dropped,
2894                                         tx,
2895                                         rx_total,
2896                                         rx_dropped,
2897                                         rx);
2898
2899                         dev_ll = dev_ll->next;
2900                 }
2901                 printf("\n======================================================\n");
2902         }
2903 }
2904
2905 static void
2906 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2907         char *ring_name, uint32_t nb_mbuf)
2908 {
2909         vpool_array[index].pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf,
2910                 MBUF_CACHE_SIZE_ZCP, 0, MBUF_DATA_SIZE_ZCP, socket);
2911         if (vpool_array[index].pool != NULL) {
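                /*
                 * An rte_ring of size N holds at most N - 1 entries, hence
                 * nb_mbuf + 1 rounded up to the next power of two.
                 */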
2912                 vpool_array[index].ring
2913                         = rte_ring_create(ring_name,
2914                                 rte_align32pow2(nb_mbuf + 1),
2915                                 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2916                 if (likely(vpool_array[index].ring != NULL)) {
2917                         LOG_DEBUG(VHOST_CONFIG,
2918                                 "in setup_mempool_tbl: mbuf count in "
2919                                 "mempool is: %d\n",
2920                                 rte_mempool_count(vpool_array[index].pool));
2921                         LOG_DEBUG(VHOST_CONFIG,
2922                                 "in setup_mempool_tbl: mbuf count in "
2923                                 "ring is: %d\n",
2924                                 rte_ring_count(vpool_array[index].ring));
2925                 } else {
2926                         rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2927                                 ring_name);
2928                 }
2929
2930                 /* Need to take mbuf headroom into account. */
2931                 vpool_array[index].buf_size = VIRTIO_DESCRIPTOR_LEN_ZCP;
2932         } else {
2933                 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2934         }
2935 }
2936
2937 /* When we receive an INT signal, unregister the vhost driver */
2938 static void
2939 sigint_handler(__rte_unused int signum)
2940 {
2941         /* Unregister vhost driver. */
2942         int ret = rte_vhost_driver_unregister((char *)&dev_basename);
2943         if (ret != 0)
2944                 rte_exit(EXIT_FAILURE, "vhost driver unregister failure.\n");
2945         exit(0);
2946 }
2947
2948 /*
2949  * Main function: performs initialisation and calls the per-lcore functions. The CUSE
2950  * device is also registered here to handle the IOCTLs.
2951  */
2952 int
2953 main(int argc, char *argv[])
2954 {
2955         struct rte_mempool *mbuf_pool = NULL;
2956         unsigned lcore_id, core_id = 0;
2957         unsigned nb_ports, valid_num_ports;
2958         int ret;
2959         uint8_t portid;
2960         uint16_t queue_id;
2961         static pthread_t tid;
2962         char thread_name[RTE_MAX_THREAD_NAME_LEN];
2963
2964         signal(SIGINT, sigint_handler);
2965
2966         /* init EAL */
2967         ret = rte_eal_init(argc, argv);
2968         if (ret < 0)
2969                 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2970         argc -= ret;
2971         argv += ret;
2972
2973         /* parse app arguments */
2974         ret = us_vhost_parse_args(argc, argv);
2975         if (ret < 0)
2976                 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2977
2978         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2979                 if (rte_lcore_is_enabled(lcore_id))
2980                         lcore_ids[core_id++] = lcore_id;
2981
2982         if (rte_lcore_count() > RTE_MAX_LCORE)
2983                 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2984
2985         /* set the number of switching cores available */
2986         num_switching_cores = rte_lcore_count() - 1;
2987
2988         /* Get the number of physical ports. */
2989         nb_ports = rte_eth_dev_count();
2990         if (nb_ports > RTE_MAX_ETHPORTS)
2991                 nb_ports = RTE_MAX_ETHPORTS;
2992
2993         /*
2994          * Update the global variable num_ports and the global array ports, and
2995          * get the value of valid_num_ports according to the number of system ports.
2996          */
2997         valid_num_ports = check_ports_num(nb_ports);
2998
2999         if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
3000                 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
3001                         "but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
3002                 return -1;
3003         }
3004
3005         if (zero_copy == 0) {
3006                 /* Create the mbuf pool. */
3007                 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
3008                         NUM_MBUFS_PER_PORT * valid_num_ports, MBUF_CACHE_SIZE,
3009                         0, MBUF_DATA_SIZE, rte_socket_id());
3010                 if (mbuf_pool == NULL)
3011                         rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3012
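                /* In copy mode, all RX/TX vpool slots share the one mbuf pool. */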
3013                 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3014                         vpool_array[queue_id].pool = mbuf_pool;
3015
3016                 if (vm2vm_mode == VM2VM_HARDWARE) {
3017                         /* Enable VT loop back to let L2 switch to do it. */
3018                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3019                         LOG_DEBUG(VHOST_CONFIG,
3020                                 "Enable loop back for L2 switch in vmdq.\n");
3021                 }
3022         } else {
3023                 uint32_t nb_mbuf;
3024                 char pool_name[RTE_MEMPOOL_NAMESIZE];
3025                 char ring_name[RTE_MEMPOOL_NAMESIZE];
3026
3027                 nb_mbuf = num_rx_descriptor
3028                         + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3029                         + num_switching_cores * MAX_PKT_BURST;
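                /*
                 * Per RX queue: the descriptor ring plus a cache and one
                 * in-flight burst per switching core; e.g. 128 descriptors,
                 * 3 switching cores and a 32-mbuf burst (illustrative
                 * values) give 128 + 3 * cache + 96 mbufs.
                 */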
3030
3031                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3032                         snprintf(pool_name, sizeof(pool_name),
3033                                 "rxmbuf_pool_%u", queue_id);
3034                         snprintf(ring_name, sizeof(ring_name),
3035                                 "rxmbuf_ring_%u", queue_id);
3036                         setup_mempool_tbl(rte_socket_id(), queue_id,
3037                                 pool_name, ring_name, nb_mbuf);
3038                 }
3039
3040                 nb_mbuf = num_tx_descriptor
3041                                 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3042                                 + num_switching_cores * MAX_PKT_BURST;
3043
3044                 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3045                         snprintf(pool_name, sizeof(pool_name),
3046                                 "txmbuf_pool_%u", queue_id);
3047                         snprintf(ring_name, sizeof(ring_name),
3048                                 "txmbuf_ring_%u", queue_id);
3049                         setup_mempool_tbl(rte_socket_id(),
3050                                 (queue_id + MAX_QUEUES),
3051                                 pool_name, ring_name, nb_mbuf);
3052                 }
3053
3054                 if (vm2vm_mode == VM2VM_HARDWARE) {
3055                         /* Enable VT loop back to let L2 switch to do it. */
3056                         vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3057                         LOG_DEBUG(VHOST_CONFIG,
3058                                 "Enable loop back for L2 switch in vmdq.\n");
3059                 }
3060         }
3061         /* Set log level. */
3062         rte_set_log_level(LOG_LEVEL);
3063
3064         /* initialize all ports */
3065         for (portid = 0; portid < nb_ports; portid++) {
3066                 /* skip ports that are not enabled */
3067                 if ((enabled_port_mask & (1 << portid)) == 0) {
3068                         RTE_LOG(INFO, VHOST_PORT,
3069                                 "Skipping disabled port %d\n", portid);
3070                         continue;
3071                 }
3072                 if (port_init(portid) != 0)
3073                         rte_exit(EXIT_FAILURE,
3074                                 "Cannot initialize network ports\n");
3075         }
3076
3077         /* Initialise all linked lists. */
3078         if (init_data_ll() == -1)
3079                 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3080
3081         /* Initialize device stats */
3082         memset(&dev_statistics, 0, sizeof(dev_statistics));
3083
3084         /* Enable stats if the user option is set. */
3085         if (enable_stats) {
3086                 ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
3087                 if (ret != 0)
3088                         rte_exit(EXIT_FAILURE,
3089                                 "Cannot create print-stats thread\n");
3090
3091                 /* Set thread_name for aid in debugging.  */
3092                 snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
3093                 ret = rte_thread_setname(tid, thread_name);
3094                 if (ret != 0)
3095                         RTE_LOG(ERR, VHOST_CONFIG,
3096                                 "Cannot set print-stats name\n");
3097         }
3098
3099         /* Launch all data cores. */
3100         if (zero_copy == 0) {
3101                 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3102                         rte_eal_remote_launch(switch_worker,
3103                                 mbuf_pool, lcore_id);
3104                 }
3105         } else {
3106                 uint32_t count_in_mempool, index, i;
3107                 for (index = 0; index < 2*MAX_QUEUES; index++) {
3108                         /* For all RX and TX queues. */
3109                         count_in_mempool
3110                                 = rte_mempool_count(vpool_array[index].pool);
3111
3112                         /*
3113                          * Transfer all unattached mbufs from vpool.pool
3114                          * to vpool.ring.
3115                          */
3116                         for (i = 0; i < count_in_mempool; i++) {
3117                                 struct rte_mbuf *mbuf
3118                                         = __rte_mbuf_raw_alloc(
3119                                                 vpool_array[index].pool);
3120                                 rte_ring_sp_enqueue(vpool_array[index].ring,
3121                                                 (void *)mbuf);
3122                         }
3123
3124                         LOG_DEBUG(VHOST_CONFIG,
3125                                 "in main: mbuf count in mempool at initial "
3126                                 "is: %d\n", count_in_mempool);
3127                         LOG_DEBUG(VHOST_CONFIG,
3128                         "in main: mbuf count in ring at initial is: %d\n",
3130                                 rte_ring_count(vpool_array[index].ring));
3131                 }
3132
3133                 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3134                         rte_eal_remote_launch(switch_worker_zcp, NULL,
3135                                 lcore_id);
3136         }
3137
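        /*
         * Drop the mergeable-RX-buffers feature bit (VIRTIO_NET_F_MRG_RXBUF,
         * bit 15 in linux/virtio_net.h) from the advertised feature set.
         */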
3138         if (mergeable == 0)
3139                 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3140
3141         /* Register vhost(cuse or user) driver to handle vhost messages. */
3142         ret = rte_vhost_driver_register((char *)&dev_basename);
3143         if (ret != 0)
3144                 rte_exit(EXIT_FAILURE, "vhost driver register failure.\n");
3145
3146         rte_vhost_driver_callback_register(&virtio_net_device_ops);
3147
3148         /* Start CUSE session. */
3149         rte_vhost_driver_session_start();
3150         return 0;
3151 }