/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include <rte_tcp.h>

#include "main.h"
#include "rte_vhost.h"
#include "vxlan.h"
#include "vxlan_setup.h"

#define IPV4_HEADER_LEN 20
#define UDP_HEADER_LEN 8
#define VXLAN_HEADER_LEN 8

#define IP_VERSION 0x40
#define IP_HDRLEN  0x05 /* default IP header length == five 32-bit words. */
#define IP_DEFTTL  64   /* from RFC 1340. */
#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)

#define IP_DN_FRAGMENT_FLAG 0x0040
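/*
 * Note: 0x0040 is the IPv4 "Don't Fragment" bit (0x4000 in the
 * fragment-offset field) already byte-swapped for a little-endian
 * host, so it can be written into the header without a conversion.
 */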

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
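/*
 * Illustrative sketch (not used below): two 48-bit MACs can be compared
 * with a single 64-bit operation by masking off the two trailing bytes,
 * assuming both addresses may be read as unaligned little-endian words:
 *
 *	uint64_t m1 = *(const unaligned_uint64_t *)ea1;
 *	uint64_t m2 = *(const unaligned_uint64_t *)ea2;
 *	int equal = ((m1 ^ m2) & MAC_ADDR_CMP) == 0;
 */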

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/* Default inner VLAN ID */
#define INNER_VLAN_ID 100

/* VXLAN device */
struct vxlan_conf vxdev;

struct ipv4_hdr app_ip_hdr[VXLAN_N_PORTS];
struct ether_hdr app_l2_hdr[VXLAN_N_PORTS];

/* Local VTEP IP address */
uint8_t vxlan_multicast_ips[2][4] = { {239, 1, 1, 1 }, {239, 1, 2, 1 } };

/* Remote VTEP IP address */
uint8_t vxlan_overlay_ips[2][4] = { {192, 168, 10, 1}, {192, 168, 30, 1} };

/* Remote VTEP MAC address */
uint8_t peer_mac[6] = {0x00, 0x11, 0x01, 0x00, 0x00, 0x01};

/* VXLAN RX filter type */
uint8_t tep_filter_type[] = {RTE_TUNNEL_FILTER_IMAC_TENID,
			RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID,
			RTE_TUNNEL_FILTER_OMAC_TENID_IMAC,};
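
/*
 * In these filter names IMAC is the inner MAC, OMAC the outer MAC,
 * IVLAN the inner VLAN ID and TENID the tenant ID (VNI); filter_idx
 * selects which combination of fields steers a tunnelled packet to
 * its RX queue.
 */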

/* Options for configuring ethernet port */
static const struct rte_eth_conf port_conf = {
	.rxmode = {
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
	},
	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
};

/**
 * One or two devices that belong to the same tenant ID can be
 * assigned to a VM.
 */
const uint16_t tenant_id_conf[] = {
	1000, 1000, 1001, 1001, 1002, 1002, 1003, 1003,
	1004, 1004, 1005, 1005, 1006, 1006, 1007, 1007,
	1008, 1008, 1009, 1009, 1010, 1010, 1011, 1011,
	1012, 1012, 1013, 1013, 1014, 1014, 1015, 1015,
	1016, 1016, 1017, 1017, 1018, 1018, 1019, 1019,
	1020, 1020, 1021, 1021, 1022, 1022, 1023, 1023,
	1024, 1024, 1025, 1025, 1026, 1026, 1027, 1027,
	1028, 1028, 1029, 1029, 1030, 1030, 1031, 1031,
};
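
/*
 * The table is indexed by the device RX queue (vdev->rx_q), so with two
 * queues per tenant, e.g. rx_q 2 and rx_q 3 both map to VNI 1001.
 */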

/**
 * Initialises a given port using global settings and with the RX buffers
 * coming from the mbuf_pool passed as parameter.
 */
int
vxlan_port_init(uint16_t port, struct rte_mempool *mbuf_pool)
{
	int retval;
	uint16_t q;
	struct rte_eth_dev_info dev_info;
	uint16_t rx_rings, tx_rings = (uint16_t)rte_lcore_count();
	uint16_t rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	uint16_t tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
	struct rte_eth_udp_tunnel tunnel_udp;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	struct vxlan_conf *pconf = &vxdev;

	pconf->dst_port = udp_port;

	rte_eth_dev_info_get(port, &dev_info);

	if (dev_info.max_rx_queues > MAX_QUEUES) {
		rte_exit(EXIT_FAILURE,
			"please define MAX_QUEUES no less than %u in %s\n",
			dev_info.max_rx_queues, __FILE__);
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	txconf->txq_flags = 0;

	if (port >= rte_eth_dev_count())
		return -1;
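
	/* One RX queue per vhost device: the tunnel filters installed in
	 * vxlan_link() steer each inner MAC/VNI to its own queue. */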
	rx_rings = nb_devices;

	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
			&tx_ring_size);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0)
		return retval;

	/* Configure UDP port for UDP tunneling */
	tunnel_udp.udp_port = udp_port;
	tunnel_udp.prot_type = RTE_TUNNEL_TYPE_VXLAN;
	retval = rte_eth_dev_udp_tunnel_port_add(port, &tunnel_udp);
	if (retval < 0)
		return retval;
	rte_eth_macaddr_get(port, &ports_eth_addr[port]);
	RTE_LOG(INFO, PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			port,
			ports_eth_addr[port].addr_bytes[0],
			ports_eth_addr[port].addr_bytes[1],
			ports_eth_addr[port].addr_bytes[2],
			ports_eth_addr[port].addr_bytes[3],
			ports_eth_addr[port].addr_bytes[4],
			ports_eth_addr[port].addr_bytes[5]);

	if (tso_segsz != 0) {
		struct rte_eth_dev_info dev_info;

		rte_eth_dev_info_get(port, &dev_info);
		if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) == 0)
			RTE_LOG(WARNING, PORT,
				"hardware TSO offload is not supported\n");
	}

	return 0;
}

static int
vxlan_rx_process(struct rte_mbuf *pkt)
{
	int ret = 0;

	if (rx_decap)
		ret = decapsulation(pkt);

	return ret;
}

static void
vxlan_tx_process(uint8_t queue_id, struct rte_mbuf *pkt)
{
	if (tx_encap)
		encapsulation(pkt, queue_id);
	else
		pkt->vlan_tci = 0;
}

/*
 * This function learns the MAC address of the device and sets up the
 * initial L2 and L3 header info.
 */
int
vxlan_link(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	int i, ret;
	struct ether_hdr *pkt_hdr;
	uint64_t portid = vdev->vid;
	struct ipv4_hdr *ip;

	struct rte_eth_tunnel_filter_conf tunnel_filter_conf;

	if (unlikely(portid >= VXLAN_N_PORTS)) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) WARNING: Not configuring device, "
			"as we already have %d ports for VXLAN.\n",
			vdev->vid, VXLAN_N_PORTS);
		return -1;
	}

	/* Learn the MAC address of the guest device from the packet. */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (is_same_ether_addr(&(pkt_hdr->s_addr), &vdev->mac_address)) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) WARNING: This device is using an existing"
			" MAC address and has not been registered.\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++) {
		vdev->mac_address.addr_bytes[i] =
			vxdev.port[portid].vport_mac.addr_bytes[i] =
				pkt_hdr->s_addr.addr_bytes[i];
		vxdev.port[portid].peer_mac.addr_bytes[i] = peer_mac[i];
	}

	memset(&tunnel_filter_conf, 0,
		sizeof(struct rte_eth_tunnel_filter_conf));

	ether_addr_copy(&ports_eth_addr[0], &tunnel_filter_conf.outer_mac);
	tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];

	/* inner MAC */
	ether_addr_copy(&vdev->mac_address, &tunnel_filter_conf.inner_mac);

	tunnel_filter_conf.queue_id = vdev->rx_q;
	tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];

	if (tep_filter_type[filter_idx] == RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
		tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;

	tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;

	ret = rte_eth_dev_filter_ctrl(ports[0],
			RTE_ETH_FILTER_TUNNEL,
			RTE_ETH_FILTER_ADD,
			&tunnel_filter_conf);
	if (ret) {
		RTE_LOG(ERR, VHOST_DATA,
			"%d Failed to add device MAC address to cloud filter\n",
			vdev->rx_q);
		return -1;
	}

	/* Print out inner MAC and VNI info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VNI %d registered\n",
		vdev->rx_q,
		vdev->mac_address.addr_bytes[0],
		vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2],
		vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4],
		vdev->mac_address.addr_bytes[5],
		tenant_id_conf[vdev->rx_q]);

	vxdev.port[portid].vport_id = portid;

	for (i = 0; i < 4; i++) {
		/* Local VTEP IP */
		vxdev.port_ip |= vxlan_multicast_ips[portid][i] << (8 * i);
		/* Remote VTEP IP */
		vxdev.port[portid].peer_ip |=
			vxlan_overlay_ips[portid][i] << (8 * i);
	}

	vxdev.out_key = tenant_id_conf[vdev->rx_q];
	ether_addr_copy(&vxdev.port[portid].peer_mac,
			&app_l2_hdr[portid].d_addr);
	ether_addr_copy(&ports_eth_addr[0],
			&app_l2_hdr[portid].s_addr);
	app_l2_hdr[portid].ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);

	ip = &app_ip_hdr[portid];
	ip->version_ihl = IP_VHL_DEF;
	ip->type_of_service = 0;
	ip->total_length = 0;
	ip->packet_id = 0;
	ip->fragment_offset = IP_DN_FRAGMENT_FLAG;
	ip->time_to_live = IP_DEFTTL;
	ip->next_proto_id = IPPROTO_UDP;
	ip->hdr_checksum = 0;
	ip->src_addr = vxdev.port_ip;
	ip->dst_addr = vxdev.port[portid].peer_ip;
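
	/* total_length and hdr_checksum stay zero in this template; the
	 * per-packet encapsulation path is expected to fill in the final
	 * length and compute (or offload) the checksum. */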

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/**
 * Removes the cloud filter. Ensures that nothing is adding buffers to the
 * RX queue before disabling RX on the device.
 */
void
vxlan_unlink(struct vhost_dev *vdev)
{
	unsigned int i = 0, rx_count;
	int ret;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct rte_eth_tunnel_filter_conf tunnel_filter_conf;

	if (vdev->ready == DEVICE_RX) {
		memset(&tunnel_filter_conf, 0,
			sizeof(struct rte_eth_tunnel_filter_conf));

		ether_addr_copy(&ports_eth_addr[0],
			&tunnel_filter_conf.outer_mac);
		ether_addr_copy(&vdev->mac_address,
			&tunnel_filter_conf.inner_mac);
		tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];
		tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];

		if (tep_filter_type[filter_idx] ==
			RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
			tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;

		tunnel_filter_conf.queue_id = vdev->rx_q;
		tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;

		ret = rte_eth_dev_filter_ctrl(ports[0],
				RTE_ETH_FILTER_TUNNEL,
				RTE_ETH_FILTER_DELETE,
				&tunnel_filter_conf);
		if (ret) {
			RTE_LOG(ERR, VHOST_DATA,
				"%d Failed to remove device MAC address from cloud filter\n",
				vdev->rx_q);
			return;
		}

		for (i = 0; i < ETHER_ADDR_LEN; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		/* Clear out the receive buffers. */
		rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->rx_q,
				pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->rx_q,
					pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

/* Transmit packets after encapsulating */
int
vxlan_tx_pkts(uint16_t port_id, uint16_t queue_id,
		struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	int ret = 0;
	uint16_t i;

	for (i = 0; i < nb_pkts; i++)
		vxlan_tx_process(queue_id, tx_pkts[i]);

	ret = rte_eth_tx_burst(port_id, queue_id, tx_pkts, nb_pkts);

	return ret;
}
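
/*
 * Note: rte_eth_tx_burst() may accept fewer than nb_pkts packets; any
 * unsent mbufs remain owned by the caller, which is expected to free
 * or retry them.
 */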

/* Check for decapsulation and pass packets directly to the VIRTIO device */
int
vxlan_rx_pkts(int vid, struct rte_mbuf **pkts_burst, uint32_t rx_count)
{
	uint32_t i = 0;
	uint32_t count = 0;
	int ret;
	struct rte_mbuf *pkts_valid[rx_count];

	for (i = 0; i < rx_count; i++) {
		if (enable_stats) {
			rte_atomic64_add(
				&dev_statistics[vid].rx_bad_ip_csum,
				(pkts_burst[i]->ol_flags & PKT_RX_IP_CKSUM_BAD)
				!= 0);
			rte_atomic64_add(
				&dev_statistics[vid].rx_bad_l4_csum,
				(pkts_burst[i]->ol_flags & PKT_RX_L4_CKSUM_BAD)
				!= 0);
		}

		ret = vxlan_rx_process(pkts_burst[i]);
		if (unlikely(ret < 0))
			continue;

		pkts_valid[count] = pkts_burst[i];
		count++;
	}

	ret = rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts_valid, count);

	return ret;
}