i40evf: fix MAC deletion when stopping
[dpdk.git] / examples / tep_termination / vxlan_setup.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <getopt.h>
35 #include <linux/if_ether.h>
36 #include <linux/if_vlan.h>
37 #include <linux/virtio_net.h>
38 #include <linux/virtio_ring.h>
39 #include <sys/param.h>
40 #include <unistd.h>
41
42 #include <rte_ethdev.h>
43 #include <rte_log.h>
44 #include <rte_string_fns.h>
45 #include <rte_mbuf.h>
46 #include <rte_malloc.h>
47 #include <rte_ip.h>
48 #include <rte_udp.h>
49 #include <rte_tcp.h>
50
51 #include "main.h"
52 #include "rte_virtio_net.h"
53 #include "vxlan.h"
54 #include "vxlan_setup.h"
55
/* Header lengths, in bytes. */
#define IPV4_HEADER_LEN  20
#define UDP_HEADER_LEN   8
#define VXLAN_HEADER_LEN 8

/* Pieces of the IPv4 version/IHL byte and other IPv4 header defaults. */
#define IP_VERSION 0x40 /* version 4, kept in the upper nibble */
#define IP_HDRLEN  0x05 /* default IP header length == five 32-bit words */
#define IP_DEFTTL  64   /* default time to live, from RFC 1340 */
#define IP_VHL_DEF (IP_VERSION | IP_HDRLEN)

/* Value written to the fragment_offset field of the template IPv4 header. */
#define IP_DN_FRAGMENT_FLAG 0x0040

/* 48-bit mask used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors in each RX and each TX ring. */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

/* Inner VLAN ID used when the tunnel filter matches on inner VLAN. */
#define INNER_VLAN_ID 100
76
77 /* VXLAN device */
78 struct vxlan_conf vxdev;
79
80 struct ipv4_hdr app_ip_hdr[VXLAN_N_PORTS];
81 struct ether_hdr app_l2_hdr[VXLAN_N_PORTS];
82
83 /* local VTEP IP address */
84 uint8_t vxlan_multicast_ips[2][4] = { {239, 1, 1, 1 }, {239, 1, 2, 1 } };
85
86 /* Remote VTEP IP address */
87 uint8_t vxlan_overlay_ips[2][4] = { {192, 168, 10, 1}, {192, 168, 30, 1} };
88
89 /* Remote VTEP MAC address */
90 uint8_t peer_mac[6] = {0x00, 0x11, 0x01, 0x00, 0x00, 0x01};
91
92 /* VXLAN RX filter type */
93 uint8_t tep_filter_type[] = {RTE_TUNNEL_FILTER_IMAC_TENID,
94                         RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID,
95                         RTE_TUNNEL_FILTER_OMAC_TENID_IMAC,};
96
97 /* Options for configuring ethernet port */
98 static const struct rte_eth_conf port_conf = {
99         .rxmode = {
100                 .split_hdr_size = 0,
101                 .header_split   = 0, /**< Header Split disabled */
102                 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
103                 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
104                 .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
105                 .hw_strip_crc   = 0, /**< CRC stripped by hardware */
106         },
107         .txmode = {
108                 .mq_mode = ETH_MQ_TX_NONE,
109         },
110 };
111
/**
 * Tenant ID table, indexed by RX queue number. Consecutive pairs of entries
 * share one tenant ID, so the one or two device(s) that belong to the same
 * tenant can be assigned to a VM.
 */
const uint16_t tenant_id_conf[] = {
        1000, 1000, 1001, 1001,
        1002, 1002, 1003, 1003,
        1004, 1004, 1005, 1005,
        1006, 1006, 1007, 1007,
        1008, 1008, 1009, 1009,
        1010, 1010, 1011, 1011,
        1012, 1012, 1013, 1013,
        1014, 1014, 1015, 1015,
        1016, 1016, 1017, 1017,
        1018, 1018, 1019, 1019,
        1020, 1020, 1021, 1021,
        1022, 1022, 1023, 1023,
        1024, 1024, 1025, 1025,
        1026, 1026, 1027, 1027,
        1028, 1028, 1029, 1029,
        1030, 1030, 1031, 1031,
};
126
127 /**
128  * Initialises a given port using global settings and with the rx buffers
129  * coming from the mbuf_pool passed as parameter
130  */
131 int
132 vxlan_port_init(uint8_t port, struct rte_mempool *mbuf_pool)
133 {
134         int retval;
135         uint16_t q;
136         struct rte_eth_dev_info dev_info;
137         uint16_t rx_rings, tx_rings = (uint16_t)rte_lcore_count();
138         const uint16_t rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
139         const uint16_t tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
140         struct rte_eth_udp_tunnel tunnel_udp;
141         struct rte_eth_rxconf *rxconf;
142         struct rte_eth_txconf *txconf;
143         struct vxlan_conf *pconf = &vxdev;
144
145         pconf->dst_port = udp_port;
146
147         rte_eth_dev_info_get(port, &dev_info);
148
149         if (dev_info.max_rx_queues > MAX_QUEUES) {
150                 rte_exit(EXIT_FAILURE,
151                         "please define MAX_QUEUES no less than %u in %s\n",
152                         dev_info.max_rx_queues, __FILE__);
153         }
154
155         rxconf = &dev_info.default_rxconf;
156         txconf = &dev_info.default_txconf;
157         txconf->txq_flags = 0;
158
159         if (port >= rte_eth_dev_count())
160                 return -1;
161
162         rx_rings = nb_devices;
163
164         /* Configure ethernet device. */
165         retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
166         if (retval != 0)
167                 return retval;
168
169         /* Setup the queues. */
170         for (q = 0; q < rx_rings; q++) {
171                 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
172                                                 rte_eth_dev_socket_id(port),
173                                                 rxconf,
174                                                 mbuf_pool);
175                 if (retval < 0)
176                         return retval;
177         }
178         for (q = 0; q < tx_rings; q++) {
179                 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
180                                                 rte_eth_dev_socket_id(port),
181                                                 txconf);
182                 if (retval < 0)
183                         return retval;
184         }
185
186         /* Start the device. */
187         retval  = rte_eth_dev_start(port);
188         if (retval < 0)
189                 return retval;
190
191         /* Configure UDP port for UDP tunneling */
192         tunnel_udp.udp_port = udp_port;
193         tunnel_udp.prot_type = RTE_TUNNEL_TYPE_VXLAN;
194         retval = rte_eth_dev_udp_tunnel_add(port, &tunnel_udp);
195         if (retval < 0)
196                 return retval;
197         rte_eth_macaddr_get(port, &ports_eth_addr[port]);
198         RTE_LOG(INFO, PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
199                         " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
200                         (unsigned)port,
201                         ports_eth_addr[port].addr_bytes[0],
202                         ports_eth_addr[port].addr_bytes[1],
203                         ports_eth_addr[port].addr_bytes[2],
204                         ports_eth_addr[port].addr_bytes[3],
205                         ports_eth_addr[port].addr_bytes[4],
206                         ports_eth_addr[port].addr_bytes[5]);
207
208         if (tso_segsz != 0) {
209                 struct rte_eth_dev_info dev_info;
210                 rte_eth_dev_info_get(port, &dev_info);
211                 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) == 0)
212                         RTE_LOG(WARNING, PORT,
213                                 "hardware TSO offload is not supported\n");
214         }
215         return 0;
216 }
217
218 static int
219 vxlan_rx_process(struct rte_mbuf *pkt)
220 {
221         int ret = 0;
222
223         if (rx_decap)
224                 ret = decapsulation(pkt);
225
226         return ret;
227 }
228
229 static void
230 vxlan_tx_process(uint8_t queue_id, struct rte_mbuf *pkt)
231 {
232         if (tx_encap)
233                 encapsulation(pkt, queue_id);
234
235         return;
236 }
237
238 /*
239  * This function learns the MAC address of the device and set init
240  * L2 header and L3 header info.
241  */
242 int
243 vxlan_link(struct vhost_dev *vdev, struct rte_mbuf *m)
244 {
245         int i, ret;
246         struct ether_hdr *pkt_hdr;
247         struct virtio_net *dev = vdev->dev;
248         uint64_t portid = dev->device_fh;
249         struct ipv4_hdr *ip;
250
251         struct rte_eth_tunnel_filter_conf tunnel_filter_conf;
252
253         if (unlikely(portid > VXLAN_N_PORTS)) {
254                 RTE_LOG(INFO, VHOST_DATA,
255                         "(%"PRIu64") WARNING: Not configuring device,"
256                         "as already have %d ports for VXLAN.",
257                         dev->device_fh, VXLAN_N_PORTS);
258                 return -1;
259         }
260
261         /* Learn MAC address of guest device from packet */
262         pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
263         if (is_same_ether_addr(&(pkt_hdr->s_addr), &vdev->mac_address)) {
264                 RTE_LOG(INFO, VHOST_DATA,
265                         "(%"PRIu64") WARNING: This device is using an existing"
266                         " MAC address and has not been registered.\n",
267                         dev->device_fh);
268                 return -1;
269         }
270
271         for (i = 0; i < ETHER_ADDR_LEN; i++) {
272                 vdev->mac_address.addr_bytes[i] =
273                         vxdev.port[portid].vport_mac.addr_bytes[i] =
274                         pkt_hdr->s_addr.addr_bytes[i];
275                 vxdev.port[portid].peer_mac.addr_bytes[i] = peer_mac[i];
276         }
277
278         memset(&tunnel_filter_conf, 0,
279                 sizeof(struct rte_eth_tunnel_filter_conf));
280
281         tunnel_filter_conf.outer_mac = &ports_eth_addr[0];
282         tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];
283
284         /* inner MAC */
285         tunnel_filter_conf.inner_mac = &vdev->mac_address;
286
287         tunnel_filter_conf.queue_id = vdev->rx_q;
288         tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];
289
290         if (tep_filter_type[filter_idx] == RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
291                 tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;
292
293         tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;
294
295         ret = rte_eth_dev_filter_ctrl(ports[0],
296                 RTE_ETH_FILTER_TUNNEL,
297                 RTE_ETH_FILTER_ADD,
298                 &tunnel_filter_conf);
299         if (ret) {
300                 RTE_LOG(ERR, VHOST_DATA,
301                         "%d Failed to add device MAC address to cloud filter\n",
302                 vdev->rx_q);
303                 return -1;
304         }
305
306         /* Print out inner MAC and VNI info. */
307         RTE_LOG(INFO, VHOST_DATA,
308                 "(%d) MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VNI %d registered\n",
309                 vdev->rx_q,
310                 vdev->mac_address.addr_bytes[0],
311                 vdev->mac_address.addr_bytes[1],
312                 vdev->mac_address.addr_bytes[2],
313                 vdev->mac_address.addr_bytes[3],
314                 vdev->mac_address.addr_bytes[4],
315                 vdev->mac_address.addr_bytes[5],
316                 tenant_id_conf[vdev->rx_q]);
317
318         vxdev.port[portid].vport_id = portid;
319
320         for (i = 0; i < 4; i++) {
321                 /* Local VTEP IP */
322                 vxdev.port_ip |= vxlan_multicast_ips[portid][i] << (8 * i);
323                 /* Remote VTEP IP */
324                 vxdev.port[portid].peer_ip |=
325                         vxlan_overlay_ips[portid][i] << (8 * i);
326         }
327
328         vxdev.out_key = tenant_id_conf[vdev->rx_q];
329         ether_addr_copy(&vxdev.port[portid].peer_mac,
330                         &app_l2_hdr[portid].d_addr);
331         ether_addr_copy(&ports_eth_addr[0],
332                         &app_l2_hdr[portid].s_addr);
333         app_l2_hdr[portid].ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
334
335         ip = &app_ip_hdr[portid];
336         ip->version_ihl = IP_VHL_DEF;
337         ip->type_of_service = 0;
338         ip->total_length = 0;
339         ip->packet_id = 0;
340         ip->fragment_offset = IP_DN_FRAGMENT_FLAG;
341         ip->time_to_live = IP_DEFTTL;
342         ip->next_proto_id = IPPROTO_UDP;
343         ip->hdr_checksum = 0;
344         ip->src_addr = vxdev.port_ip;
345         ip->dst_addr = vxdev.port[portid].peer_ip;
346
347         /* Set device as ready for RX. */
348         vdev->ready = DEVICE_RX;
349
350         return 0;
351 }
352
353 /**
354  * Removes cloud filter. Ensures that nothing is adding buffers to the RX
355  * queue before disabling RX on the device.
356  */
357 void
358 vxlan_unlink(struct vhost_dev *vdev)
359 {
360         unsigned i = 0, rx_count;
361         int ret;
362         struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
363         struct rte_eth_tunnel_filter_conf tunnel_filter_conf;
364
365         if (vdev->ready == DEVICE_RX) {
366                 memset(&tunnel_filter_conf, 0,
367                         sizeof(struct rte_eth_tunnel_filter_conf));
368
369                 tunnel_filter_conf.outer_mac = &ports_eth_addr[0];
370                 tunnel_filter_conf.inner_mac = &vdev->mac_address;
371                 tunnel_filter_conf.tenant_id = tenant_id_conf[vdev->rx_q];
372                 tunnel_filter_conf.filter_type = tep_filter_type[filter_idx];
373
374                 if (tep_filter_type[filter_idx] ==
375                         RTE_TUNNEL_FILTER_IMAC_IVLAN_TENID)
376                         tunnel_filter_conf.inner_vlan = INNER_VLAN_ID;
377
378                 tunnel_filter_conf.queue_id = vdev->rx_q;
379                 tunnel_filter_conf.tunnel_type = RTE_TUNNEL_TYPE_VXLAN;
380
381                 ret = rte_eth_dev_filter_ctrl(ports[0],
382                                 RTE_ETH_FILTER_TUNNEL,
383                                 RTE_ETH_FILTER_DELETE,
384                                 &tunnel_filter_conf);
385                 if (ret) {
386                         RTE_LOG(ERR, VHOST_DATA,
387                                 "%d Failed to add device MAC address to cloud filter\n",
388                                 vdev->rx_q);
389                         return;
390                 }
391                 for (i = 0; i < ETHER_ADDR_LEN; i++)
392                         vdev->mac_address.addr_bytes[i] = 0;
393
394                 /* Clear out the receive buffers */
395                 rx_count = rte_eth_rx_burst(ports[0],
396                                 (uint16_t)vdev->rx_q,
397                                 pkts_burst, MAX_PKT_BURST);
398
399                 while (rx_count) {
400                         for (i = 0; i < rx_count; i++)
401                                 rte_pktmbuf_free(pkts_burst[i]);
402
403                         rx_count = rte_eth_rx_burst(ports[0],
404                                         (uint16_t)vdev->rx_q,
405                                         pkts_burst, MAX_PKT_BURST);
406                 }
407                 vdev->ready = DEVICE_MAC_LEARNING;
408         }
409 }
410
/*
 * Transmit packets after encapsulating.
 *
 * Every packet in tx_pkts is first passed through vxlan_tx_process(), then
 * the whole array is handed to rte_eth_tx_burst() on the given port and
 * queue. Returns the value reported by rte_eth_tx_burst().
 */
int
vxlan_tx_pkts(uint8_t port_id, uint16_t queue_id,
                struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
        uint16_t idx = 0;

        while (idx < nb_pkts) {
                vxlan_tx_process(queue_id, tx_pkts[idx]);
                idx++;
        }

        return rte_eth_tx_burst(port_id, queue_id, tx_pkts, nb_pkts);
}
425
426 /* Check for decapsulation and pass packets directly to VIRTIO device */
427 int
428 vxlan_rx_pkts(struct virtio_net *dev, struct rte_mbuf **pkts_burst,
429                 uint32_t rx_count)
430 {
431         uint32_t i = 0;
432         uint32_t count = 0;
433         int ret;
434         struct rte_mbuf *pkts_valid[rx_count];
435
436         for (i = 0; i < rx_count; i++) {
437                 if (enable_stats) {
438                         rte_atomic64_add(
439                                 &dev_statistics[dev->device_fh].rx_bad_ip_csum,
440                                 (pkts_burst[i]->ol_flags & PKT_RX_IP_CKSUM_BAD)
441                                 != 0);
442                         rte_atomic64_add(
443                                 &dev_statistics[dev->device_fh].rx_bad_ip_csum,
444                                 (pkts_burst[i]->ol_flags & PKT_RX_L4_CKSUM_BAD)
445                                 != 0);
446                 }
447                 ret = vxlan_rx_process(pkts_burst[i]);
448                 if (unlikely(ret < 0))
449                         continue;
450
451                 pkts_valid[count] = pkts_burst[i];
452                         count++;
453         }
454
455         ret = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_valid, count);
456         return ret;
457 }