net/tap: add eBPF program file
authorOphir Munk <ophirmu@mellanox.com>
Sat, 20 Jan 2018 21:11:33 +0000 (21:11 +0000)
committerFerruh Yigit <ferruh.yigit@intel.com>
Sun, 21 Jan 2018 14:51:52 +0000 (15:51 +0100)
File tap_bpf_program.c was added with two ELF sections
corresponding to two BPF programs and one BPF map.

Section cls_q - BPF classifier to classify packets to their
corresponding queue after an RSS hash was calculated on the packet
and saved in skb->cb[1]
Section l3_l4 - BPF action to calculate RSS hash on packet
layers 3 and 4
This file is not part of DPDK tree compilation.

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
Acked-by: Pascal Mazon <pascal.mazon@6wind.com>
drivers/net/tap/tap_bpf_program.c [new file with mode: 0644]

diff --git a/drivers/net/tap/tap_bpf_program.c b/drivers/net/tap/tap_bpf_program.c
new file mode 100644 (file)
index 0000000..848c50b
--- /dev/null
@@ -0,0 +1,221 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ * Copyright 2017 Mellanox Technologies, Ltd.
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <asm/types.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_tunnel.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+#include "tap_rss.h"
+
+/** Create IPv4 address */
+#define IPv4(a, b, c, d) ((__u32)(((a) & 0xff) << 24) | \
+               (((b) & 0xff) << 16) | \
+               (((c) & 0xff) << 8)  | \
+               ((d) & 0xff))
+
+#define PORT(a, b) ((__u16)(((a) & 0xff) << 8) | \
+               ((b) & 0xff))
+
+/*
+ * The queue number is offset by 1, to distinguish packets that have
+ * gone through this rule (skb->cb[1] != 0) from others.
+ */
+#define QUEUE_OFFSET           1
+#define PIN_GLOBAL_NS          2
+
+#define KEY_IDX                        0
+#define BPF_MAP_ID_KEY 1
+
+struct vlan_hdr {
+       __be16 proto;
+       __be16 tci;
+};
+
+struct bpf_elf_map __attribute__((section("maps"), used))
+map_keys = {
+       .type           =       BPF_MAP_TYPE_HASH,
+       .id             =       BPF_MAP_ID_KEY,
+       .size_key       =       sizeof(__u32),
+       .size_value     =       sizeof(struct rss_key),
+       .max_elem       =       256,
+       .pinning        =       PIN_GLOBAL_NS,
+};
+
+__section("cls_q") int
+match_q(struct __sk_buff *skb)
+{
+       __u32 queue = skb->cb[1];
+       volatile __u32 q = 0xdeadbeef;
+       __u32 match_queue = QUEUE_OFFSET + q;
+
+       /* printt("match_q$i() queue = %d\n", queue); */
+
+       if (queue != match_queue)
+               return TC_ACT_OK;
+       return TC_ACT_UNSPEC;
+}
+
+
+struct ipv4_l3_l4_tuple {
+       __u32    src_addr;
+       __u32    dst_addr;
+       __u16    dport;
+       __u16    sport;
+} __attribute__((packed));
+
+struct ipv6_l3_l4_tuple {
+       __u8        src_addr[16];
+       __u8        dst_addr[16];
+       __u16       dport;
+       __u16       sport;
+} __attribute__((packed));
+
+static const __u8 def_rss_key[] = {
+       0xd1, 0x81, 0xc6, 0x2c,
+       0xf7, 0xf4, 0xdb, 0x5b,
+       0x19, 0x83, 0xa2, 0xfc,
+       0x94, 0x3e, 0x1a, 0xdb,
+       0xd9, 0x38, 0x9e, 0x6b,
+       0xd1, 0x03, 0x9c, 0x2c,
+       0xa7, 0x44, 0x99, 0xad,
+       0x59, 0x3d, 0x56, 0xd9,
+       0xf3, 0x25, 0x3c, 0x06,
+       0x2a, 0xdc, 0x1f, 0xfc,
+};
+
+static __u32  __attribute__((always_inline))
+rte_softrss_be(const __u32 *input_tuple, const uint8_t *rss_key,
+               __u8 input_len)
+{
+       __u32 i, j, hash = 0;
+#pragma unroll
+       for (j = 0; j < input_len; j++) {
+#pragma unroll
+               for (i = 0; i < 32; i++) {
+                       if (input_tuple[j] & (1 << (31 - i))) {
+                               hash ^= ((const __u32 *)def_rss_key)[j] << i |
+                               (__u32)((uint64_t)
+                               (((const __u32 *)def_rss_key)[j + 1])
+                                       >> (32 - i));
+                       }
+               }
+       }
+       return hash;
+}
+
+static int __attribute__((always_inline))
+rss_l3_l4(struct __sk_buff *skb)
+{
+       void *data_end = (void *)(long)skb->data_end;
+       void *data = (void *)(long)skb->data;
+       __u16 proto = (__u16)skb->protocol;
+       __u32 key_idx = 0xdeadbeef;
+       __u32 hash;
+       struct rss_key *rsskey;
+       __u64 off = ETH_HLEN;
+       int j;
+       __u8 *key = 0;
+       __u32 len;
+       __u32 queue = 0;
+
+       rsskey = map_lookup_elem(&map_keys, &key_idx);
+       if (!rsskey) {
+               printt("hash(): rss key is not configured\n");
+               return TC_ACT_OK;
+       }
+       key = (__u8 *)rsskey->key;
+
+       /* Get correct proto for 802.1ad */
+       if (skb->vlan_present && skb->vlan_proto == htons(ETH_P_8021AD)) {
+               if (data + ETH_ALEN * 2 + sizeof(struct vlan_hdr) +
+                   sizeof(proto) > data_end)
+                       return TC_ACT_OK;
+               proto = *(__u16 *)(data + ETH_ALEN * 2 +
+                                  sizeof(struct vlan_hdr));
+               off += sizeof(struct vlan_hdr);
+       }
+
+       if (proto == htons(ETH_P_IP)) {
+               if (data + off + sizeof(struct iphdr) + sizeof(__u32)
+                       > data_end)
+                       return TC_ACT_OK;
+
+               __u8 *src_dst_addr = data + off + offsetof(struct iphdr, saddr);
+               __u8 *src_dst_port = data + off + sizeof(struct iphdr);
+               struct ipv4_l3_l4_tuple v4_tuple = {
+                       .src_addr = IPv4(*(src_dst_addr + 0),
+                                       *(src_dst_addr + 1),
+                                       *(src_dst_addr + 2),
+                                       *(src_dst_addr + 3)),
+                       .dst_addr = IPv4(*(src_dst_addr + 4),
+                                       *(src_dst_addr + 5),
+                                       *(src_dst_addr + 6),
+                                       *(src_dst_addr + 7)),
+                       .sport = PORT(*(src_dst_port + 0),
+                                       *(src_dst_port + 1)),
+                       .dport = PORT(*(src_dst_port + 2),
+                                       *(src_dst_port + 3)),
+               };
+               __u8 input_len = sizeof(v4_tuple) / sizeof(__u32);
+               if (rsskey->hash_fields & (1 << HASH_FIELD_IPV4_L3))
+                       input_len--;
+               hash = rte_softrss_be((__u32 *)&v4_tuple, key, 3);
+       } else if (proto == htons(ETH_P_IPV6)) {
+               if (data + off + sizeof(struct ipv6hdr) +
+                                       sizeof(__u32) > data_end)
+                       return TC_ACT_OK;
+               __u8 *src_dst_addr = data + off +
+                                       offsetof(struct ipv6hdr, saddr);
+               __u8 *src_dst_port = data + off +
+                                       sizeof(struct ipv6hdr);
+               struct ipv6_l3_l4_tuple v6_tuple;
+               for (j = 0; j < 4; j++)
+                       *((uint32_t *)&v6_tuple.src_addr + j) =
+                               __builtin_bswap32(*((uint32_t *)
+                                               src_dst_addr + j));
+               for (j = 0; j < 4; j++)
+                       *((uint32_t *)&v6_tuple.dst_addr + j) =
+                               __builtin_bswap32(*((uint32_t *)
+                                               src_dst_addr + 4 + j));
+               v6_tuple.sport = PORT(*(src_dst_port + 0),
+                             *(src_dst_port + 1));
+               v6_tuple.dport = PORT(*(src_dst_port + 2),
+                             *(src_dst_port + 3));
+
+               __u8 input_len = sizeof(v6_tuple) / sizeof(__u32);
+               if (rsskey->hash_fields & (1 << HASH_FIELD_IPV6_L3))
+                       input_len--;
+               hash = rte_softrss_be((__u32 *)&v6_tuple, key, 9);
+       } else {
+               return TC_ACT_PIPE;
+       }
+
+       queue = rsskey->queues[(hash % rsskey->nb_queues) &
+                                      (TAP_MAX_QUEUES - 1)];
+       skb->cb[1] = QUEUE_OFFSET + queue;
+       /* printt(">>>>> rss_l3_l4 hash=0x%x queue=%u\n", hash, queue); */
+
+       return TC_ACT_RECLASSIFY;
+}
+
+#define RSS(L)                                         \
+       __section(#L) int                               \
+               L ## _hash(struct __sk_buff *skb)       \
+       {                                               \
+               return rss_ ## L (skb);                 \
+       }
+
+RSS(l3_l4)
+
+BPF_LICENSE("Dual BSD/GPL");