+/*
+ * Data structures used by flow_tcf_xxx_cb() routines.
+ *
+ * struct tcf_nlcb_buf is one node of the command buffer list: a short
+ * header followed by a flexible array in which Netlink messages are
+ * packed one after another.
+ */
+struct tcf_nlcb_buf {
+ LIST_ENTRY(tcf_nlcb_buf) next; /**< Linkage in the buffer list. */
+ uint32_t size; /**< Number of bytes of msg[] already in use. */
+ alignas(struct nlmsghdr)
+ uint8_t msg[]; /**< Netlink message data. */
+};
+
+struct tcf_nlcb_context { /* State handed to the collect callbacks. */
+ unsigned int ifindex; /**< Base interface index. */
+ uint32_t bufsize; /**< Capacity in bytes of msg[] in every list buffer. */
+ LIST_HEAD(, tcf_nlcb_buf) nlbuf; /**< Command buffers, newest at head. */
+};
+
+/**
+ * Reserve room for one Netlink command in the context buffer list.
+ *
+ * The requested size is rounded up with NLMSG_ALIGN(). The room is taken
+ * from the buffer at the head of the list when it still fits there,
+ * otherwise a new buffer able to hold ctx->bufsize bytes of messages is
+ * allocated and inserted at the head of the list.
+ *
+ * @param[in, out] ctx
+ *   Pointer to callback context with command buffers list.
+ * @param[in] size
+ *   Required size of data buffer to be allocated.
+ *
+ * @return
+ *   Pointer to allocated memory, aligned as message header.
+ *   NULL if the aligned size exceeds ctx->bufsize or if the buffer
+ *   allocation failed.
+ */
+static struct nlmsghdr *
+flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
+{
+	struct tcf_nlcb_buf *head = LIST_FIRST(&ctx->nlbuf);
+	uint32_t offset;
+
+	size = NLMSG_ALIGN(size);
+	if (head != NULL && head->size + size <= ctx->bufsize) {
+		/* The head buffer still has enough room, append to it. */
+		offset = head->size;
+		head->size = offset + size;
+		return (struct nlmsghdr *)&head->msg[offset];
+	}
+	if (size > ctx->bufsize) {
+		DRV_LOG(WARNING, "netlink: too long command buffer requested");
+		return NULL;
+	}
+	/* Start a new buffer, the command goes to its very beginning. */
+	head = rte_malloc(__func__, ctx->bufsize + sizeof(*head),
+			  alignof(struct tcf_nlcb_buf));
+	if (head == NULL) {
+		DRV_LOG(WARNING, "netlink: no memory for command buffer");
+		return NULL;
+	}
+	LIST_INSERT_HEAD(&ctx->nlbuf, head, next);
+	head->size = size;
+	return (struct nlmsghdr *)&head->msg[0];
+}
+
+/**
+ * Request an acknowledgment for the last Netlink command of a buffer.
+ *
+ * Walks the packed messages using their nlmsg_len fields until the used
+ * size of the buffer is reached and sets NLM_F_ACK in the flags of the
+ * message the walk ended on; the other messages are left untouched.
+ *
+ * @param[in, out] buf
+ *   Pointer to buffer with netlink commands, must not be empty.
+ */
+static void
+flow_tcf_setack_nlcmd(struct tcf_nlcb_buf *buf)
+{
+	struct nlmsghdr *last;
+	uint32_t offset = 0;
+
+	assert(buf->size);
+	do {
+		last = (struct nlmsghdr *)&buf->msg[offset];
+		offset += NLMSG_ALIGN(last->nlmsg_len);
+	} while (offset < buf->size);
+	last->nlmsg_flags |= NLM_F_ACK;
+}
+
+/**
+ * Send the buffers with prepared netlink commands. Scans the list and
+ * sends every non-empty buffer found. Each buffer is freed after the send
+ * attempt regardless of its result, so no memory is leaked when some
+ * error occurs. The list is left empty on return.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_context_create().
+ * @param[in, out] ctx
+ *   Pointer to callback context with command buffers list.
+ *
+ * @return
+ *   Zero if every send returned zero, otherwise the first nonzero value
+ *   returned by flow_tcf_nl_ack().
+ */
+static int
+flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
+		    struct tcf_nlcb_context *ctx)
+{
+	struct tcf_nlcb_buf *cur;
+	struct tcf_nlcb_buf *following;
+	int first_err = 0;
+
+	for (cur = LIST_FIRST(&ctx->nlbuf); cur != NULL; cur = following) {
+		/* Fetch the successor first, the node is freed below. */
+		following = LIST_NEXT(cur, next);
+		if (cur->size != 0) {
+			int rc;
+
+			flow_tcf_setack_nlcmd(cur);
+			rc = flow_tcf_nl_ack(tcf,
+					     (struct nlmsghdr *)&cur->msg,
+					     cur->size, NULL, NULL);
+			/* Only the first failure is reported to the caller. */
+			if (first_err == 0 && rc != 0)
+				first_err = rc;
+		}
+		rte_free(cur);
+	}
+	LIST_INIT(&ctx->nlbuf);
+	return first_err;
+}
+
+/**
+ * Collect local IP address rules with scope link attribute on specified
+ * network device. This is callback routine called by libmnl mnl_cb_run()
+ * in loop for every message in received packet.
+ *
+ * For every reply describing a permanent, link scoped IPv4/IPv6 address
+ * on ctx->ifindex which carries both IFA_LOCAL and IFA_ADDRESS attributes
+ * an RTM_DELADDR command is stored into the context buffer list.
+ *
+ * @param[in] nlh
+ *   Pointer to reply header.
+ * @param[in, out] arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   A positive, nonzero value on success, negative errno value otherwise
+ *   and rte_errno is set.
+ */
+static int
+flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
+{
+	struct tcf_nlcb_context *ctx = arg;
+	struct ifaddrmsg *ifa;
+	struct nlattr *attr;
+	struct nlattr *local = NULL;
+	struct nlattr *peer = NULL;
+	struct nlmsghdr *cmd;
+	unsigned char family;
+	uint32_t cmd_size;
+
+	if (nlh->nlmsg_type != RTM_NEWADDR) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	ifa = mnl_nlmsg_get_payload(nlh);
+	family = ifa->ifa_family;
+	/* Skip replies not matching family, interface, scope or flags. */
+	if (family != AF_INET && family != AF_INET6)
+		return 1;
+	if (ifa->ifa_index != ctx->ifindex)
+		return 1;
+	if (ifa->ifa_scope != RT_SCOPE_LINK)
+		return 1;
+	if (!(ifa->ifa_flags & IFA_F_PERMANENT))
+		return 1;
+	mnl_attr_for_each(attr, nlh, sizeof(*ifa)) {
+		const uint16_t type = mnl_attr_get_type(attr);
+
+		if (type == IFA_LOCAL)
+			local = attr;
+		else if (type == IFA_ADDRESS)
+			peer = attr;
+		if (local && peer)
+			break;
+	}
+	if (!local || !peer)
+		return 1;
+	/* Local rule found with scope link, permanent and assigned peer. */
+	cmd_size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
+		   MNL_ALIGN(sizeof(struct ifaddrmsg));
+	if (family == AF_INET6)
+		cmd_size += 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN);
+	else
+		cmd_size += 2 * SZ_NLATTR_TYPE_OF(uint32_t);
+	cmd = flow_tcf_alloc_nlcmd(ctx, cmd_size);
+	if (!cmd) {
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	cmd = mnl_nlmsg_put_header(cmd);
+	cmd->nlmsg_type = RTM_DELADDR;
+	cmd->nlmsg_flags = NLM_F_REQUEST;
+	ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
+	ifa->ifa_index = ctx->ifindex;
+	ifa->ifa_scope = RT_SCOPE_LINK;
+	ifa->ifa_flags = IFA_F_PERMANENT;
+	if (family == AF_INET6) {
+		ifa->ifa_family = AF_INET6;
+		ifa->ifa_prefixlen = 128;
+		mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
+			     mnl_attr_get_payload(local));
+		mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
+			     mnl_attr_get_payload(peer));
+	} else {
+		ifa->ifa_family = AF_INET;
+		ifa->ifa_prefixlen = 32;
+		mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(local));
+		mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(peer));
+	}
+	return 1;
+}
+
+/**
+ * Cleanup the local IP addresses on outer interface.
+ *
+ * Dumps the addresses with an RTM_GETADDR request, lets
+ * flow_tcf_collect_local_cb() build the delete commands for the matching
+ * ones and then sends the collected commands. Failures are only logged.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_context_create().
+ * @param[in] ifindex
+ *   Network interface index to perform cleanup.
+ */
+static void
+flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
+			     unsigned int ifindex)
+{
+	struct nlmsghdr *nlh;
+	struct ifaddrmsg *ifa;
+	struct tcf_nlcb_context ctx = {
+		.ifindex = ifindex,
+		.bufsize = MNL_REQUEST_SIZE,
+		.nlbuf = LIST_HEAD_INITIALIZER(),
+	};
+	int ret;
+
+	assert(ifindex);
+	/*
+	 * Seek and destroy leftovers of local IP addresses with
+	 * matching properties "scope link".
+	 */
+	nlh = mnl_nlmsg_put_header(tcf->buf);
+	nlh->nlmsg_type = RTM_GETADDR;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
+	ifa->ifa_family = AF_UNSPEC;
+	ifa->ifa_index = ifindex;
+	ifa->ifa_scope = RT_SCOPE_LINK;
+	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_local_cb, &ctx);
+	if (ret)
+		DRV_LOG(WARNING,
+			"netlink: query local address list error %d", ret);
+	/* Commands collected before a failure are still sent and freed. */
+	ret = flow_tcf_send_nlcmd(tcf, &ctx);
+	if (ret)
+		DRV_LOG(WARNING,
+			"netlink: local address delete error %d", ret);
+}
+
+/**
+ * Collect permanent neigh rules on specified network device.
+ * This is callback routine called by libmnl mnl_cb_run() in loop for
+ * every message in received packet.
+ *
+ * For every reply describing a permanent IPv4/IPv6 neigh entry on
+ * ctx->ifindex which carries both NDA_DST and NDA_LLADDR attributes
+ * an RTM_DELNEIGH command is stored into the context buffer list.
+ *
+ * @param[in] nlh
+ *   Pointer to reply header.
+ * @param[in, out] arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   A positive, nonzero value on success, negative errno value otherwise
+ *   and rte_errno is set.
+ */
+static int
+flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
+{
+	struct tcf_nlcb_context *ctx = arg;
+	struct ndmsg *ndm;
+	struct nlattr *attr;
+	struct nlattr *dst_ip = NULL;
+	struct nlattr *dst_mac = NULL;
+	struct nlmsghdr *cmd;
+	unsigned char family;
+	uint32_t cmd_size;
+
+	if (nlh->nlmsg_type != RTM_NEWNEIGH) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	ndm = mnl_nlmsg_get_payload(nlh);
+	family = ndm->ndm_family;
+	/* Skip replies not matching family, interface or state. */
+	if (family != AF_INET && family != AF_INET6)
+		return 1;
+	if (ndm->ndm_ifindex != (int)ctx->ifindex)
+		return 1;
+	if (!(ndm->ndm_state & NUD_PERMANENT))
+		return 1;
+	mnl_attr_for_each(attr, nlh, sizeof(*ndm)) {
+		const uint16_t type = mnl_attr_get_type(attr);
+
+		if (type == NDA_DST)
+			dst_ip = attr;
+		else if (type == NDA_LLADDR)
+			dst_mac = attr;
+		if (dst_mac && dst_ip)
+			break;
+	}
+	if (!dst_mac || !dst_ip)
+		return 1;
+	/* Neigh rule with permanent attribute found. */
+	cmd_size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
+		   MNL_ALIGN(sizeof(struct ndmsg)) +
+		   SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN);
+	if (family == AF_INET6)
+		cmd_size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN);
+	else
+		cmd_size += SZ_NLATTR_TYPE_OF(uint32_t);
+	cmd = flow_tcf_alloc_nlcmd(ctx, cmd_size);
+	if (!cmd) {
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	cmd = mnl_nlmsg_put_header(cmd);
+	cmd->nlmsg_type = RTM_DELNEIGH;
+	cmd->nlmsg_flags = NLM_F_REQUEST;
+	ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
+	ndm->ndm_type = 0;
+	ndm->ndm_flags = 0;
+	ndm->ndm_state = NUD_PERMANENT;
+	ndm->ndm_ifindex = ctx->ifindex;
+	if (family == AF_INET6) {
+		ndm->ndm_family = AF_INET6;
+		mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
+			     mnl_attr_get_payload(dst_ip));
+	} else {
+		ndm->ndm_family = AF_INET;
+		mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(dst_ip));
+	}
+	mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
+		     mnl_attr_get_payload(dst_mac));
+	return 1;
+}
+
+/**
+ * Cleanup the neigh rules on outer interface.
+ *
+ * Dumps the neigh entries with an RTM_GETNEIGH request, lets
+ * flow_tcf_collect_neigh_cb() build the delete commands for the matching
+ * ones and then sends the collected commands. Failures are only logged.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_context_create().
+ * @param[in] ifindex
+ *   Network interface index to perform cleanup.
+ */
+static void
+flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
+			     unsigned int ifindex)
+{
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+	struct tcf_nlcb_context ctx = {
+		.ifindex = ifindex,
+		.bufsize = MNL_REQUEST_SIZE,
+		.nlbuf = LIST_HEAD_INITIALIZER(),
+	};
+	int ret;
+
+	assert(ifindex);
+	/* Seek and destroy leftovers of neigh rules. */
+	nlh = mnl_nlmsg_put_header(tcf->buf);
+	nlh->nlmsg_type = RTM_GETNEIGH;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
+	ndm->ndm_family = AF_UNSPEC;
+	ndm->ndm_ifindex = ifindex;
+	ndm->ndm_state = NUD_PERMANENT;
+	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_neigh_cb, &ctx);
+	if (ret)
+		DRV_LOG(WARNING, "netlink: query neigh list error %d", ret);
+	/* Commands collected before a failure are still sent and freed. */
+	ret = flow_tcf_send_nlcmd(tcf, &ctx);
+	if (ret)
+		DRV_LOG(WARNING, "netlink: neigh rule delete error %d", ret);
+}
+
+/**
+ * Collect indices of VXLAN encap/decap interfaces associated with device.
+ * This is callback routine called by libmnl mnl_cb_run() in loop for
+ * every message in received packet.
+ *
+ * A link is selected when its IFLA_LINKINFO attribute has the kind
+ * "vxlan" and its IFLA_INFO_DATA contains an IFLA_VXLAN_LINK equal to
+ * ctx->ifindex. For every selected link an RTM_DELLINK command is stored
+ * into the context buffer list.
+ *
+ * @param[in] nlh
+ *   Pointer to reply header.
+ * @param[in, out] arg
+ *   Opaque data pointer for this callback.
+ *
+ * @return
+ *   A positive, nonzero value on success, negative errno value otherwise
+ *   and rte_errno is set.
+ */
+static int
+flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
+{
+	struct tcf_nlcb_context *ctx = arg;
+	struct nlmsghdr *cmd;
+	struct ifinfomsg *ifm;
+	struct nlattr *na;
+	struct nlattr *na_info = NULL;
+	struct nlattr *na_vxlan = NULL;
+	bool found = false;
+	unsigned int vxindex;
+
+	if (nlh->nlmsg_type != RTM_NEWLINK) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	ifm = mnl_nlmsg_get_payload(nlh);
+	if (!ifm->ifi_index) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	mnl_attr_for_each(na, nlh, sizeof(*ifm)) {
+		if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
+			na_info = na;
+			break;
+		}
+	}
+	if (!na_info)
+		return 1;
+	/* Check the link kind and locate the kind specific data. */
+	mnl_attr_for_each_nested(na, na_info) {
+		switch (mnl_attr_get_type(na)) {
+		case IFLA_INFO_KIND:
+			if (!strncmp("vxlan", mnl_attr_get_str(na),
+				     mnl_attr_get_len(na)))
+				found = true;
+			break;
+		case IFLA_INFO_DATA:
+			na_vxlan = na;
+			break;
+		}
+		if (found && na_vxlan)
+			break;
+	}
+	if (!found || !na_vxlan)
+		return 1;
+	/* Check whether the VXLAN device is attached to our interface. */
+	found = false;
+	mnl_attr_for_each_nested(na, na_vxlan) {
+		if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
+		    mnl_attr_get_u32(na) == ctx->ifindex) {
+			found = true;
+			break;
+		}
+	}
+	if (!found)
+		return 1;
+	/* Attached VXLAN device found, store the command to delete. */
+	vxindex = ifm->ifi_index;
+	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
+					MNL_ALIGN(sizeof(struct ifinfomsg)));
+	/* The allocation result must be checked, not the input message. */
+	if (!cmd) {
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	cmd = mnl_nlmsg_put_header(cmd);
+	cmd->nlmsg_type = RTM_DELLINK;
+	cmd->nlmsg_flags = NLM_F_REQUEST;
+	ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_index = vxindex;
+	return 1;
+}
+
+/**
+ * Cleanup the outer interface.
+ *
+ * Dumps the links with an RTM_GETLINK request, lets
+ * flow_tcf_collect_vxlan_cb() build delete commands for the VXLAN devices
+ * attached to the specified index and then sends the collected commands.
+ * Failures are only logged.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_context_create().
+ * @param[in] ifindex
+ *   Network interface index to perform cleanup.
+ */
+static void
+flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
+			     unsigned int ifindex)
+{
+	struct tcf_nlcb_context cb_ctx = {
+		.ifindex = ifindex,
+		.bufsize = MNL_REQUEST_SIZE,
+		.nlbuf = LIST_HEAD_INITIALIZER(),
+	};
+	struct nlmsghdr *req;
+	struct ifinfomsg *link;
+	int rc;
+
+	assert(ifindex);
+	/*
+	 * Look for leftover VXLAN encap/decap interfaces with matching
+	 * properties and delete them.
+	 */
+	req = mnl_nlmsg_put_header(tcf->buf);
+	req->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	req->nlmsg_type = RTM_GETLINK;
+	link = mnl_nlmsg_put_extra_header(req, sizeof(*link));
+	link->ifi_family = AF_UNSPEC;
+	rc = flow_tcf_nl_ack(tcf, req, 0, flow_tcf_collect_vxlan_cb, &cb_ctx);
+	if (rc != 0)
+		DRV_LOG(WARNING, "netlink: query device list error %d", rc);
+	rc = flow_tcf_send_nlcmd(tcf, &cb_ctx);
+	if (rc != 0)
+		DRV_LOG(WARNING, "netlink: device delete error %d", rc);
+}
+
+/**
+ * Emit Netlink message to add/remove local address to the outer device.
+ * The address being added is visible within the link only (scope link).
+ *
+ * Note that an implicit route is maintained by the kernel due to the
+ * presence of a peer address (IFA_ADDRESS).
+ *
+ * These rules are used for encapsulation only and allow to assign
+ * the outer tunnel source IP address.
+ *
+ * @param[in] tcf
+ *   Libmnl socket context object.
+ * @param[in] encap
+ *   Encapsulation properties (source address and its peer).
+ * @param[in] ifindex
+ *   Network interface to apply rule.
+ * @param[in] enable
+ *   Toggle between add and remove.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
+		    const struct flow_tcf_vxlan_encap *encap,
+		    unsigned int ifindex,
+		    bool enable,
+		    struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh;
+	struct ifaddrmsg *ifa;
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
+	uint16_t msg_type = RTM_DELADDR;
+	uint16_t msg_flags = NLM_F_REQUEST;
+
+	if (enable) {
+		msg_type = RTM_NEWADDR;
+		msg_flags |= NLM_F_CREATE | NLM_F_REPLACE;
+	}
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = msg_type;
+	nlh->nlmsg_flags = msg_flags;
+	nlh->nlmsg_seq = 0;
+	ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
+	ifa->ifa_index = ifindex;
+	ifa->ifa_scope = RT_SCOPE_LINK;
+	ifa->ifa_flags = IFA_F_PERMANENT;
+	if (!(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC)) {
+		/* No IPv4 source given, the IPv6 one is expected then. */
+		assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
+		ifa->ifa_family = AF_INET6;
+		ifa->ifa_prefixlen = 128;
+		mnl_attr_put(nlh, IFA_LOCAL,
+			     sizeof(encap->ipv6.src),
+			     &encap->ipv6.src);
+		if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
+			mnl_attr_put(nlh, IFA_ADDRESS,
+				     sizeof(encap->ipv6.dst),
+				     &encap->ipv6.dst);
+	} else {
+		ifa->ifa_family = AF_INET;
+		ifa->ifa_prefixlen = 32;
+		mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
+		if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
+			mnl_attr_put_u32(nlh, IFA_ADDRESS,
+					 encap->ipv4.dst);
+	}
+	if (flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
+		return rte_flow_error_set(error, rte_errno,
+					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+					  NULL,
+					  "netlink: cannot complete IFA request"
+					  " (ip addr add)");
+	return 0;
+}
+
+/**
+ * Emit Netlink message to add/remove neighbor.
+ *
+ * The entry is permanent (NUD_PERMANENT) and binds the encapsulation
+ * destination IP address to the destination MAC address when the latter
+ * is present in the encapsulation properties.
+ *
+ * @param[in] tcf
+ *   Libmnl socket context object.
+ * @param[in] encap
+ *   Encapsulation properties (destination address).
+ * @param[in] ifindex
+ *   Network interface.
+ * @param[in] enable
+ *   Toggle between add and remove.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
+		    const struct flow_tcf_vxlan_encap *encap,
+		    unsigned int ifindex,
+		    bool enable,
+		    struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
+	uint16_t msg_type = RTM_DELNEIGH;
+	uint16_t msg_flags = NLM_F_REQUEST;
+
+	if (enable) {
+		msg_type = RTM_NEWNEIGH;
+		msg_flags |= NLM_F_CREATE | NLM_F_REPLACE;
+	}
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = msg_type;
+	nlh->nlmsg_flags = msg_flags;
+	nlh->nlmsg_seq = 0;
+	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
+	ndm->ndm_type = 0;
+	ndm->ndm_flags = 0;
+	ndm->ndm_state = NUD_PERMANENT;
+	ndm->ndm_ifindex = ifindex;
+	if (!(encap->mask & FLOW_TCF_ENCAP_IPV4_DST)) {
+		/* No IPv4 destination given, the IPv6 one is expected. */
+		assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
+		ndm->ndm_family = AF_INET6;
+		mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
+			     &encap->ipv6.dst);
+	} else {
+		ndm->ndm_family = AF_INET;
+		mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
+	}
+	if (enable && (encap->mask & FLOW_TCF_ENCAP_ETH_SRC))
+		DRV_LOG(WARNING,
+			"outer ethernet source address cannot be "
+			"forced for VXLAN encapsulation");
+	if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
+		mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
+			     &encap->eth.dst);
+	if (flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
+		return rte_flow_error_set(error, rte_errno,
+					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+					  NULL,
+					  "netlink: cannot complete ND request"
+					  " (ip neigh)");
+	return 0;
+}
+
+/**
+ * Manage the local IP addresses and their peers IP addresses on the
+ * outer interface for encapsulation purposes. The kernel searches the
+ * appropriate device for tunnel egress traffic using the outer source
+ * IP, this IP should be assigned to the outer network device, otherwise
+ * kernel rejects the rule.
+ *
+ * Adds or removes the addresses using the Netlink command like this:
+ *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
+ *
+ * The addresses are local to the netdev ("scope link"), this reduces
+ * the risk of conflicts. Note that an implicit route is maintained by
+ * the kernel due to the presence of a peer address (IFA_ADDRESS).
+ *
+ * The rules are reference counted in the vtep->local list: the Netlink
+ * request is emitted only when a rule is created and when its last
+ * reference is dropped; in the latter case the rule object is released.
+ *
+ * @param[in] tcf
+ *   Libmnl socket context object.
+ * @param[in] vtep
+ *   VTEP object, contains rule database and ifouter index.
+ * @param[in] dev_flow
+ *   Flow object, contains the tunnel parameters (for encap only).
+ * @param[in] enable
+ *   Toggle between add and remove.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
+		     struct tcf_vtep *vtep,
+		     struct mlx5_flow *dev_flow,
+		     bool enable,
+		     struct rte_flow_error *error)
+{
+	const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
+	struct tcf_local_rule *rule;
+	bool found = false;
+	int ret;
+
+	assert(encap);
+	assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
+	/* Look for an existing rule with the same source/peer pair. */
+	if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
+		assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
+		LIST_FOREACH(rule, &vtep->local, next) {
+			if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
+			    encap->ipv4.src == rule->ipv4.src &&
+			    encap->ipv4.dst == rule->ipv4.dst) {
+				found = true;
+				break;
+			}
+		}
+	} else {
+		assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
+		assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
+		LIST_FOREACH(rule, &vtep->local, next) {
+			if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
+			    !memcmp(&encap->ipv6.src, &rule->ipv6.src,
+				    sizeof(encap->ipv6.src)) &&
+			    !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
+				    sizeof(encap->ipv6.dst))) {
+				found = true;
+				break;
+			}
+		}
+	}
+	if (found) {
+		if (enable) {
+			rule->refcnt++;
+			return 0;
+		}
+		if (!rule->refcnt || !--rule->refcnt) {
+			/*
+			 * Last reference is gone: unlink and release the
+			 * rule object, the Netlink request below only
+			 * needs the encap data.
+			 */
+			LIST_REMOVE(rule, next);
+			rte_free(rule);
+			return flow_tcf_rule_local(tcf, encap,
+					vtep->ifouter, false, error);
+		}
+		return 0;
+	}
+	if (!enable) {
+		DRV_LOG(WARNING, "disabling not existing local rule");
+		rte_flow_error_set(error, ENOENT,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				   "disabling not existing local rule");
+		return -ENOENT;
+	}
+	rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
+			   alignof(struct tcf_local_rule));
+	if (!rule) {
+		rte_flow_error_set(error, ENOMEM,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				   "unable to allocate memory for local rule");
+		return -rte_errno;
+	}
+	*rule = (struct tcf_local_rule){.refcnt = 0,
+					.mask = 0,
+					};
+	if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
+		rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
+			   | FLOW_TCF_ENCAP_IPV4_DST;
+		rule->ipv4.src = encap->ipv4.src;
+		rule->ipv4.dst = encap->ipv4.dst;
+	} else {
+		rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
+			   | FLOW_TCF_ENCAP_IPV6_DST;
+		memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
+		memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
+	}
+	/* The rule is linked into the database only once it is applied. */
+	ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
+	if (ret) {
+		rte_free(rule);
+		return ret;
+	}
+	rule->refcnt++;
+	LIST_INSERT_HEAD(&vtep->local, rule, next);
+	return 0;
+}
+
+/**
+ * Manage the destination MAC/IP addresses neigh database, kernel uses
+ * this one to determine the destination MAC address within encapsulation
+ * header. Adds or removes the entries using the Netlink command like this:
+ *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
+ *
+ * The rules are reference counted in the vtep->neigh list: the Netlink
+ * request is emitted only when a rule is created and when its last
+ * reference is dropped; in the latter case the rule object is released.
+ * A request for an already known destination IP with a different
+ * destination MAC is rejected with EEXIST.
+ *
+ * @param[in] tcf
+ *   Libmnl socket context object.
+ * @param[in] vtep
+ *   VTEP object, contains rule database and ifouter index.
+ * @param[in] dev_flow
+ *   Flow object, contains the tunnel parameters (for encap only).
+ * @param[in] enable
+ *   Toggle between add and remove.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
+		     struct tcf_vtep *vtep,
+		     struct mlx5_flow *dev_flow,
+		     bool enable,
+		     struct rte_flow_error *error)
+{
+	const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
+	struct tcf_neigh_rule *rule;
+	bool found = false;
+	int ret;
+
+	assert(encap);
+	assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
+	/* Look for an existing rule with the same destination IP. */
+	if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
+		assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
+		LIST_FOREACH(rule, &vtep->neigh, next) {
+			if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
+			    encap->ipv4.dst == rule->ipv4.dst) {
+				found = true;
+				break;
+			}
+		}
+	} else {
+		assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
+		assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
+		LIST_FOREACH(rule, &vtep->neigh, next) {
+			if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
+			    !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
+				    sizeof(encap->ipv6.dst))) {
+				found = true;
+				break;
+			}
+		}
+	}
+	if (found) {
+		if (memcmp(&encap->eth.dst, &rule->eth,
+			   sizeof(encap->eth.dst))) {
+			DRV_LOG(WARNING, "Destination MAC differs"
+					 " in neigh rule");
+			rte_flow_error_set(error, EEXIST,
+					   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+					   NULL, "Different MAC address"
+					   " neigh rule for the same"
+					   " destination IP");
+			return -EEXIST;
+		}
+		if (enable) {
+			rule->refcnt++;
+			return 0;
+		}
+		if (!rule->refcnt || !--rule->refcnt) {
+			/*
+			 * Last reference is gone: unlink and release the
+			 * rule object, the Netlink request below only
+			 * needs the encap data.
+			 */
+			LIST_REMOVE(rule, next);
+			rte_free(rule);
+			return flow_tcf_rule_neigh(tcf, encap,
+						   vtep->ifouter,
+						   false, error);
+		}
+		return 0;
+	}
+	if (!enable) {
+		DRV_LOG(WARNING, "Disabling not existing neigh rule");
+		rte_flow_error_set(error, ENOENT,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				   "disabling not existing neigh rule");
+		return -ENOENT;
+	}
+	rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
+			   alignof(struct tcf_neigh_rule));
+	if (!rule) {
+		rte_flow_error_set(error, ENOMEM,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				   "unable to allocate memory for neigh rule");
+		return -rte_errno;
+	}
+	*rule = (struct tcf_neigh_rule){.refcnt = 0,
+					.mask = 0,
+					};
+	if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
+		rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
+		rule->ipv4.dst = encap->ipv4.dst;
+	} else {
+		rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
+		memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
+	}
+	memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
+	/* The rule is linked into the database only once it is applied. */
+	ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
+	if (ret) {
+		rte_free(rule);
+		return ret;
+	}
+	rule->refcnt++;
+	LIST_INSERT_HEAD(&vtep->neigh, rule, next);
+	return 0;
+}
+