From: Pascal Mazon Date: Thu, 23 Mar 2017 08:42:11 +0000 (+0100) Subject: net/tap: add remote netdevice traffic capture X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=2bc06869cd94195e986cfb7939a549d7050097e8;p=dpdk.git net/tap: add remote netdevice traffic capture By default, a tap netdevice is of no use when not fed by a separate process. The ability to automatically feed it from another netdevice allows applications to capture any kind of traffic normally destined to the kernel stack. This patch implements this ability through a new optional "remote" parameter. Packets matching filtering rules created with the flow API are matched on the remote device and redirected to the tap PMD, where the relevant action will be performed. Signed-off-by: Pascal Mazon Acked-by: Olga Shern Acked-by: Keith Wiles --- diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst index 4986e47e9f..5c5ba5357b 100644 --- a/doc/guides/nics/tap.rst +++ b/doc/guides/nics/tap.rst @@ -58,6 +58,22 @@ needed, but the interface does not enforce that speed, for example:: --vdev=net_tap0,iface=foo0,speed=25000 +It is possible to specify a remote netdevice to capture packets from by adding +``remote=foo1``, for example:: + + --vdev=net_tap,iface=tap0,remote=foo1 + +If a ``remote`` is set, the tap MAC address will be set to match the remote one +just after netdevice creation. Using TC rules, traffic from the remote netdevice +will be redirected to the tap. If the tap is in promiscuous mode, then all +packets will be redirected. In allmulti mode, all multicast packets will be +redirected. + +Using the remote feature is especially useful for capturing traffic from a +netdevice that has no support in the DPDK. It is possible to add explicit +rte_flow rules on the tap PMD to capture specific traffic (see next section for +examples). + After the DPDK application is started you can send and receive packets on the interface using the standard rx_burst/tx_burst APIs in DPDK. 
From the host point of view you can use any host tool like tcpdump, Wireshark, ping, Pktgen diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c index 61e5ffcdd0..1570aef297 100644 --- a/drivers/net/tap/rte_eth_tap.c +++ b/drivers/net/tap/rte_eth_tap.c @@ -69,6 +69,7 @@ #define ETH_TAP_IFACE_ARG "iface" #define ETH_TAP_SPEED_ARG "speed" +#define ETH_TAP_REMOTE_ARG "remote" #ifdef IFF_MULTI_QUEUE #define RTE_PMD_TAP_MAX_QUEUES 16 @@ -84,6 +85,7 @@ static struct rte_vdev_driver pmd_tap_drv; static const char *valid_arguments[] = { ETH_TAP_IFACE_ARG, ETH_TAP_SPEED_ARG, + ETH_TAP_REMOTE_ARG, NULL }; @@ -243,10 +245,43 @@ tun_alloc(struct pmd_internals *pmd, uint16_t qid) pmd->name); return fd; } + if (pmd->remote_if_index) { + /* + * Flush usually returns negative value because it tries + * to delete every QDISC (and on a running device, one + * QDISC at least is needed). Ignore negative return + * value. + */ + qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index); + if (qdisc_create_ingress(pmd->nlsk_fd, + pmd->remote_if_index) < 0) + goto remote_fail; + LIST_INIT(&pmd->implicit_flows); + if (tap_flow_implicit_create( + pmd, TAP_REMOTE_LOCAL_MAC) < 0) + goto remote_fail; + if (tap_flow_implicit_create( + pmd, TAP_REMOTE_BROADCAST) < 0) + goto remote_fail; + if (tap_flow_implicit_create( + pmd, TAP_REMOTE_BROADCASTV6) < 0) + goto remote_fail; + if (tap_flow_implicit_create( + pmd, TAP_REMOTE_TX) < 0) + goto remote_fail; + } } return fd; +remote_fail: + RTE_LOG(ERR, PMD, + "Could not set up remote flow rules for %s: remote disabled.\n", + pmd->name); + pmd->remote_if_index = 0; + tap_flow_implicit_flush(pmd, NULL); + return fd; + error: if (fd > 0) close(fd); @@ -402,8 +437,17 @@ tap_ioctl(struct pmd_internals *pmd, unsigned long request, struct ifreq *ifr, int set) { short req_flags = ifr->ifr_flags; + int remote = !!pmd->remote_if_index; - snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name); + /* + * If there is a remote netdevice, apply ioctl on it, 
then apply it on + * the tap netdevice. + */ +apply: + if (remote) + snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->remote_iface); + else + snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name); switch (request) { case SIOCSIFFLAGS: /* fetch current flags to leave other flags untouched */ @@ -415,6 +459,12 @@ tap_ioctl(struct pmd_internals *pmd, unsigned long request, ifr->ifr_flags &= ~req_flags; break; case SIOCGIFHWADDR: + /* Set remote MAC on the tap netdevice */ + if (!remote && pmd->remote_if_index) { + request = SIOCSIFHWADDR; + goto apply; + } + break; case SIOCSIFHWADDR: case SIOCSIFMTU: break; @@ -425,6 +475,8 @@ tap_ioctl(struct pmd_internals *pmd, unsigned long request, } if (ioctl(pmd->ioctl_sock, request, ifr) < 0) goto error; + if (remote--) + goto apply; return 0; error: @@ -585,6 +637,7 @@ tap_dev_close(struct rte_eth_dev *dev __rte_unused) tap_link_set_down(dev); tap_flow_flush(dev, NULL); + tap_flow_implicit_flush(internals, NULL); for (i = 0; i < internals->nb_queues; i++) { if (internals->rxq[i].fd != -1) @@ -635,6 +688,8 @@ tap_promisc_enable(struct rte_eth_dev *dev) dev->data->promiscuous = 1; tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1); + if (pmd->remote_if_index) + tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC); } static void @@ -645,6 +700,8 @@ tap_promisc_disable(struct rte_eth_dev *dev) dev->data->promiscuous = 0; tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0); + if (pmd->remote_if_index) + tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC); } static void @@ -655,6 +712,8 @@ tap_allmulti_enable(struct rte_eth_dev *dev) dev->data->all_multicast = 1; tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1); + if (pmd->remote_if_index) + tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI); } static void @@ -665,6 +724,8 @@ tap_allmulti_disable(struct rte_eth_dev *dev) dev->data->all_multicast = 0; tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0); + if (pmd->remote_if_index) + tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI); } @@ -982,7 +1043,7 @@ tap_kernel_support(struct 
pmd_internals *pmd) } static int -eth_dev_tap_create(const char *name, char *tap_name) +eth_dev_tap_create(const char *name, char *tap_name, char *remote_iface) { int numa_node = rte_socket_id(); struct rte_eth_dev *dev = NULL; @@ -1059,6 +1120,15 @@ eth_dev_tap_create(const char *name, char *tap_name) * creating/destroying flow rules. */ pmd->nlsk_fd = nl_init(); + if (strlen(remote_iface)) { + pmd->remote_if_index = if_nametoindex(remote_iface); + snprintf(pmd->remote_iface, RTE_ETH_NAME_MAX_LEN, + "%s", remote_iface); + if (!pmd->remote_if_index) + RTE_LOG(ERR, PMD, "Could not find %s ifindex: " + "remote interface will remain unconfigured\n", + remote_iface); + } return 0; @@ -1099,6 +1169,19 @@ set_interface_speed(const char *key __rte_unused, return 0; } +static int +set_remote_iface(const char *key __rte_unused, + const char *value, + void *extra_args) +{ + char *name = (char *)extra_args; + + if (value) + snprintf(name, RTE_ETH_NAME_MAX_LEN, "%s", value); + + return 0; +} + /* Open a TAP interface device. 
*/ static int @@ -1108,10 +1191,12 @@ rte_pmd_tap_probe(const char *name, const char *params) struct rte_kvargs *kvlist = NULL; int speed; char tap_name[RTE_ETH_NAME_MAX_LEN]; + char remote_iface[RTE_ETH_NAME_MAX_LEN]; speed = ETH_SPEED_NUM_10G; snprintf(tap_name, sizeof(tap_name), "%s%d", DEFAULT_TAP_NAME, tap_unit++); + memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN); if (params && (params[0] != '\0')) { RTE_LOG(DEBUG, PMD, "paramaters (%s)\n", params); @@ -1135,6 +1220,15 @@ rte_pmd_tap_probe(const char *name, const char *params) if (ret == -1) goto leave; } + + if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) { + ret = rte_kvargs_process(kvlist, + ETH_TAP_REMOTE_ARG, + &set_remote_iface, + remote_iface); + if (ret == -1) + goto leave; + } } } pmd_link.link_speed = speed; @@ -1142,7 +1236,7 @@ rte_pmd_tap_probe(const char *name, const char *params) RTE_LOG(NOTICE, PMD, "Initializing pmd_tap for %s as %s\n", name, tap_name); - ret = eth_dev_tap_create(name, tap_name); + ret = eth_dev_tap_create(name, tap_name, remote_iface); leave: if (ret == -1) { @@ -1175,6 +1269,7 @@ rte_pmd_tap_remove(const char *name) internals = eth_dev->data->dev_private; if (internals->flower_support && internals->nlsk_fd) { tap_flow_flush(eth_dev, NULL); + tap_flow_implicit_flush(internals, NULL); nl_final(internals->nlsk_fd); } for (i = 0; i < internals->nb_queues; i++) diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h index be4952c810..a559f6b0a3 100644 --- a/drivers/net/tap/rte_eth_tap.h +++ b/drivers/net/tap/rte_eth_tap.h @@ -75,15 +75,19 @@ struct tx_queue { }; struct pmd_internals { + char remote_iface[RTE_ETH_NAME_MAX_LEN]; /* Remote netdevice name */ char name[RTE_ETH_NAME_MAX_LEN]; /* Internal Tap device name */ uint16_t nb_queues; /* Number of queues supported */ struct ether_addr eth_addr; /* Mac address of the device port */ + int remote_if_index; /* remote netdevice IF_INDEX */ int if_index; /* IF_INDEX for the port */ int ioctl_sock; /* socket 
for ioctl calls */ int nlsk_fd; /* Netlink socket fd */ int flower_support; /* 1 if kernel supports, else 0 */ int flower_vlan_support; /* 1 if kernel supports, else 0 */ LIST_HEAD(tap_flows, rte_flow) flows; /* rte_flow rules */ + /* implicit rte_flow rules set when a remote device is active */ + LIST_HEAD(tap_implicit_flows, rte_flow) implicit_flows; struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */ struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */ }; diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c index 6adacdc22d..7f1693d404 100644 --- a/drivers/net/tap/tap_flow.c +++ b/drivers/net/tap/tap_flow.c @@ -82,6 +82,7 @@ enum { struct rte_flow { LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */ + struct rte_flow *remote_flow; /* associated remote flow */ struct nlmsg msg; }; @@ -92,6 +93,12 @@ struct convert_data { struct rte_flow *flow; }; +struct remote_rule { + struct rte_flow_attr attr; + struct rte_flow_item items[2]; + int mirred; +}; + static int tap_flow_create_eth(const struct rte_flow_item *item, void *data); static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data); static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data); @@ -249,6 +256,114 @@ static const struct tap_flow_items tap_flow_items[] = { }, }; +static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = { + [TAP_REMOTE_LOCAL_MAC] = { + .attr = { + .group = MAX_GROUP, + .priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC, + .ingress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .mask = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_REDIR, + }, + [TAP_REMOTE_BROADCAST] = { + .attr = { + .group = MAX_GROUP, + .priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST, + .ingress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .mask = 
&(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }, + .spec = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff", + }, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_MIRROR, + }, + [TAP_REMOTE_BROADCASTV6] = { + .attr = { + .group = MAX_GROUP, + .priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6, + .ingress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .mask = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00", + }, + .spec = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\x33\x33\x00\x00\x00\x00", + }, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_MIRROR, + }, + [TAP_REMOTE_PROMISC] = { + .attr = { + .group = MAX_GROUP, + .priority = PRIORITY_MASK - TAP_REMOTE_PROMISC, + .ingress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_VOID, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_MIRROR, + }, + [TAP_REMOTE_ALLMULTI] = { + .attr = { + .group = MAX_GROUP, + .priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI, + .ingress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .mask = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00", + }, + .spec = &(const struct rte_flow_item_eth){ + .dst.addr_bytes = "\x01\x00\x00\x00\x00\x00", + }, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_MIRROR, + }, + [TAP_REMOTE_TX] = { + .attr = { + .group = 0, + .priority = TAP_REMOTE_TX, + .egress = 1, + }, + .items[0] = { + .type = RTE_FLOW_ITEM_TYPE_VOID, + }, + .items[1] = { + .type = RTE_FLOW_ITEM_TYPE_END, + }, + .mirred = TCA_EGRESS_MIRROR, + }, +}; + /** * Make as much checks as possible on an Ethernet item, and if a flow is * provided, fill it appropriately with Ethernet info. 
@@ -672,6 +787,47 @@ add_action_gact(struct rte_flow *flow, int action) return 0; } +/** + * Transform a MIRRED action item in the provided flow for TC. + * + * @param[in, out] flow + * Flow to be filled. + * @param[in] ifindex + * Netdevice ifindex, where to mirror/redirect packet to. + * @param[in] action_type + * Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring. + * + * @return + * 0 if checks are alright, -1 otherwise. + */ +static int +add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type) +{ + struct nlmsg *msg = &flow->msg; + size_t act_index = 1; + struct tc_mirred p = { + .eaction = action_type, + .ifindex = ifindex, + }; + + if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0) + return -1; + if (nlattr_nested_start(msg, act_index++) < 0) + return -1; + nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred"); + if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0) + return -1; + if (action_type == TCA_EGRESS_MIRROR) + p.action = TC_ACT_PIPE; + else /* REDIRECT */ + p.action = TC_ACT_STOLEN; + nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p); + nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */ + nlattr_nested_finish(msg); /* nested act_index */ + nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */ + return 0; +} + /** * Transform a QUEUE action item in the provided flow for TC. * @@ -723,6 +879,15 @@ add_action_skbedit(struct rte_flow *flow, uint16_t queue) * Perform verbose error reporting if not NULL. * @param[in, out] flow * Flow structure to update. + * @param[in] mirred + * If set to TCA_EGRESS_REDIR, provided actions will be replaced with a + * redirection to the tap netdevice, and the TC rule will be configured + * on the remote netdevice in pmd. + * If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a + * mirroring to the tap netdevice, and the TC rule will be configured + * on the remote netdevice in pmd. Matching packets will thus be duplicated. 
+ * If set to 0, the standard behavior is to be used: set correct actions for + * the TC rule, and apply it on the tap netdevice. * * @return * 0 on success, a negative errno value otherwise and rte_errno is set. @@ -733,7 +898,8 @@ priv_flow_process(struct pmd_internals *pmd, const struct rte_flow_item items[], const struct rte_flow_action actions[], struct rte_flow_error *error, - struct rte_flow *flow) + struct rte_flow *flow, + int mirred) { const struct tap_flow_items *cur_item = tap_flow_items; struct convert_data data = { @@ -760,15 +926,21 @@ priv_flow_process(struct pmd_internals *pmd, flow->msg.t.tcm_info = TC_H_MAKE(prio << 16, flow->msg.t.tcm_info); } - if (!attr->ingress) { - rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR, - NULL, "direction should be ingress"); - return -rte_errno; - } - /* rte_flow ingress is actually egress as seen in the kernel */ - if (attr->ingress && flow) - flow->msg.t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0); if (flow) { + if (mirred) { + /* + * If attr->ingress, the rule applies on remote ingress + * to match incoming packets + * If attr->egress, the rule applies on tap ingress (as + * seen from the kernel) to deal with packets going out + * from the DPDK app. + */ + flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0); + } else { + /* Standard rule on tap egress (kernel standpoint). */ + flow->msg.t.tcm_parent = + TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0); + } /* use flower filter type */ nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower"); if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0) @@ -821,6 +993,22 @@ priv_flow_process(struct pmd_internals *pmd, data.eth_type); } } + if (mirred && flow) { + uint16_t if_index = pmd->if_index; + + /* + * If attr->egress && mirred, then this is a special + * case where the rule must be applied on the tap, to + * redirect packets coming from the DPDK App, out + * through the remote netdevice. 
+ */ + if (attr->egress) + if_index = pmd->remote_if_index; + if (add_action_mirred(flow, if_index, mirred) < 0) + goto exit_action_not_supported; + else + goto end; + } for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) { int err = 0; @@ -855,6 +1043,7 @@ priv_flow_process(struct pmd_internals *pmd, if (err) goto exit_action_not_supported; } +end: if (flow) nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */ return 0; @@ -885,7 +1074,7 @@ tap_flow_validate(struct rte_eth_dev *dev, { struct pmd_internals *pmd = dev->data->dev_private; - return priv_flow_process(pmd, attr, items, actions, error, NULL); + return priv_flow_process(pmd, attr, items, actions, error, NULL, 0); } /** @@ -933,6 +1122,7 @@ tap_flow_create(struct rte_eth_dev *dev, struct rte_flow_error *error) { struct pmd_internals *pmd = dev->data->dev_private; + struct rte_flow *remote_flow = NULL; struct rte_flow *flow = NULL; struct nlmsg *msg = NULL; int err; @@ -943,6 +1133,17 @@ tap_flow_create(struct rte_eth_dev *dev, "can't create rule, ifindex not found"); goto fail; } + /* + * No rules configured through standard rte_flow should be set on the + * priorities used by implicit rules. 
+ */ + if ((attr->group == MAX_GROUP) && + attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) { + rte_flow_error_set( + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY, + NULL, "priority value too big"); + goto fail; + } flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); if (!flow) { rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, @@ -954,7 +1155,7 @@ tap_flow_create(struct rte_eth_dev *dev, NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); tap_flow_set_handle(flow); - if (priv_flow_process(pmd, attr, items, actions, error, flow)) + if (priv_flow_process(pmd, attr, items, actions, error, flow, 0)) goto fail; err = nl_send(pmd->nlsk_fd, &msg->nh); if (err < 0) { @@ -969,25 +1170,76 @@ tap_flow_create(struct rte_eth_dev *dev, goto fail; } LIST_INSERT_HEAD(&pmd->flows, flow, next); + /** + * If a remote device is configured, a TC rule with identical items for + * matching must be set on that device, with a single action: redirect + * to the local pmd->if_index. 
+ */ + if (pmd->remote_if_index) { + remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); + if (!remote_flow) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, + "cannot allocate memory for rte_flow"); + goto fail; + } + msg = &remote_flow->msg; + /* set the rule if_index for the remote netdevice */ + tc_init_msg( + msg, pmd->remote_if_index, RTM_NEWTFILTER, + NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); + msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); + tap_flow_set_handle(remote_flow); + if (priv_flow_process(pmd, attr, items, NULL, + error, remote_flow, TCA_EGRESS_REDIR)) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "rte flow rule validation failed"); + goto fail; + } + err = nl_send(pmd->nlsk_fd, &msg->nh); + if (err < 0) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "Failure sending nl request"); + goto fail; + } + err = nl_recv_ack(pmd->nlsk_fd); + if (err < 0) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "overlapping rules"); + goto fail; + } + flow->remote_flow = remote_flow; + } return flow; fail: + if (remote_flow) + rte_free(remote_flow); if (flow) rte_free(flow); return NULL; } /** - * Destroy a flow. + * Destroy a flow using pointer to pmd_internal. * - * @see rte_flow_destroy() - * @see rte_flow_ops + * @param[in, out] pmd + * Pointer to private structure. + * @param[in] flow + * Pointer to the flow to destroy. + * @param[in, out] error + * Pointer to the flow error handler + * + * @return 0 if the flow could be destroyed, -1 otherwise. 
*/ static int -tap_flow_destroy(struct rte_eth_dev *dev, - struct rte_flow *flow, - struct rte_flow_error *error) +tap_flow_destroy_pmd(struct pmd_internals *pmd, + struct rte_flow *flow, + struct rte_flow_error *error) { - struct pmd_internals *pmd = dev->data->dev_private; + struct rte_flow *remote_flow = flow->remote_flow; int ret = 0; LIST_REMOVE(flow, next); @@ -1001,15 +1253,54 @@ tap_flow_destroy(struct rte_eth_dev *dev, goto end; } ret = nl_recv_ack(pmd->nlsk_fd); - if (ret < 0) + if (ret < 0) { rte_flow_error_set( error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, "couldn't receive kernel ack to our request"); + goto end; + } + if (remote_flow) { + remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER; + + ret = nl_send(pmd->nlsk_fd, &remote_flow->msg.nh); + if (ret < 0) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "Failure sending nl request"); + goto end; + } + ret = nl_recv_ack(pmd->nlsk_fd); + if (ret < 0) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "Failure trying to receive nl ack"); + goto end; + } + } end: + if (remote_flow) + rte_free(remote_flow); rte_free(flow); return ret; } +/** + * Destroy a flow. + * + * @see rte_flow_destroy() + * @see rte_flow_ops + */ +static int +tap_flow_destroy(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct rte_flow_error *error) +{ + struct pmd_internals *pmd = dev->data->dev_private; + + return tap_flow_destroy_pmd(pmd, flow, error); +} + /** * Destroy all flows. * @@ -1030,6 +1321,128 @@ tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error) return 0; } +/** + * Add an implicit flow rule on the remote device to make sure traffic gets to + * the tap netdevice from there. + * + * @param pmd + * Pointer to private structure. + * @param[in] idx + * The idx in the implicit_rte_flows array specifying which rule to apply. 
+ * + * @return -1 if the rule couldn't be applied, 0 otherwise. + */ +int tap_flow_implicit_create(struct pmd_internals *pmd, + enum implicit_rule_index idx) +{ + struct rte_flow_item *items = implicit_rte_flows[idx].items; + struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr; + struct rte_flow_item_eth eth_local = { .type = 0 }; + uint16_t if_index = pmd->remote_if_index; + struct rte_flow *remote_flow = NULL; + struct nlmsg *msg = NULL; + int err = 0; + struct rte_flow_item items_local[2] = { + [0] = { + .type = items[0].type, + .spec = &eth_local, + .mask = items[0].mask, + }, + [1] = { + .type = items[1].type, + } + }; + + remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); + if (!remote_flow) { + RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow"); + goto fail; + } + msg = &remote_flow->msg; + if (idx == TAP_REMOTE_TX) { + if_index = pmd->if_index; + } else if (idx == TAP_REMOTE_LOCAL_MAC) { + /* + * eth addr couldn't be set in implicit_rte_flows[] as it is not + * known at compile time. + */ + memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr)); + items = items_local; + } + tc_init_msg(msg, if_index, RTM_NEWTFILTER, + NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); + msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); + tap_flow_set_handle(remote_flow); + if (priv_flow_process(pmd, attr, items, NULL, NULL, + remote_flow, implicit_rte_flows[idx].mirred)) { + RTE_LOG(ERR, PMD, "rte flow rule validation failed\n"); + goto fail; + } + err = nl_send(pmd->nlsk_fd, &msg->nh); + if (err < 0) { + RTE_LOG(ERR, PMD, "Failure sending nl request"); + goto fail; + } + err = nl_recv_ack(pmd->nlsk_fd); + if (err < 0) { + RTE_LOG(ERR, PMD, + "Kernel refused TC filter rule creation"); + goto fail; + } + LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next); + return 0; +fail: + if (remote_flow) + rte_free(remote_flow); + return -1; +} + +/** + * Remove specific implicit flow rule on the remote device. 
+ * + * @param[in, out] pmd + * Pointer to private structure. + * @param[in] idx + * The idx in the implicit_rte_flows array specifying which rule to remove. + * + * @return -1 if the implicit rule couldn't be removed, 0 otherwise. + */ +int tap_flow_implicit_destroy(struct pmd_internals *pmd, + enum implicit_rule_index idx) +{ + struct rte_flow *remote_flow; + int cur_prio = -1; + int idx_prio = implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET; + + for (remote_flow = LIST_FIRST(&pmd->implicit_flows); + remote_flow; + remote_flow = LIST_NEXT(remote_flow, next)) { + cur_prio = (remote_flow->msg.t.tcm_info >> 16) & PRIORITY_MASK; + if (cur_prio != idx_prio) + continue; + return tap_flow_destroy_pmd(pmd, remote_flow, NULL); + } + return 0; +} + +/** + * Destroy all implicit flows. + * + * @see rte_flow_flush() + */ +int +tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error) +{ + struct rte_flow *remote_flow; + + while (!LIST_EMPTY(&pmd->implicit_flows)) { + remote_flow = LIST_FIRST(&pmd->implicit_flows); + if (tap_flow_destroy_pmd(pmd, remote_flow, error) < 0) + return -1; + } + return 0; +} + /** * Manage filter operations. * diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h index a05e945df5..94414f18f4 100644 --- a/drivers/net/tap/tap_flow.h +++ b/drivers/net/tap/tap_flow.h @@ -36,6 +36,7 @@ #include #include +#include /** * In TC, priority 0 means we require the kernel to allocate one for us. @@ -49,10 +50,33 @@ #define GROUP_SHIFT 12 #define MAX_GROUP GROUP_MASK +/** + * These indexes are actually in reversed order: their priority is processed + * by subtracting their value from the lowest priority (PRIORITY_MASK). + * Thus the first one will have the lowest priority in the end + * (but biggest value). 
+ */ +enum implicit_rule_index { + TAP_REMOTE_TX, + TAP_REMOTE_BROADCASTV6, + TAP_REMOTE_BROADCAST, + TAP_REMOTE_ALLMULTI, + TAP_REMOTE_PROMISC, + TAP_REMOTE_LOCAL_MAC, + TAP_REMOTE_MAX_IDX, +}; + int tap_dev_filter_ctrl(struct rte_eth_dev *dev, enum rte_filter_type filter_type, enum rte_filter_op filter_op, void *arg); int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error); +int tap_flow_implicit_create(struct pmd_internals *pmd, + enum implicit_rule_index idx); +int tap_flow_implicit_destroy(struct pmd_internals *pmd, + enum implicit_rule_index idx); +int tap_flow_implicit_flush(struct pmd_internals *pmd, + struct rte_flow_error *error); + #endif /* _TAP_FLOW_H_ */