1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
7 #include <libmnl/libmnl.h>
8 #include <linux/if_ether.h>
9 #include <linux/netlink.h>
10 #include <linux/pkt_cls.h>
11 #include <linux/pkt_sched.h>
12 #include <linux/rtnetlink.h>
13 #include <linux/tc_act/tc_gact.h>
14 #include <linux/tc_act/tc_mirred.h>
19 #include <sys/socket.h>
21 #include <rte_byteorder.h>
22 #include <rte_errno.h>
/*
 * Compatibility fallbacks: definitions normally provided by kernel uAPI
 * headers, supplied here so the PMD builds against older headers.
 * NOTE(review): the closing #endif lines appear to have been dropped from
 * this listing (embedded line numbers are discontinuous) — verify against
 * the full file.
 */
27 /* Normally found in linux/netlink.h. */
28 #ifndef NETLINK_CAP_ACK
29 #define NETLINK_CAP_ACK 10
32 /* Normally found in linux/pkt_sched.h. */
33 #ifndef TC_H_MIN_INGRESS
34 #define TC_H_MIN_INGRESS 0xfff2u
37 /* Normally found in linux/pkt_cls.h. */
38 #ifndef TCA_CLS_FLAGS_SKIP_SW
39 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
41 #ifndef HAVE_TCA_FLOWER_ACT
42 #define TCA_FLOWER_ACT 3
44 #ifndef HAVE_TCA_FLOWER_FLAGS
45 #define TCA_FLOWER_FLAGS 22
48 /** Parser state definitions for mlx5_nl_flow_trans[]. */
49 enum mlx5_nl_flow_trans {
/*
 * NOTE(review): the enumerator list (INVALID, BACK, ATTR, PATTERN,
 * ITEM_VOID, ACTIONS, ACTION_VOID, ACTION_PORT_ID, ACTION_DROP, END, ...)
 * and the closing brace appear to have been dropped from this listing —
 * the table below indexes by these names.
 */
/* Build a NULL-ish (INVALID-terminated) transition list as a compound literal. */
62 #define TRANS(...) (const enum mlx5_nl_flow_trans []){ __VA_ARGS__, INVALID, }
/* Shorthand state groups shared by several table entries (bodies continue
 * on dropped lines). */
64 #define PATTERN_COMMON \
66 #define ACTIONS_COMMON \
68 #define ACTIONS_FATE \
69 ACTION_PORT_ID, ACTION_DROP
71 /** Parser state transitions used by mlx5_nl_flow_transpose(). */
72 static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
75 [ATTR] = TRANS(PATTERN),
76 [PATTERN] = TRANS(PATTERN_COMMON),
77 [ITEM_VOID] = TRANS(BACK),
78 [ACTIONS] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
79 [ACTION_VOID] = TRANS(BACK),
80 [ACTION_PORT_ID] = TRANS(ACTION_VOID, END),
81 [ACTION_DROP] = TRANS(ACTION_VOID, END),
86 * Transpose flow rule description to rtnetlink message.
88 * This function transposes a flow rule description to a traffic control
89 * (TC) filter creation message ready to be sent over Netlink.
91 * Target interface is specified as the first entry of the @p ptoi table.
92 * Subsequent entries enable this function to resolve other DPDK port IDs
93 * found in the flow rule.
96 * Output message buffer. May be NULL when @p size is 0.
98 * Size of @p buf. Message may be truncated if not large enough.
100 * DPDK port ID to network interface index translation table. This table
101 * is terminated by an entry with a zero ifindex value.
103 * Flow rule attributes.
105 * Pattern specification.
107 * Associated actions.
109 * Perform verbose error reporting if not NULL.
112 * A positive value representing the exact size of the message in bytes
113 * regardless of the @p size parameter on success, a negative errno value
114 * otherwise and rte_errno is set.
/*
 * Transpose an rte_flow rule into a TC "flower" filter rtnetlink message.
 * Driven by the mlx5_nl_flow_trans[] state machine; returns the exact
 * message size even when @p size is too small (caller can then retry).
 * NOTE(review): this listing has dropped interior lines (discontinuous
 * embedded numbering) — several declarations (tcm, act, i, conf, n, size),
 * case labels, and braces are not visible here; code kept verbatim.
 */
117 mlx5_nl_flow_transpose(void *buf,
119 const struct mlx5_nl_flow_ptoi *ptoi,
120 const struct rte_flow_attr *attr,
121 const struct rte_flow_item *pattern,
122 const struct rte_flow_action *actions,
123 struct rte_flow_error *error)
/* Scratch buffer used to measure the message when the caller's is small. */
125 alignas(struct nlmsghdr)
126 uint8_t buf_tmp[mnl_nlmsg_size(sizeof(struct tcmsg) + 1024)];
127 const struct rte_flow_item *item;
128 const struct rte_flow_action *action;
130 uint32_t act_index_cur;
131 struct nlattr *na_flower;
132 struct nlattr *na_flower_act;
133 const enum mlx5_nl_flow_trans *trans;
134 const enum mlx5_nl_flow_trans *back;
144 na_flower_act = NULL;
/* Dispatch on the next parser state from the transition list. */
148 switch (trans[n++]) {
150 const struct rte_flow_action_port_id *port_id;
152 struct nlmsghdr *nlh;
154 struct nlattr *act_index;
/* INVALID state: report which part of the rule could not be handled. */
160 return rte_flow_error_set
161 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
162 item, "unsupported pattern item combination");
163 else if (action->type)
164 return rte_flow_error_set
165 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
166 action, "unsupported action combination");
167 return rte_flow_error_set
168 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
169 "flow rule lacks some kind of fate action");
/* ATTR state: validate flow rule attributes. */
176 * Supported attributes: no groups, some priorities and
177 * ingress only. Don't care about transfer as it is the
/* NOTE(review): comment truncated in this listing. */
181 return rte_flow_error_set
183 RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
184 attr, "groups are not supported");
185 if (attr->priority > 0xfffe)
186 return rte_flow_error_set
188 RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
189 attr, "lowest priority level is 0xfffe");
191 return rte_flow_error_set
193 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
194 attr, "only ingress is supported");
196 return rte_flow_error_set
198 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
199 attr, "egress is not supported");
/* Emit Netlink and tcmsg headers targeting the first ptoi[] entry. */
200 if (size < mnl_nlmsg_size(sizeof(*tcm)))
202 nlh = mnl_nlmsg_put_header(buf)
204 nlh->nlmsg_flags = 0;
206 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
207 tcm->tcm_family = AF_UNSPEC;
208 tcm->tcm_ifindex = ptoi[0].ifindex;
210 * Let kernel pick a handle by default. A predictable handle
211 * can be set by the caller on the resulting buffer through
212 * mlx5_nl_flow_brand().
215 tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
217 * Priority cannot be zero to prevent the kernel from
218 * picking one automatically.
220 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
221 RTE_BE16(ETH_P_ALL));
/* Start the "flower" classifier and request hardware-only offload. */
224 if (!mnl_attr_put_strz_check(buf, size, TCA_KIND, "flower"))
226 na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS);
229 if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS,
230 TCA_CLS_FLAGS_SKIP_SW))
/* Pattern item states: only VOID is consumed here. */
234 if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
239 if (item->type != RTE_FLOW_ITEM_TYPE_END)
242 assert(!na_flower_act);
244 mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT);
/* Action states. */
250 if (action->type != RTE_FLOW_ACTION_TYPE_VOID)
/* PORT_ID fate action: translate DPDK port ID to ifindex via ptoi[]
 * and emit a "mirred" egress-redirect action. */
255 if (action->type != RTE_FLOW_ACTION_TYPE_PORT_ID)
257 conf.port_id = action->conf;
258 if (conf.port_id->original)
261 for (i = 0; ptoi[i].ifindex; ++i)
262 if (ptoi[i].port_id == conf.port_id->id)
264 if (!ptoi[i].ifindex)
265 return rte_flow_error_set
266 (error, ENODEV, RTE_FLOW_ERROR_TYPE_ACTION_CONF,
268 "missing data to convert port ID to ifindex");
270 mnl_attr_nest_start_check(buf, size, act_index_cur++);
272 !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "mirred"))
274 act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS);
277 if (!mnl_attr_put_check(buf, size, TCA_MIRRED_PARMS,
278 sizeof(struct tc_mirred),
280 .action = TC_ACT_STOLEN,
281 .eaction = TCA_EGRESS_REDIR,
282 .ifindex = ptoi[i].ifindex,
285 mnl_attr_nest_end(buf, act);
286 mnl_attr_nest_end(buf, act_index);
/* DROP fate action: emit a "gact" shoot action. */
290 if (action->type != RTE_FLOW_ACTION_TYPE_DROP)
293 mnl_attr_nest_start_check(buf, size, act_index_cur++);
295 !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "gact"))
297 act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS);
300 if (!mnl_attr_put_check(buf, size, TCA_GACT_PARMS,
301 sizeof(struct tc_gact),
303 .action = TC_ACT_SHOT,
306 mnl_attr_nest_end(buf, act);
307 mnl_attr_nest_end(buf, act_index);
/* END state: both lists must be exhausted; close nests, return size. */
311 if (item->type != RTE_FLOW_ITEM_TYPE_END ||
312 action->type != RTE_FLOW_ACTION_TYPE_END)
315 mnl_attr_nest_end(buf, na_flower_act);
317 mnl_attr_nest_end(buf, na_flower);
319 return nlh->nlmsg_len;
/* Follow the transition list for the state just handled. */
322 trans = mlx5_nl_flow_trans[trans[n - 1]];
/* Output overflow: redo into buf_tmp purely to compute the size,
 * then report ENOBUFS if even that is too small. */
326 if (buf != buf_tmp) {
328 size = sizeof(buf_tmp);
331 return rte_flow_error_set
332 (error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
333 "generated TC message is too large");
337 * Brand rtnetlink buffer with unique handle.
339 * This handle should be unique for a given network interface to avoid
343 * Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
345 * Unique 32-bit handle to use.
/*
 * Stamp a caller-chosen, per-interface-unique handle into the tcmsg
 * payload of a message produced by mlx5_nl_flow_transpose(), overriding
 * the kernel-assigned handle.
 * NOTE(review): braces/return-type lines dropped from this listing.
 */
348 mlx5_nl_flow_brand(void *buf, uint32_t handle)
350 struct tcmsg *tcm = mnl_nlmsg_get_payload(buf);
352 tcm->tcm_handle = handle;
356 * Send Netlink message with acknowledgment.
359 * Libmnl socket to use.
361 * Message to send. This function always raises the NLM_F_ACK flag before
365 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Send @p nlh with NLM_F_ACK forced on and wait for the kernel's reply,
 * matching it by the random sequence number and socket port ID.
 * NOTE(review): interior lines dropped from this listing (declaration of
 * ret, the mnl_cb_run() call name on the line before 383, error paths).
 * The reply buffer is a VLA sized from nlh->nlmsg_len — assumes messages
 * stay small (they are built from fixed-size buffers in this file).
 */
368 mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
370 alignas(struct nlmsghdr)
371 uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
372 nlh->nlmsg_len - sizeof(*nlh)];
373 uint32_t seq = random();
376 nlh->nlmsg_flags |= NLM_F_ACK;
377 nlh->nlmsg_seq = seq;
378 ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
380 ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
383 (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
391 * Create a Netlink flow rule.
394 * Libmnl socket to use.
396 * Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
398 * Perform verbose error reporting if not NULL.
401 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * Install a TC filter from a transposed buffer: RTM_NEWTFILTER with
 * CREATE|EXCL so an existing identical rule is an error. On failure,
 * rte_errno set by mlx5_nl_flow_nl_ack() is forwarded to the caller.
 * NOTE(review): the "return 0;" success path after the if appears to be
 * on a line dropped from this listing.
 */
404 mlx5_nl_flow_create(struct mnl_socket *nl, void *buf,
405 struct rte_flow_error *error)
407 struct nlmsghdr *nlh = buf;
409 nlh->nlmsg_type = RTM_NEWTFILTER;
410 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
411 if (!mlx5_nl_flow_nl_ack(nl, nlh))
413 return rte_flow_error_set
414 (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
415 "netlink: failed to create TC flow rule");
419 * Destroy a Netlink flow rule.
422 * Libmnl socket to use.
424 * Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
426 * Perform verbose error reporting if not NULL.
429 * 0 on success, a negative errno value otherwise and rte_errno is set.
432 mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf,
433 struct rte_flow_error *error)
435 struct nlmsghdr *nlh = buf;
437 nlh->nlmsg_type = RTM_DELTFILTER;
438 nlh->nlmsg_flags = NLM_F_REQUEST;
439 if (!mlx5_nl_flow_nl_ack(nl, nlh))
441 return rte_flow_error_set
442 (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
443 "netlink: failed to destroy TC flow rule");
447 * Initialize ingress qdisc of a given network interface.
450 * Libmnl socket of the @p NETLINK_ROUTE kind.
452 * Index of network interface to initialize.
454 * Perform verbose error reporting if not NULL.
457 * 0 on success, a negative errno value otherwise and rte_errno is set.
/*
 * (Re)create the ingress qdisc on @p ifindex: first delete any existing
 * one (tolerating EINVAL/ENOENT when absent), then create a fresh one so
 * filters can be attached from a clean state.
 * NOTE(review): lines dropped from this listing — declaration of tcm and
 * the final "return 0;" are not visible here.
 */
460 mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
461 struct rte_flow_error *error)
463 struct nlmsghdr *nlh;
465 alignas(struct nlmsghdr)
466 uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];
468 /* Destroy existing ingress qdisc and everything attached to it. */
469 nlh = mnl_nlmsg_put_header(buf);
470 nlh->nlmsg_type = RTM_DELQDISC;
471 nlh->nlmsg_flags = NLM_F_REQUEST;
472 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
473 tcm->tcm_family = AF_UNSPEC;
474 tcm->tcm_ifindex = ifindex;
475 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
476 tcm->tcm_parent = TC_H_INGRESS;
477 /* Ignore errors when qdisc is already absent. */
478 if (mlx5_nl_flow_nl_ack(nl, nlh) &&
479 rte_errno != EINVAL && rte_errno != ENOENT)
480 return rte_flow_error_set
481 (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
482 NULL, "netlink: failed to remove ingress qdisc");
483 /* Create fresh ingress qdisc. */
484 nlh = mnl_nlmsg_put_header(buf);
485 nlh->nlmsg_type = RTM_NEWQDISC;
486 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
487 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
488 tcm->tcm_family = AF_UNSPEC;
489 tcm->tcm_ifindex = ifindex;
490 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
491 tcm->tcm_parent = TC_H_INGRESS;
/* Unchecked on purpose: buf is sized large enough for this attribute;
 * a failure would surface via the kernel NACK below anyway. */
492 mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
493 if (mlx5_nl_flow_nl_ack(nl, nlh))
494 return rte_flow_error_set
495 (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
496 NULL, "netlink: failed to create ingress qdisc");
501 * Create and configure a libmnl socket for Netlink flow rules.
504 * A valid libmnl socket object pointer on success, NULL otherwise and
/*
 * Open a NETLINK_ROUTE libmnl socket, enable NETLINK_CAP_ACK (kernel
 * sends capped, payload-less ACK errors) and bind it with an automatic
 * port ID. Returns the socket, or NULL on failure (per doc above).
 * NOTE(review): interior lines dropped from this listing (null check on
 * nl, setsockopt length argument, rte_errno assignment, returns).
 */
508 mlx5_nl_flow_socket_create(void)
510 struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
513 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
515 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
520 mnl_socket_close(nl);
525 * Destroy a libmnl socket.
/*
 * Close a socket obtained from mlx5_nl_flow_socket_create().
 * NOTE(review): braces/return-type lines dropped from this listing.
 */
528 mlx5_nl_flow_socket_destroy(struct mnl_socket *nl)
530 mnl_socket_close(nl);