1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
7 #include <libmnl/libmnl.h>
8 #include <linux/if_ether.h>
9 #include <linux/netlink.h>
10 #include <linux/pkt_cls.h>
11 #include <linux/pkt_sched.h>
12 #include <linux/rtnetlink.h>
17 #include <sys/socket.h>
19 #include <rte_byteorder.h>
20 #include <rte_errno.h>
25 /* Normally found in linux/netlink.h. */
26 #ifndef NETLINK_CAP_ACK
/* Fallback value, only defined when the system header does not provide it. */
27 #define NETLINK_CAP_ACK 10
30 /* Normally found in linux/pkt_sched.h. */
31 #ifndef TC_H_MIN_INGRESS
32 #define TC_H_MIN_INGRESS 0xfff2u
35 /* Normally found in linux/pkt_cls.h. */
36 #ifndef TCA_CLS_FLAGS_SKIP_SW
37 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
/*
 * NOTE(review): the two fallbacks below are keyed on HAVE_TCA_FLOWER_*
 * rather than on the names themselves. Presumably the kernel header
 * declares them as enum constants, which the preprocessor cannot test,
 * and the build system supplies the HAVE_* macros -- confirm.
 */
39 #ifndef HAVE_TCA_FLOWER_ACT
40 #define TCA_FLOWER_ACT 3
42 #ifndef HAVE_TCA_FLOWER_FLAGS
43 #define TCA_FLOWER_FLAGS 22
46 /** Parser state definitions for mlx5_nl_flow_trans[]. */
47 enum mlx5_nl_flow_trans {
58 #define TRANS(...) (const enum mlx5_nl_flow_trans []){ __VA_ARGS__, INVALID, }
60 #define PATTERN_COMMON \
62 #define ACTIONS_COMMON \
65 /** Parser state transitions used by mlx5_nl_flow_transpose(). */
66 static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
/*
 * Indexed by parser state. Each entry is built with TRANS(), which wraps
 * the listed states in a compound literal and appends INVALID as the
 * terminator.
 */
69 [ATTR] = TRANS(PATTERN),
70 [PATTERN] = TRANS(PATTERN_COMMON),
/*
 * NOTE(review): the VOID entries list only BACK, presumably meaning the
 * parser resumes the previous transition list -- confirm against
 * mlx5_nl_flow_transpose().
 */
71 [ITEM_VOID] = TRANS(BACK),
72 [ACTIONS] = TRANS(ACTIONS_COMMON),
73 [ACTION_VOID] = TRANS(BACK),
78 * Transpose flow rule description to rtnetlink message.
80 * This function transposes a flow rule description to a traffic control
81 * (TC) filter creation message ready to be sent over Netlink.
83 * Target interface is specified as the first entry of the @p ptoi table.
84 * Subsequent entries enable this function to resolve other DPDK port IDs
85 * found in the flow rule.
88 * Output message buffer. May be NULL when @p size is 0.
90 * Size of @p buf. Message may be truncated if not large enough.
92 * DPDK port ID to network interface index translation table. This table
93 * is terminated by an entry with a zero ifindex value.
95 * Flow rule attributes.
97 * Pattern specification.
101 * Perform verbose error reporting if not NULL.
104 * A positive value representing the exact size of the message in bytes
105 * regardless of the @p size parameter on success, a negative errno value
106 * otherwise and rte_errno is set.
109 mlx5_nl_flow_transpose(void *buf,
111 const struct mlx5_nl_flow_ptoi *ptoi,
112 const struct rte_flow_attr *attr,
113 const struct rte_flow_item *pattern,
114 const struct rte_flow_action *actions,
115 struct rte_flow_error *error)
/*
 * Local scratch buffer sized for a tcmsg plus 1024 bytes.
 * NOTE(review): the check of buf against buf_tmp near the end of the
 * function suggests it serves as a fallback when the buffer supplied by
 * the caller is too small; the switch-over is not fully visible -- confirm.
 */
117 alignas(struct nlmsghdr)
118 uint8_t buf_tmp[mnl_nlmsg_size(sizeof(struct tcmsg) + 1024)];
119 const struct rte_flow_item *item;
120 const struct rte_flow_action *action;
122 struct nlattr *na_flower;
123 struct nlattr *na_flower_act;
124 const enum mlx5_nl_flow_trans *trans;
125 const enum mlx5_nl_flow_trans *back;
134 na_flower_act = NULL;
/* Dispatch on the next candidate state of the current transition list. */
138 switch (trans[n++]) {
139 struct nlmsghdr *nlh;
/*
 * Three ENOTSUP reports follow: one pointing at the current pattern item,
 * one pointing at the current action (taken when action->type is
 * non-zero), and a generic one for rules lacking a fate action.
 */
144 return rte_flow_error_set
145 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
146 item, "unsupported pattern item combination");
147 else if (action->type)
148 return rte_flow_error_set
149 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
150 action, "unsupported action combination");
151 return rte_flow_error_set
152 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
153 "flow rule lacks some kind of fate action");
160 * Supported attributes: no groups, some priorities and
161 * ingress only. Don't care about transfer as it is the
165 return rte_flow_error_set
167 RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
168 attr, "groups are not supported");
169 if (attr->priority > 0xfffe)
170 return rte_flow_error_set
172 RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
173 attr, "lowest priority level is 0xfffe");
175 return rte_flow_error_set
177 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
178 attr, "only ingress is supported");
180 return rte_flow_error_set
182 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
183 attr, "egress is not supported");
184 if (size < mnl_nlmsg_size(sizeof(*tcm)))
186 nlh = mnl_nlmsg_put_header(buf);
188 nlh->nlmsg_flags = 0;
190 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
191 tcm->tcm_family = AF_UNSPEC;
192 tcm->tcm_ifindex = ptoi[0].ifindex;
194 * Let kernel pick a handle by default. A predictable handle
195 * can be set by the caller on the resulting buffer through
196 * mlx5_nl_flow_brand().
199 tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
201 * Priority cannot be zero to prevent the kernel from
202 * picking one automatically.
204 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
205 RTE_BE16(ETH_P_ALL));
208 if (!mnl_attr_put_strz_check(buf, size, TCA_KIND, "flower"))
210 na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS);
213 if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS,
214 TCA_CLS_FLAGS_SKIP_SW))
218 if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
223 if (item->type != RTE_FLOW_ITEM_TYPE_END)
226 assert(!na_flower_act);
228 mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT);
233 if (action->type != RTE_FLOW_ACTION_TYPE_VOID)
238 if (item->type != RTE_FLOW_ITEM_TYPE_END ||
239 action->type != RTE_FLOW_ACTION_TYPE_END)
242 mnl_attr_nest_end(buf, na_flower_act);
244 mnl_attr_nest_end(buf, na_flower);
246 return nlh->nlmsg_len;
249 trans = mlx5_nl_flow_trans[trans[n - 1]];
253 if (buf != buf_tmp) {
255 size = sizeof(buf_tmp);
258 return rte_flow_error_set
259 (error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
260 "generated TC message is too large");
264 * Brand rtnetlink buffer with unique handle.
266 * This handle should be unique for a given network interface to avoid
270 * Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
272 * Unique 32-bit handle to use.
275 mlx5_nl_flow_brand(void *buf, uint32_t handle)
/* The tcmsg header is the payload of the Netlink message stored in buf. */
277 struct tcmsg *tcm = mnl_nlmsg_get_payload(buf);
/* Only the handle field is overwritten by the visible code. */
279 tcm->tcm_handle = handle;
283 * Send Netlink message with acknowledgment.
286 * Libmnl socket to use.
288 * Message to send. This function always raises the NLM_F_ACK flag before
292 * 0 on success, a negative errno value otherwise and rte_errno is set.
295 mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
/*
 * Reply buffer: a variable-length array sized for an nlmsgerr message plus
 * the payload length of the request (nlmsg_len minus the Netlink header).
 */
297 alignas(struct nlmsghdr)
298 uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
299 nlh->nlmsg_len - sizeof(*nlh)];
/* Random sequence number, stored in the request and reused below. */
300 uint32_t seq = random();
/* Modifies the request in place: NLM_F_ACK is added to the existing flags. */
303 nlh->nlmsg_flags |= NLM_F_ACK;
304 nlh->nlmsg_seq = seq;
305 ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
/* A single receive into ans; ret then carries the received length. */
307 ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
/*
 * The reply is processed with the sequence number chosen above and the
 * port ID of the socket; no callback or user data is supplied.
 */
310 (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
318 * Create a Netlink flow rule.
321 * Libmnl socket to use.
323 * Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
325 * Perform verbose error reporting if not NULL.
328 * 0 on success, a negative errno value otherwise and rte_errno is set.
331 mlx5_nl_flow_create(struct mnl_socket *nl, void *buf,
332 struct rte_flow_error *error)
/* buf is used directly as the Netlink message header. */
334 struct nlmsghdr *nlh = buf;
/*
 * Stamp the prepared message as a filter creation request. The flags are
 * assigned, not OR-ed, so any flags previously stored in the buffer are
 * replaced; NLM_F_CREATE | NLM_F_EXCL asks for a new, exclusive entry.
 */
336 nlh->nlmsg_type = RTM_NEWTFILTER;
337 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
/* mlx5_nl_flow_nl_ack() returns 0 on success and sets rte_errno otherwise. */
338 if (!mlx5_nl_flow_nl_ack(nl, nlh))
340 return rte_flow_error_set
341 (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
342 "netlink: failed to create TC flow rule");
346 * Destroy a Netlink flow rule.
349 * Libmnl socket to use.
351 * Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
353 * Perform verbose error reporting if not NULL.
356 * 0 on success, a negative errno value otherwise and rte_errno is set.
359 mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf,
360 struct rte_flow_error *error)
/* buf is used directly as the Netlink message header. */
362 struct nlmsghdr *nlh = buf;
/*
 * Stamp the prepared message as a filter deletion request. The flags are
 * assigned, not OR-ed, so any flags previously stored in the buffer are
 * replaced.
 */
364 nlh->nlmsg_type = RTM_DELTFILTER;
365 nlh->nlmsg_flags = NLM_F_REQUEST;
/*
 * mlx5_nl_flow_nl_ack() returns 0 on success and reports failures through
 * rte_errno. The error path therefore forwards rte_errno, as
 * mlx5_nl_flow_create() does, instead of the C library errno, which is
 * not the value documented to describe the Netlink failure.
 */
366 if (!mlx5_nl_flow_nl_ack(nl, nlh))
368 return rte_flow_error_set
369 (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
370 "netlink: failed to destroy TC flow rule");
374 * Initialize ingress qdisc of a given network interface.
377 * Libmnl socket of the @p NETLINK_ROUTE kind.
379 * Index of network interface to initialize.
381 * Perform verbose error reporting if not NULL.
384 * 0 on success, a negative errno value otherwise and rte_errno is set.
387 mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
388 struct rte_flow_error *error)
390 struct nlmsghdr *nlh;
/*
 * One stack buffer, reused for both requests below: room for a tcmsg
 * header plus 128 bytes of attributes.
 */
392 alignas(struct nlmsghdr)
393 uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];
395 /* Destroy existing ingress qdisc and everything attached to it. */
396 nlh = mnl_nlmsg_put_header(buf);
397 nlh->nlmsg_type = RTM_DELQDISC;
398 nlh->nlmsg_flags = NLM_F_REQUEST;
399 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
400 tcm->tcm_family = AF_UNSPEC;
401 tcm->tcm_ifindex = ifindex;
402 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
403 tcm->tcm_parent = TC_H_INGRESS;
404 /* Ignore errors when qdisc is already absent. */
/* Concretely: a failure is tolerated when rte_errno is EINVAL or ENOENT. */
405 if (mlx5_nl_flow_nl_ack(nl, nlh) &&
406 rte_errno != EINVAL && rte_errno != ENOENT)
407 return rte_flow_error_set
408 (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
409 NULL, "netlink: failed to remove ingress qdisc");
410 /* Create fresh ingress qdisc. */
/* The header is rebuilt from scratch at the start of the same buffer. */
411 nlh = mnl_nlmsg_put_header(buf);
412 nlh->nlmsg_type = RTM_NEWQDISC;
413 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
414 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
415 tcm->tcm_family = AF_UNSPEC;
416 tcm->tcm_ifindex = ifindex;
417 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
418 tcm->tcm_parent = TC_H_INGRESS;
/*
 * NOTE(review): the result of mnl_attr_put_strz_check() is not tested.
 * The attribute is a short string and buf reserves 128 bytes beyond the
 * tcmsg header, so the bounds check is not expected to fail here.
 */
419 mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
/* Unlike the removal above, any failure to create the qdisc is reported. */
420 if (mlx5_nl_flow_nl_ack(nl, nlh))
421 return rte_flow_error_set
422 (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
423 NULL, "netlink: failed to create ingress qdisc");
428 * Create and configure a libmnl socket for Netlink flow rules.
431 * A valid libmnl socket object pointer on success, NULL otherwise and
435 mlx5_nl_flow_socket_create(void)
/* Open a socket of the NETLINK_ROUTE family. */
437 struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
/*
 * Enable NETLINK_CAP_ACK on the socket; the result of the call is not
 * tested (NOTE(review): presumably best effort -- confirm). The bind that
 * follows lets libmnl pick the port ID (MNL_SOCKET_AUTOPID) and returns 0
 * on success.
 */
440 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
442 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
/* NOTE(review): presumably the failure path, which releases the socket -- confirm. */
447 mnl_socket_close(nl);
452 * Destroy a libmnl socket.
455 mlx5_nl_flow_socket_destroy(struct mnl_socket *nl)
/* Thin wrapper: the result of mnl_socket_close() is not propagated. */
457 mnl_socket_close(nl);