/*
 * net/mlx5: add VLAN item and actions to switch flow rules
 * [dpdk.git] / drivers / net / mlx5 / mlx5_nl_flow.c
 */
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5
6 #include <assert.h>
7 #include <errno.h>
8 #include <libmnl/libmnl.h>
9 #include <linux/if_ether.h>
10 #include <linux/netlink.h>
11 #include <linux/pkt_cls.h>
12 #include <linux/pkt_sched.h>
13 #include <linux/rtnetlink.h>
14 #include <linux/tc_act/tc_gact.h>
15 #include <linux/tc_act/tc_mirred.h>
16 #include <netinet/in.h>
17 #include <stdalign.h>
18 #include <stdbool.h>
19 #include <stddef.h>
20 #include <stdint.h>
21 #include <stdlib.h>
22 #include <sys/socket.h>
23
24 #include <rte_byteorder.h>
25 #include <rte_errno.h>
26 #include <rte_ether.h>
27 #include <rte_flow.h>
28
29 #include "mlx5.h"
30 #include "mlx5_autoconf.h"
31
#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

/*
 * Compatibility fallbacks when the host kernel headers lack the TC VLAN
 * action (act_vlan) UAPI; normally found in linux/tc_act/tc_vlan.h.
 */
#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

/* Minimal stand-in for struct tc_vlan from linux/tc_act/tc_vlan.h. */
struct tc_vlan {
	tc_gen;
	int v_action; /* One of TCA_VLAN_ACT_*. */
};

#endif /* HAVE_TC_ACT_VLAN */
53
/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/*
 * Normally found in linux/pkt_cls.h. Each definition below is only a
 * fallback used when the host kernel headers are too old to provide it.
 */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
/** Parser state definitions for mlx5_nl_flow_trans[]. */
enum mlx5_nl_flow_trans {
	INVALID, /* No valid transition found; reports an error. */
	BACK, /* Re-walk the allowed transitions saved in "back". */
	ATTR, /* Validate flow attributes and emit the tcmsg header. */
	PATTERN, /* Open the flower (TCA_OPTIONS) attribute nest. */
	ITEM_VOID, /* No-op pattern item, skipped. */
	ITEM_ETH,
	ITEM_VLAN,
	ITEM_IPV4,
	ITEM_IPV6,
	ITEM_TCP,
	ITEM_UDP,
	ACTIONS, /* Switch from pattern items to action processing. */
	ACTION_VOID, /* No-op action, skipped. */
	ACTION_PORT_ID, /* Fate action. */
	ACTION_DROP, /* Fate action. */
	ACTION_OF_POP_VLAN,
	ACTION_OF_PUSH_VLAN,
	ACTION_OF_SET_VLAN_VID,
	ACTION_OF_SET_VLAN_PCP,
	END, /* Terminal state. */
};
173
/*
 * Expand to an unnamed array of parser states terminated by INVALID,
 * used as a transition list in mlx5_nl_flow_trans[].
 */
#define TRANS(...) (const enum mlx5_nl_flow_trans []){ __VA_ARGS__, INVALID, }

/* Transitions allowed after any pattern item. */
#define PATTERN_COMMON \
	ITEM_VOID, ACTIONS
/* Non-fate actions allowed anywhere in an action list. */
#define ACTIONS_COMMON \
	ACTION_VOID, ACTION_OF_POP_VLAN, ACTION_OF_PUSH_VLAN, \
	ACTION_OF_SET_VLAN_VID, ACTION_OF_SET_VLAN_PCP
/* Fate actions; a flow rule must contain at least one of these. */
#define ACTIONS_FATE \
	ACTION_PORT_ID, ACTION_DROP
183
/** Parser state transitions used by mlx5_nl_flow_transpose(). */
static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
	[INVALID] = NULL,
	[BACK] = NULL,
	[ATTR] = TRANS(PATTERN),
	[PATTERN] = TRANS(ITEM_ETH, PATTERN_COMMON),
	[ITEM_VOID] = TRANS(BACK),
	/* VLAN is only reachable directly after Ethernet. */
	[ITEM_ETH] = TRANS(ITEM_IPV4, ITEM_IPV6, ITEM_VLAN, PATTERN_COMMON),
	[ITEM_VLAN] = TRANS(ITEM_IPV4, ITEM_IPV6, PATTERN_COMMON),
	[ITEM_IPV4] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON),
	[ITEM_IPV6] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON),
	[ITEM_TCP] = TRANS(PATTERN_COMMON),
	[ITEM_UDP] = TRANS(PATTERN_COMMON),
	[ACTIONS] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
	[ACTION_VOID] = TRANS(BACK),
	/* Fate actions may only be followed by voids, then END. */
	[ACTION_PORT_ID] = TRANS(ACTION_VOID, END),
	[ACTION_DROP] = TRANS(ACTION_VOID, END),
	/* VLAN modifications do not settle the rule's fate by themselves. */
	[ACTION_OF_POP_VLAN] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
	[ACTION_OF_PUSH_VLAN] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
	[ACTION_OF_SET_VLAN_VID] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
	[ACTION_OF_SET_VLAN_PCP] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
	[END] = NULL,
};
207
/**
 * Empty masks for known item types, all zero-initialized. Returned by
 * mlx5_nl_flow_item_mask() when an item carries no specification.
 */
static const union {
	struct rte_flow_item_eth eth;
	struct rte_flow_item_vlan vlan;
	struct rte_flow_item_ipv4 ipv4;
	struct rte_flow_item_ipv6 ipv6;
	struct rte_flow_item_tcp tcp;
	struct rte_flow_item_udp udp;
} mlx5_nl_flow_mask_empty;
217
/**
 * Supported masks for known item types. Any mask bit requested outside
 * the fields below is rejected by mlx5_nl_flow_item_mask().
 */
static const struct {
	struct rte_flow_item_eth eth;
	struct rte_flow_item_vlan vlan;
	struct rte_flow_item_ipv4 ipv4;
	struct rte_flow_item_ipv6 ipv6;
	struct rte_flow_item_tcp tcp;
	struct rte_flow_item_udp udp;
} mlx5_nl_flow_mask_supported = {
	.eth = {
		.type = RTE_BE16(0xffff),
		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
		.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
	},
	.vlan = {
		/* PCP and VID only, no DEI. */
		.tci = RTE_BE16(0xefff),
		.inner_type = RTE_BE16(0xffff),
	},
	.ipv4.hdr = {
		.next_proto_id = 0xff,
		.src_addr = RTE_BE32(0xffffffff),
		.dst_addr = RTE_BE32(0xffffffff),
	},
	.ipv6.hdr = {
		.proto = 0xff,
		.src_addr =
			"\xff\xff\xff\xff\xff\xff\xff\xff"
			"\xff\xff\xff\xff\xff\xff\xff\xff",
		.dst_addr =
			"\xff\xff\xff\xff\xff\xff\xff\xff"
			"\xff\xff\xff\xff\xff\xff\xff\xff",
	},
	.tcp.hdr = {
		.src_port = RTE_BE16(0xffff),
		.dst_port = RTE_BE16(0xffff),
	},
	.udp.hdr = {
		.src_port = RTE_BE16(0xffff),
		.dst_port = RTE_BE16(0xffff),
	},
};
260
261 /**
262  * Retrieve mask for pattern item.
263  *
264  * This function does basic sanity checks on a pattern item in order to
265  * return the most appropriate mask for it.
266  *
267  * @param[in] item
268  *   Item specification.
269  * @param[in] mask_default
270  *   Default mask for pattern item as specified by the flow API.
271  * @param[in] mask_supported
272  *   Mask fields supported by the implementation.
273  * @param[in] mask_empty
274  *   Empty mask to return when there is no specification.
275  * @param[out] error
276  *   Perform verbose error reporting if not NULL.
277  *
278  * @return
279  *   Either @p item->mask or one of the mask parameters on success, NULL
280  *   otherwise and rte_errno is set.
281  */
282 static const void *
283 mlx5_nl_flow_item_mask(const struct rte_flow_item *item,
284                        const void *mask_default,
285                        const void *mask_supported,
286                        const void *mask_empty,
287                        size_t mask_size,
288                        struct rte_flow_error *error)
289 {
290         const uint8_t *mask;
291         size_t i;
292
293         /* item->last and item->mask cannot exist without item->spec. */
294         if (!item->spec && (item->mask || item->last)) {
295                 rte_flow_error_set
296                         (error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item,
297                          "\"mask\" or \"last\" field provided without a"
298                          " corresponding \"spec\"");
299                 return NULL;
300         }
301         /* No spec, no mask, no problem. */
302         if (!item->spec)
303                 return mask_empty;
304         mask = item->mask ? item->mask : mask_default;
305         assert(mask);
306         /*
307          * Single-pass check to make sure that:
308          * - Mask is supported, no bits are set outside mask_supported.
309          * - Both item->spec and item->last are included in mask.
310          */
311         for (i = 0; i != mask_size; ++i) {
312                 if (!mask[i])
313                         continue;
314                 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
315                     ((const uint8_t *)mask_supported)[i]) {
316                         rte_flow_error_set
317                                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
318                                  mask, "unsupported field found in \"mask\"");
319                         return NULL;
320                 }
321                 if (item->last &&
322                     (((const uint8_t *)item->spec)[i] & mask[i]) !=
323                     (((const uint8_t *)item->last)[i] & mask[i])) {
324                         rte_flow_error_set
325                                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_LAST,
326                                  item->last,
327                                  "range between \"spec\" and \"last\" not"
328                                  " comprised in \"mask\"");
329                         return NULL;
330                 }
331         }
332         return mask;
333 }
334
335 /**
336  * Transpose flow rule description to rtnetlink message.
337  *
338  * This function transposes a flow rule description to a traffic control
339  * (TC) filter creation message ready to be sent over Netlink.
340  *
341  * Target interface is specified as the first entry of the @p ptoi table.
342  * Subsequent entries enable this function to resolve other DPDK port IDs
343  * found in the flow rule.
344  *
345  * @param[out] buf
346  *   Output message buffer. May be NULL when @p size is 0.
347  * @param size
348  *   Size of @p buf. Message may be truncated if not large enough.
349  * @param[in] ptoi
350  *   DPDK port ID to network interface index translation table. This table
351  *   is terminated by an entry with a zero ifindex value.
352  * @param[in] attr
353  *   Flow rule attributes.
354  * @param[in] pattern
355  *   Pattern specification.
356  * @param[in] actions
357  *   Associated actions.
358  * @param[out] error
359  *   Perform verbose error reporting if not NULL.
360  *
361  * @return
362  *   A positive value representing the exact size of the message in bytes
363  *   regardless of the @p size parameter on success, a negative errno value
364  *   otherwise and rte_errno is set.
365  */
366 int
367 mlx5_nl_flow_transpose(void *buf,
368                        size_t size,
369                        const struct mlx5_nl_flow_ptoi *ptoi,
370                        const struct rte_flow_attr *attr,
371                        const struct rte_flow_item *pattern,
372                        const struct rte_flow_action *actions,
373                        struct rte_flow_error *error)
374 {
375         alignas(struct nlmsghdr)
376         uint8_t buf_tmp[mnl_nlmsg_size(sizeof(struct tcmsg) + 1024)];
377         const struct rte_flow_item *item;
378         const struct rte_flow_action *action;
379         unsigned int n;
380         uint32_t act_index_cur;
381         bool eth_type_set;
382         bool vlan_present;
383         bool vlan_eth_type_set;
384         bool ip_proto_set;
385         struct nlattr *na_flower;
386         struct nlattr *na_flower_act;
387         struct nlattr *na_vlan_id;
388         struct nlattr *na_vlan_priority;
389         const enum mlx5_nl_flow_trans *trans;
390         const enum mlx5_nl_flow_trans *back;
391
392         if (!size)
393                 goto error_nobufs;
394 init:
395         item = pattern;
396         action = actions;
397         n = 0;
398         act_index_cur = 0;
399         eth_type_set = false;
400         vlan_present = false;
401         vlan_eth_type_set = false;
402         ip_proto_set = false;
403         na_flower = NULL;
404         na_flower_act = NULL;
405         na_vlan_id = NULL;
406         na_vlan_priority = NULL;
407         trans = TRANS(ATTR);
408         back = trans;
409 trans:
410         switch (trans[n++]) {
411                 union {
412                         const struct rte_flow_item_eth *eth;
413                         const struct rte_flow_item_vlan *vlan;
414                         const struct rte_flow_item_ipv4 *ipv4;
415                         const struct rte_flow_item_ipv6 *ipv6;
416                         const struct rte_flow_item_tcp *tcp;
417                         const struct rte_flow_item_udp *udp;
418                 } spec, mask;
419                 union {
420                         const struct rte_flow_action_port_id *port_id;
421                         const struct rte_flow_action_of_push_vlan *of_push_vlan;
422                         const struct rte_flow_action_of_set_vlan_vid *
423                                 of_set_vlan_vid;
424                         const struct rte_flow_action_of_set_vlan_pcp *
425                                 of_set_vlan_pcp;
426                 } conf;
427                 struct nlmsghdr *nlh;
428                 struct tcmsg *tcm;
429                 struct nlattr *act_index;
430                 struct nlattr *act;
431                 unsigned int i;
432
433         case INVALID:
434                 if (item->type)
435                         return rte_flow_error_set
436                                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
437                                  item, "unsupported pattern item combination");
438                 else if (action->type)
439                         return rte_flow_error_set
440                                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
441                                  action, "unsupported action combination");
442                 return rte_flow_error_set
443                         (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
444                          "flow rule lacks some kind of fate action");
445         case BACK:
446                 trans = back;
447                 n = 0;
448                 goto trans;
449         case ATTR:
450                 /*
451                  * Supported attributes: no groups, some priorities and
452                  * ingress only. Don't care about transfer as it is the
453                  * caller's problem.
454                  */
455                 if (attr->group)
456                         return rte_flow_error_set
457                                 (error, ENOTSUP,
458                                  RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
459                                  attr, "groups are not supported");
460                 if (attr->priority > 0xfffe)
461                         return rte_flow_error_set
462                                 (error, ENOTSUP,
463                                  RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
464                                  attr, "lowest priority level is 0xfffe");
465                 if (!attr->ingress)
466                         return rte_flow_error_set
467                                 (error, ENOTSUP,
468                                  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
469                                  attr, "only ingress is supported");
470                 if (attr->egress)
471                         return rte_flow_error_set
472                                 (error, ENOTSUP,
473                                  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
474                                  attr, "egress is not supported");
475                 if (size < mnl_nlmsg_size(sizeof(*tcm)))
476                         goto error_nobufs;
477                 nlh = mnl_nlmsg_put_header(buf);
478                 nlh->nlmsg_type = 0;
479                 nlh->nlmsg_flags = 0;
480                 nlh->nlmsg_seq = 0;
481                 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
482                 tcm->tcm_family = AF_UNSPEC;
483                 tcm->tcm_ifindex = ptoi[0].ifindex;
484                 /*
485                  * Let kernel pick a handle by default. A predictable handle
486                  * can be set by the caller on the resulting buffer through
487                  * mlx5_nl_flow_brand().
488                  */
489                 tcm->tcm_handle = 0;
490                 tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
491                 /*
492                  * Priority cannot be zero to prevent the kernel from
493                  * picking one automatically.
494                  */
495                 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
496                                           RTE_BE16(ETH_P_ALL));
497                 break;
498         case PATTERN:
499                 if (!mnl_attr_put_strz_check(buf, size, TCA_KIND, "flower"))
500                         goto error_nobufs;
501                 na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS);
502                 if (!na_flower)
503                         goto error_nobufs;
504                 if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS,
505                                             TCA_CLS_FLAGS_SKIP_SW))
506                         goto error_nobufs;
507                 break;
508         case ITEM_VOID:
509                 if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
510                         goto trans;
511                 ++item;
512                 break;
513         case ITEM_ETH:
514                 if (item->type != RTE_FLOW_ITEM_TYPE_ETH)
515                         goto trans;
516                 mask.eth = mlx5_nl_flow_item_mask
517                         (item, &rte_flow_item_eth_mask,
518                          &mlx5_nl_flow_mask_supported.eth,
519                          &mlx5_nl_flow_mask_empty.eth,
520                          sizeof(mlx5_nl_flow_mask_supported.eth), error);
521                 if (!mask.eth)
522                         return -rte_errno;
523                 if (mask.eth == &mlx5_nl_flow_mask_empty.eth) {
524                         ++item;
525                         break;
526                 }
527                 spec.eth = item->spec;
528                 if (mask.eth->type && mask.eth->type != RTE_BE16(0xffff))
529                         return rte_flow_error_set
530                                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
531                                  mask.eth,
532                                  "no support for partial mask on"
533                                  " \"type\" field");
534                 if (mask.eth->type) {
535                         if (!mnl_attr_put_u16_check(buf, size,
536                                                     TCA_FLOWER_KEY_ETH_TYPE,
537                                                     spec.eth->type))
538                                 goto error_nobufs;
539                         eth_type_set = 1;
540                 }
541                 if ((!is_zero_ether_addr(&mask.eth->dst) &&
542                      (!mnl_attr_put_check(buf, size,
543                                           TCA_FLOWER_KEY_ETH_DST,
544                                           ETHER_ADDR_LEN,
545                                           spec.eth->dst.addr_bytes) ||
546                       !mnl_attr_put_check(buf, size,
547                                           TCA_FLOWER_KEY_ETH_DST_MASK,
548                                           ETHER_ADDR_LEN,
549                                           mask.eth->dst.addr_bytes))) ||
550                     (!is_zero_ether_addr(&mask.eth->src) &&
551                      (!mnl_attr_put_check(buf, size,
552                                           TCA_FLOWER_KEY_ETH_SRC,
553                                           ETHER_ADDR_LEN,
554                                           spec.eth->src.addr_bytes) ||
555                       !mnl_attr_put_check(buf, size,
556                                           TCA_FLOWER_KEY_ETH_SRC_MASK,
557                                           ETHER_ADDR_LEN,
558                                           mask.eth->src.addr_bytes))))
559                         goto error_nobufs;
560                 ++item;
561                 break;
562         case ITEM_VLAN:
563                 if (item->type != RTE_FLOW_ITEM_TYPE_VLAN)
564                         goto trans;
565                 mask.vlan = mlx5_nl_flow_item_mask
566                         (item, &rte_flow_item_vlan_mask,
567                          &mlx5_nl_flow_mask_supported.vlan,
568                          &mlx5_nl_flow_mask_empty.vlan,
569                          sizeof(mlx5_nl_flow_mask_supported.vlan), error);
570                 if (!mask.vlan)
571                         return -rte_errno;
572                 if (!eth_type_set &&
573                     !mnl_attr_put_u16_check(buf, size,
574                                             TCA_FLOWER_KEY_ETH_TYPE,
575                                             RTE_BE16(ETH_P_8021Q)))
576                         goto error_nobufs;
577                 eth_type_set = 1;
578                 vlan_present = 1;
579                 if (mask.vlan == &mlx5_nl_flow_mask_empty.vlan) {
580                         ++item;
581                         break;
582                 }
583                 spec.vlan = item->spec;
584                 if ((mask.vlan->tci & RTE_BE16(0xe000) &&
585                      (mask.vlan->tci & RTE_BE16(0xe000)) != RTE_BE16(0xe000)) ||
586                     (mask.vlan->tci & RTE_BE16(0x0fff) &&
587                      (mask.vlan->tci & RTE_BE16(0x0fff)) != RTE_BE16(0x0fff)) ||
588                     (mask.vlan->inner_type &&
589                      mask.vlan->inner_type != RTE_BE16(0xffff)))
590                         return rte_flow_error_set
591                                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
592                                  mask.vlan,
593                                  "no support for partial masks on"
594                                  " \"tci\" (PCP and VID parts) and"
595                                  " \"inner_type\" fields");
596                 if (mask.vlan->inner_type) {
597                         if (!mnl_attr_put_u16_check
598                             (buf, size, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
599                              spec.vlan->inner_type))
600                                 goto error_nobufs;
601                         vlan_eth_type_set = 1;
602                 }
603                 if ((mask.vlan->tci & RTE_BE16(0xe000) &&
604                      !mnl_attr_put_u8_check
605                      (buf, size, TCA_FLOWER_KEY_VLAN_PRIO,
606                       (rte_be_to_cpu_16(spec.vlan->tci) >> 13) & 0x7)) ||
607                     (mask.vlan->tci & RTE_BE16(0x0fff) &&
608                      !mnl_attr_put_u16_check
609                      (buf, size, TCA_FLOWER_KEY_VLAN_ID,
610                       spec.vlan->tci & RTE_BE16(0x0fff))))
611                         goto error_nobufs;
612                 ++item;
613                 break;
614         case ITEM_IPV4:
615                 if (item->type != RTE_FLOW_ITEM_TYPE_IPV4)
616                         goto trans;
617                 mask.ipv4 = mlx5_nl_flow_item_mask
618                         (item, &rte_flow_item_ipv4_mask,
619                          &mlx5_nl_flow_mask_supported.ipv4,
620                          &mlx5_nl_flow_mask_empty.ipv4,
621                          sizeof(mlx5_nl_flow_mask_supported.ipv4), error);
622                 if (!mask.ipv4)
623                         return -rte_errno;
624                 if ((!eth_type_set || !vlan_eth_type_set) &&
625                     !mnl_attr_put_u16_check(buf, size,
626                                             vlan_present ?
627                                             TCA_FLOWER_KEY_VLAN_ETH_TYPE :
628                                             TCA_FLOWER_KEY_ETH_TYPE,
629                                             RTE_BE16(ETH_P_IP)))
630                         goto error_nobufs;
631                 eth_type_set = 1;
632                 vlan_eth_type_set = 1;
633                 if (mask.ipv4 == &mlx5_nl_flow_mask_empty.ipv4) {
634                         ++item;
635                         break;
636                 }
637                 spec.ipv4 = item->spec;
638                 if (mask.ipv4->hdr.next_proto_id &&
639                     mask.ipv4->hdr.next_proto_id != 0xff)
640                         return rte_flow_error_set
641                                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
642                                  mask.ipv4,
643                                  "no support for partial mask on"
644                                  " \"hdr.next_proto_id\" field");
645                 if (mask.ipv4->hdr.next_proto_id) {
646                         if (!mnl_attr_put_u8_check
647                             (buf, size, TCA_FLOWER_KEY_IP_PROTO,
648                              spec.ipv4->hdr.next_proto_id))
649                                 goto error_nobufs;
650                         ip_proto_set = 1;
651                 }
652                 if ((mask.ipv4->hdr.src_addr &&
653                      (!mnl_attr_put_u32_check(buf, size,
654                                               TCA_FLOWER_KEY_IPV4_SRC,
655                                               spec.ipv4->hdr.src_addr) ||
656                       !mnl_attr_put_u32_check(buf, size,
657                                               TCA_FLOWER_KEY_IPV4_SRC_MASK,
658                                               mask.ipv4->hdr.src_addr))) ||
659                     (mask.ipv4->hdr.dst_addr &&
660                      (!mnl_attr_put_u32_check(buf, size,
661                                               TCA_FLOWER_KEY_IPV4_DST,
662                                               spec.ipv4->hdr.dst_addr) ||
663                       !mnl_attr_put_u32_check(buf, size,
664                                               TCA_FLOWER_KEY_IPV4_DST_MASK,
665                                               mask.ipv4->hdr.dst_addr))))
666                         goto error_nobufs;
667                 ++item;
668                 break;
669         case ITEM_IPV6:
670                 if (item->type != RTE_FLOW_ITEM_TYPE_IPV6)
671                         goto trans;
672                 mask.ipv6 = mlx5_nl_flow_item_mask
673                         (item, &rte_flow_item_ipv6_mask,
674                          &mlx5_nl_flow_mask_supported.ipv6,
675                          &mlx5_nl_flow_mask_empty.ipv6,
676                          sizeof(mlx5_nl_flow_mask_supported.ipv6), error);
677                 if (!mask.ipv6)
678                         return -rte_errno;
679                 if ((!eth_type_set || !vlan_eth_type_set) &&
680                     !mnl_attr_put_u16_check(buf, size,
681                                             vlan_present ?
682                                             TCA_FLOWER_KEY_VLAN_ETH_TYPE :
683                                             TCA_FLOWER_KEY_ETH_TYPE,
684                                             RTE_BE16(ETH_P_IPV6)))
685                         goto error_nobufs;
686                 eth_type_set = 1;
687                 vlan_eth_type_set = 1;
688                 if (mask.ipv6 == &mlx5_nl_flow_mask_empty.ipv6) {
689                         ++item;
690                         break;
691                 }
692                 spec.ipv6 = item->spec;
693                 if (mask.ipv6->hdr.proto && mask.ipv6->hdr.proto != 0xff)
694                         return rte_flow_error_set
695                                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
696                                  mask.ipv6,
697                                  "no support for partial mask on"
698                                  " \"hdr.proto\" field");
699                 if (mask.ipv6->hdr.proto) {
700                         if (!mnl_attr_put_u8_check
701                             (buf, size, TCA_FLOWER_KEY_IP_PROTO,
702                              spec.ipv6->hdr.proto))
703                                 goto error_nobufs;
704                         ip_proto_set = 1;
705                 }
706                 if ((!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr) &&
707                      (!mnl_attr_put_check(buf, size,
708                                           TCA_FLOWER_KEY_IPV6_SRC,
709                                           sizeof(spec.ipv6->hdr.src_addr),
710                                           spec.ipv6->hdr.src_addr) ||
711                       !mnl_attr_put_check(buf, size,
712                                           TCA_FLOWER_KEY_IPV6_SRC_MASK,
713                                           sizeof(mask.ipv6->hdr.src_addr),
714                                           mask.ipv6->hdr.src_addr))) ||
715                     (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr) &&
716                      (!mnl_attr_put_check(buf, size,
717                                           TCA_FLOWER_KEY_IPV6_DST,
718                                           sizeof(spec.ipv6->hdr.dst_addr),
719                                           spec.ipv6->hdr.dst_addr) ||
720                       !mnl_attr_put_check(buf, size,
721                                           TCA_FLOWER_KEY_IPV6_DST_MASK,
722                                           sizeof(mask.ipv6->hdr.dst_addr),
723                                           mask.ipv6->hdr.dst_addr))))
724                         goto error_nobufs;
725                 ++item;
726                 break;
727         case ITEM_TCP:
728                 if (item->type != RTE_FLOW_ITEM_TYPE_TCP)
729                         goto trans;
730                 mask.tcp = mlx5_nl_flow_item_mask
731                         (item, &rte_flow_item_tcp_mask,
732                          &mlx5_nl_flow_mask_supported.tcp,
733                          &mlx5_nl_flow_mask_empty.tcp,
734                          sizeof(mlx5_nl_flow_mask_supported.tcp), error);
735                 if (!mask.tcp)
736                         return -rte_errno;
737                 if (!ip_proto_set &&
738                     !mnl_attr_put_u8_check(buf, size,
739                                            TCA_FLOWER_KEY_IP_PROTO,
740                                            IPPROTO_TCP))
741                         goto error_nobufs;
742                 if (mask.tcp == &mlx5_nl_flow_mask_empty.tcp) {
743                         ++item;
744                         break;
745                 }
746                 spec.tcp = item->spec;
747                 if ((mask.tcp->hdr.src_port &&
748                      mask.tcp->hdr.src_port != RTE_BE16(0xffff)) ||
749                     (mask.tcp->hdr.dst_port &&
750                      mask.tcp->hdr.dst_port != RTE_BE16(0xffff)))
751                         return rte_flow_error_set
752                                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
753                                  mask.tcp,
754                                  "no support for partial masks on"
755                                  " \"hdr.src_port\" and \"hdr.dst_port\""
756                                  " fields");
757                 if ((mask.tcp->hdr.src_port &&
758                      (!mnl_attr_put_u16_check(buf, size,
759                                               TCA_FLOWER_KEY_TCP_SRC,
760                                               spec.tcp->hdr.src_port) ||
761                       !mnl_attr_put_u16_check(buf, size,
762                                               TCA_FLOWER_KEY_TCP_SRC_MASK,
763                                               mask.tcp->hdr.src_port))) ||
764                     (mask.tcp->hdr.dst_port &&
765                      (!mnl_attr_put_u16_check(buf, size,
766                                               TCA_FLOWER_KEY_TCP_DST,
767                                               spec.tcp->hdr.dst_port) ||
768                       !mnl_attr_put_u16_check(buf, size,
769                                               TCA_FLOWER_KEY_TCP_DST_MASK,
770                                               mask.tcp->hdr.dst_port))))
771                         goto error_nobufs;
772                 ++item;
773                 break;
774         case ITEM_UDP:
775                 if (item->type != RTE_FLOW_ITEM_TYPE_UDP)
776                         goto trans;
777                 mask.udp = mlx5_nl_flow_item_mask
778                         (item, &rte_flow_item_udp_mask,
779                          &mlx5_nl_flow_mask_supported.udp,
780                          &mlx5_nl_flow_mask_empty.udp,
781                          sizeof(mlx5_nl_flow_mask_supported.udp), error);
782                 if (!mask.udp)
783                         return -rte_errno;
784                 if (!ip_proto_set &&
785                     !mnl_attr_put_u8_check(buf, size,
786                                            TCA_FLOWER_KEY_IP_PROTO,
787                                            IPPROTO_UDP))
788                         goto error_nobufs;
789                 if (mask.udp == &mlx5_nl_flow_mask_empty.udp) {
790                         ++item;
791                         break;
792                 }
793                 spec.udp = item->spec;
794                 if ((mask.udp->hdr.src_port &&
795                      mask.udp->hdr.src_port != RTE_BE16(0xffff)) ||
796                     (mask.udp->hdr.dst_port &&
797                      mask.udp->hdr.dst_port != RTE_BE16(0xffff)))
798                         return rte_flow_error_set
799                                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
800                                  mask.udp,
801                                  "no support for partial masks on"
802                                  " \"hdr.src_port\" and \"hdr.dst_port\""
803                                  " fields");
804                 if ((mask.udp->hdr.src_port &&
805                      (!mnl_attr_put_u16_check(buf, size,
806                                               TCA_FLOWER_KEY_UDP_SRC,
807                                               spec.udp->hdr.src_port) ||
808                       !mnl_attr_put_u16_check(buf, size,
809                                               TCA_FLOWER_KEY_UDP_SRC_MASK,
810                                               mask.udp->hdr.src_port))) ||
811                     (mask.udp->hdr.dst_port &&
812                      (!mnl_attr_put_u16_check(buf, size,
813                                               TCA_FLOWER_KEY_UDP_DST,
814                                               spec.udp->hdr.dst_port) ||
815                       !mnl_attr_put_u16_check(buf, size,
816                                               TCA_FLOWER_KEY_UDP_DST_MASK,
817                                               mask.udp->hdr.dst_port))))
818                         goto error_nobufs;
819                 ++item;
820                 break;
821         case ACTIONS:
822                 if (item->type != RTE_FLOW_ITEM_TYPE_END)
823                         goto trans;
824                 assert(na_flower);
825                 assert(!na_flower_act);
826                 na_flower_act =
827                         mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT);
828                 if (!na_flower_act)
829                         goto error_nobufs;
830                 act_index_cur = 1;
831                 break;
832         case ACTION_VOID:
833                 if (action->type != RTE_FLOW_ACTION_TYPE_VOID)
834                         goto trans;
835                 ++action;
836                 break;
837         case ACTION_PORT_ID:
838                 if (action->type != RTE_FLOW_ACTION_TYPE_PORT_ID)
839                         goto trans;
840                 conf.port_id = action->conf;
841                 if (conf.port_id->original)
842                         i = 0;
843                 else
844                         for (i = 0; ptoi[i].ifindex; ++i)
845                                 if (ptoi[i].port_id == conf.port_id->id)
846                                         break;
847                 if (!ptoi[i].ifindex)
848                         return rte_flow_error_set
849                                 (error, ENODEV, RTE_FLOW_ERROR_TYPE_ACTION_CONF,
850                                  conf.port_id,
851                                  "missing data to convert port ID to ifindex");
852                 act_index =
853                         mnl_attr_nest_start_check(buf, size, act_index_cur++);
854                 if (!act_index ||
855                     !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "mirred"))
856                         goto error_nobufs;
857                 act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS);
858                 if (!act)
859                         goto error_nobufs;
860                 if (!mnl_attr_put_check(buf, size, TCA_MIRRED_PARMS,
861                                         sizeof(struct tc_mirred),
862                                         &(struct tc_mirred){
863                                                 .action = TC_ACT_STOLEN,
864                                                 .eaction = TCA_EGRESS_REDIR,
865                                                 .ifindex = ptoi[i].ifindex,
866                                         }))
867                         goto error_nobufs;
868                 mnl_attr_nest_end(buf, act);
869                 mnl_attr_nest_end(buf, act_index);
870                 ++action;
871                 break;
872         case ACTION_DROP:
873                 if (action->type != RTE_FLOW_ACTION_TYPE_DROP)
874                         goto trans;
875                 act_index =
876                         mnl_attr_nest_start_check(buf, size, act_index_cur++);
877                 if (!act_index ||
878                     !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "gact"))
879                         goto error_nobufs;
880                 act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS);
881                 if (!act)
882                         goto error_nobufs;
883                 if (!mnl_attr_put_check(buf, size, TCA_GACT_PARMS,
884                                         sizeof(struct tc_gact),
885                                         &(struct tc_gact){
886                                                 .action = TC_ACT_SHOT,
887                                         }))
888                         goto error_nobufs;
889                 mnl_attr_nest_end(buf, act);
890                 mnl_attr_nest_end(buf, act_index);
891                 ++action;
892                 break;
893         case ACTION_OF_POP_VLAN:
894                 if (action->type != RTE_FLOW_ACTION_TYPE_OF_POP_VLAN)
895                         goto trans;
896                 conf.of_push_vlan = NULL;
897                 i = TCA_VLAN_ACT_POP;
898                 goto action_of_vlan;
899         case ACTION_OF_PUSH_VLAN:
900                 if (action->type != RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN)
901                         goto trans;
902                 conf.of_push_vlan = action->conf;
903                 i = TCA_VLAN_ACT_PUSH;
904                 goto action_of_vlan;
905         case ACTION_OF_SET_VLAN_VID:
906                 if (action->type != RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID)
907                         goto trans;
908                 conf.of_set_vlan_vid = action->conf;
909                 if (na_vlan_id)
910                         goto override_na_vlan_id;
911                 i = TCA_VLAN_ACT_MODIFY;
912                 goto action_of_vlan;
913         case ACTION_OF_SET_VLAN_PCP:
914                 if (action->type != RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP)
915                         goto trans;
916                 conf.of_set_vlan_pcp = action->conf;
917                 if (na_vlan_priority)
918                         goto override_na_vlan_priority;
919                 i = TCA_VLAN_ACT_MODIFY;
920                 goto action_of_vlan;
921 action_of_vlan:
922                 act_index =
923                         mnl_attr_nest_start_check(buf, size, act_index_cur++);
924                 if (!act_index ||
925                     !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "vlan"))
926                         goto error_nobufs;
927                 act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS);
928                 if (!act)
929                         goto error_nobufs;
930                 if (!mnl_attr_put_check(buf, size, TCA_VLAN_PARMS,
931                                         sizeof(struct tc_vlan),
932                                         &(struct tc_vlan){
933                                                 .action = TC_ACT_PIPE,
934                                                 .v_action = i,
935                                         }))
936                         goto error_nobufs;
937                 if (i == TCA_VLAN_ACT_POP) {
938                         mnl_attr_nest_end(buf, act);
939                         ++action;
940                         break;
941                 }
942                 if (i == TCA_VLAN_ACT_PUSH &&
943                     !mnl_attr_put_u16_check(buf, size,
944                                             TCA_VLAN_PUSH_VLAN_PROTOCOL,
945                                             conf.of_push_vlan->ethertype))
946                         goto error_nobufs;
947                 na_vlan_id = mnl_nlmsg_get_payload_tail(buf);
948                 if (!mnl_attr_put_u16_check(buf, size, TCA_VLAN_PAD, 0))
949                         goto error_nobufs;
950                 na_vlan_priority = mnl_nlmsg_get_payload_tail(buf);
951                 if (!mnl_attr_put_u8_check(buf, size, TCA_VLAN_PAD, 0))
952                         goto error_nobufs;
953                 mnl_attr_nest_end(buf, act);
954                 mnl_attr_nest_end(buf, act_index);
955                 if (action->type == RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
956 override_na_vlan_id:
957                         na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
958                         *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
959                                 rte_be_to_cpu_16
960                                 (conf.of_set_vlan_vid->vlan_vid);
961                 } else if (action->type ==
962                            RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
963 override_na_vlan_priority:
964                         na_vlan_priority->nla_type =
965                                 TCA_VLAN_PUSH_VLAN_PRIORITY;
966                         *(uint8_t *)mnl_attr_get_payload(na_vlan_priority) =
967                                 conf.of_set_vlan_pcp->vlan_pcp;
968                 }
969                 ++action;
970                 break;
971         case END:
972                 if (item->type != RTE_FLOW_ITEM_TYPE_END ||
973                     action->type != RTE_FLOW_ACTION_TYPE_END)
974                         goto trans;
975                 if (na_flower_act)
976                         mnl_attr_nest_end(buf, na_flower_act);
977                 if (na_flower)
978                         mnl_attr_nest_end(buf, na_flower);
979                 nlh = buf;
980                 return nlh->nlmsg_len;
981         }
982         back = trans;
983         trans = mlx5_nl_flow_trans[trans[n - 1]];
984         n = 0;
985         goto trans;
986 error_nobufs:
987         if (buf != buf_tmp) {
988                 buf = buf_tmp;
989                 size = sizeof(buf_tmp);
990                 goto init;
991         }
992         return rte_flow_error_set
993                 (error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
994                  "generated TC message is too large");
995 }
996
997 /**
998  * Brand rtnetlink buffer with unique handle.
999  *
1000  * This handle should be unique for a given network interface to avoid
1001  * collisions.
1002  *
1003  * @param buf
1004  *   Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
1005  * @param handle
1006  *   Unique 32-bit handle to use.
1007  */
1008 void
1009 mlx5_nl_flow_brand(void *buf, uint32_t handle)
1010 {
1011         struct tcmsg *tcm = mnl_nlmsg_get_payload(buf);
1012
1013         tcm->tcm_handle = handle;
1014 }
1015
1016 /**
1017  * Send Netlink message with acknowledgment.
1018  *
1019  * @param nl
1020  *   Libmnl socket to use.
1021  * @param nlh
1022  *   Message to send. This function always raises the NLM_F_ACK flag before
1023  *   sending.
1024  *
1025  * @return
1026  *   0 on success, a negative errno value otherwise and rte_errno is set.
1027  */
1028 static int
1029 mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
1030 {
1031         alignas(struct nlmsghdr)
1032         uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
1033                     nlh->nlmsg_len - sizeof(*nlh)];
1034         uint32_t seq = random();
1035         int ret;
1036
1037         nlh->nlmsg_flags |= NLM_F_ACK;
1038         nlh->nlmsg_seq = seq;
1039         ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
1040         if (ret != -1)
1041                 ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
1042         if (ret != -1)
1043                 ret = mnl_cb_run
1044                         (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
1045         if (!ret)
1046                 return 0;
1047         rte_errno = errno;
1048         return -rte_errno;
1049 }
1050
1051 /**
1052  * Create a Netlink flow rule.
1053  *
1054  * @param nl
1055  *   Libmnl socket to use.
1056  * @param buf
1057  *   Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
1058  * @param[out] error
1059  *   Perform verbose error reporting if not NULL.
1060  *
1061  * @return
1062  *   0 on success, a negative errno value otherwise and rte_errno is set.
1063  */
1064 int
1065 mlx5_nl_flow_create(struct mnl_socket *nl, void *buf,
1066                     struct rte_flow_error *error)
1067 {
1068         struct nlmsghdr *nlh = buf;
1069
1070         nlh->nlmsg_type = RTM_NEWTFILTER;
1071         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
1072         if (!mlx5_nl_flow_nl_ack(nl, nlh))
1073                 return 0;
1074         return rte_flow_error_set
1075                 (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1076                  "netlink: failed to create TC flow rule");
1077 }
1078
1079 /**
1080  * Destroy a Netlink flow rule.
1081  *
1082  * @param nl
1083  *   Libmnl socket to use.
1084  * @param buf
1085  *   Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
1086  * @param[out] error
1087  *   Perform verbose error reporting if not NULL.
1088  *
1089  * @return
1090  *   0 on success, a negative errno value otherwise and rte_errno is set.
1091  */
1092 int
1093 mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf,
1094                      struct rte_flow_error *error)
1095 {
1096         struct nlmsghdr *nlh = buf;
1097
1098         nlh->nlmsg_type = RTM_DELTFILTER;
1099         nlh->nlmsg_flags = NLM_F_REQUEST;
1100         if (!mlx5_nl_flow_nl_ack(nl, nlh))
1101                 return 0;
1102         return rte_flow_error_set
1103                 (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1104                  "netlink: failed to destroy TC flow rule");
1105 }
1106
1107 /**
1108  * Initialize ingress qdisc of a given network interface.
1109  *
1110  * @param nl
1111  *   Libmnl socket of the @p NETLINK_ROUTE kind.
1112  * @param ifindex
1113  *   Index of network interface to initialize.
1114  * @param[out] error
1115  *   Perform verbose error reporting if not NULL.
1116  *
1117  * @return
1118  *   0 on success, a negative errno value otherwise and rte_errno is set.
1119  */
1120 int
1121 mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
1122                   struct rte_flow_error *error)
1123 {
1124         struct nlmsghdr *nlh;
1125         struct tcmsg *tcm;
1126         alignas(struct nlmsghdr)
1127         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];
1128
1129         /* Destroy existing ingress qdisc and everything attached to it. */
1130         nlh = mnl_nlmsg_put_header(buf);
1131         nlh->nlmsg_type = RTM_DELQDISC;
1132         nlh->nlmsg_flags = NLM_F_REQUEST;
1133         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
1134         tcm->tcm_family = AF_UNSPEC;
1135         tcm->tcm_ifindex = ifindex;
1136         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
1137         tcm->tcm_parent = TC_H_INGRESS;
1138         /* Ignore errors when qdisc is already absent. */
1139         if (mlx5_nl_flow_nl_ack(nl, nlh) &&
1140             rte_errno != EINVAL && rte_errno != ENOENT)
1141                 return rte_flow_error_set
1142                         (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
1143                          NULL, "netlink: failed to remove ingress qdisc");
1144         /* Create fresh ingress qdisc. */
1145         nlh = mnl_nlmsg_put_header(buf);
1146         nlh->nlmsg_type = RTM_NEWQDISC;
1147         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
1148         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
1149         tcm->tcm_family = AF_UNSPEC;
1150         tcm->tcm_ifindex = ifindex;
1151         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
1152         tcm->tcm_parent = TC_H_INGRESS;
1153         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
1154         if (mlx5_nl_flow_nl_ack(nl, nlh))
1155                 return rte_flow_error_set
1156                         (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
1157                          NULL, "netlink: failed to create ingress qdisc");
1158         return 0;
1159 }
1160
1161 /**
1162  * Create and configure a libmnl socket for Netlink flow rules.
1163  *
1164  * @return
1165  *   A valid libmnl socket object pointer on success, NULL otherwise and
1166  *   rte_errno is set.
1167  */
1168 struct mnl_socket *
1169 mlx5_nl_flow_socket_create(void)
1170 {
1171         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
1172
1173         if (nl) {
1174                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
1175                                       sizeof(int));
1176                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
1177                         return nl;
1178         }
1179         rte_errno = errno;
1180         if (nl)
1181                 mnl_socket_close(nl);
1182         return NULL;
1183 }
1184
/**
 * Destroy a libmnl socket.
 *
 * @param nl
 *   Libmnl socket previously returned by mlx5_nl_flow_socket_create().
 */
void
mlx5_nl_flow_socket_destroy(struct mnl_socket *nl)
{
	mnl_socket_close(nl);
}