net/mlx5: add L2-L4 pattern items to switch flow rules
drivers/net/mlx5/mlx5_nl_flow.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2018 6WIND S.A.
 * Copyright 2018 Mellanox Technologies, Ltd
 */

#include <assert.h>
#include <errno.h>
#include <libmnl/libmnl.h>
#include <linux/if_ether.h>
#include <linux/netlink.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/tc_act/tc_gact.h>
#include <linux/tc_act/tc_mirred.h>
#include <netinet/in.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>

#include <rte_byteorder.h>
#include <rte_errno.h>
#include <rte_ether.h>
#include <rte_flow.h>

#include "mlx5.h"

/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif

/** Parser state definitions for mlx5_nl_flow_trans[]. */
enum mlx5_nl_flow_trans {
        INVALID,
        BACK,
        ATTR,
        PATTERN,
        ITEM_VOID,
        ITEM_ETH,
        ITEM_IPV4,
        ITEM_IPV6,
        ITEM_TCP,
        ITEM_UDP,
        ACTIONS,
        ACTION_VOID,
        ACTION_PORT_ID,
        ACTION_DROP,
        END,
};

#define TRANS(...) (const enum mlx5_nl_flow_trans []){ __VA_ARGS__, INVALID, }

#define PATTERN_COMMON \
        ITEM_VOID, ACTIONS
#define ACTIONS_COMMON \
        ACTION_VOID
#define ACTIONS_FATE \
        ACTION_PORT_ID, ACTION_DROP

/** Parser state transitions used by mlx5_nl_flow_transpose(). */
static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
        [INVALID] = NULL,
        [BACK] = NULL,
        [ATTR] = TRANS(PATTERN),
        [PATTERN] = TRANS(ITEM_ETH, PATTERN_COMMON),
        [ITEM_VOID] = TRANS(BACK),
        [ITEM_ETH] = TRANS(ITEM_IPV4, ITEM_IPV6, PATTERN_COMMON),
        [ITEM_IPV4] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON),
        [ITEM_IPV6] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON),
        [ITEM_TCP] = TRANS(PATTERN_COMMON),
        [ITEM_UDP] = TRANS(PATTERN_COMMON),
        [ACTIONS] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
        [ACTION_VOID] = TRANS(BACK),
        [ACTION_PORT_ID] = TRANS(ACTION_VOID, END),
        [ACTION_DROP] = TRANS(ACTION_VOID, END),
        [END] = NULL,
};

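/*
 * Illustrative walk through the transition table above for a flow rule
 * matching "eth / ipv4 / udp" with a "drop" action: starting from ATTR,
 * the parser moves to PATTERN, then matches ITEM_ETH, ITEM_IPV4 and
 * ITEM_UDP in turn; once the pattern list ends, ACTIONS leads to
 * ACTION_DROP and finally END. ITEM_VOID and ACTION_VOID transition
 * BACK, i.e. they consume the current entry and retry the previous
 * state list from the beginning, which is how VOID entries are skipped
 * at any position in a pattern or action list.
 */
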
/** Empty masks for known item types. */
static const union {
        struct rte_flow_item_eth eth;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
} mlx5_nl_flow_mask_empty;

/** Supported masks for known item types. */
static const struct {
        struct rte_flow_item_eth eth;
        struct rte_flow_item_ipv4 ipv4;
        struct rte_flow_item_ipv6 ipv6;
        struct rte_flow_item_tcp tcp;
        struct rte_flow_item_udp udp;
} mlx5_nl_flow_mask_supported = {
        .eth = {
                .type = RTE_BE16(0xffff),
                .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
                .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
        },
        .ipv4.hdr = {
                .next_proto_id = 0xff,
                .src_addr = RTE_BE32(0xffffffff),
                .dst_addr = RTE_BE32(0xffffffff),
        },
        .ipv6.hdr = {
                .proto = 0xff,
                .src_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
                .dst_addr =
                        "\xff\xff\xff\xff\xff\xff\xff\xff"
                        "\xff\xff\xff\xff\xff\xff\xff\xff",
        },
        .tcp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
        .udp.hdr = {
                .src_port = RTE_BE16(0xffff),
                .dst_port = RTE_BE16(0xffff),
        },
};

/**
 * Retrieve mask for pattern item.
 *
 * This function does basic sanity checks on a pattern item in order to
 * return the most appropriate mask for it.
 *
 * @param[in] item
 *   Item specification.
 * @param[in] mask_default
 *   Default mask for pattern item as specified by the flow API.
 * @param[in] mask_supported
 *   Mask fields supported by the implementation.
 * @param[in] mask_empty
 *   Empty mask to return when there is no specification.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   Either @p item->mask or one of the mask parameters on success, NULL
 *   otherwise and rte_errno is set.
 */
static const void *
mlx5_nl_flow_item_mask(const struct rte_flow_item *item,
                       const void *mask_default,
                       const void *mask_supported,
                       const void *mask_empty,
                       size_t mask_size,
                       struct rte_flow_error *error)
{
        const uint8_t *mask;
        size_t i;

        /* item->last and item->mask cannot exist without item->spec. */
        if (!item->spec && (item->mask || item->last)) {
                rte_flow_error_set
                        (error, EINVAL, RTE_FLOW_ERROR_TYPE_ITEM, item,
                         "\"mask\" or \"last\" field provided without a"
                         " corresponding \"spec\"");
                return NULL;
        }
        /* No spec, no mask, no problem. */
        if (!item->spec)
                return mask_empty;
        mask = item->mask ? item->mask : mask_default;
        assert(mask);
        /*
         * Single-pass check to make sure that:
         * - Mask is supported, no bits are set outside mask_supported.
         * - Both item->spec and item->last are included in mask.
         */
        for (i = 0; i != mask_size; ++i) {
                if (!mask[i])
                        continue;
                if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
                    ((const uint8_t *)mask_supported)[i]) {
                        rte_flow_error_set
                                (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
                                 mask, "unsupported field found in \"mask\"");
                        return NULL;
                }
                if (item->last &&
                    (((const uint8_t *)item->spec)[i] & mask[i]) !=
                    (((const uint8_t *)item->last)[i] & mask[i])) {
                        rte_flow_error_set
                                (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_LAST,
                                 item->last,
                                 "range between \"spec\" and \"last\" not"
                                 " comprised in \"mask\"");
                        return NULL;
                }
        }
        return mask;
}

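/*
 * Illustrative use of the helper above (assumed values, not part of
 * this driver): given an Ethernet item whose mask only partially
 * covers "type",
 *
 *   struct rte_flow_item_eth mask = { .type = RTE_BE16(0x0fff) };
 *   struct rte_flow_item_eth spec = { .type = RTE_BE16(0x0800) };
 *   struct rte_flow_item item = {
 *           .type = RTE_FLOW_ITEM_TYPE_ETH,
 *           .spec = &spec,
 *           .mask = &mask,
 *   };
 *
 *   mlx5_nl_flow_item_mask(&item, &rte_flow_item_eth_mask,
 *                          &mlx5_nl_flow_mask_supported.eth,
 *                          &mlx5_nl_flow_mask_empty.eth,
 *                          sizeof(mlx5_nl_flow_mask_supported.eth),
 *                          &error)
 *
 * returns &mask since 0x0fff is a subset of the supported 0xffff mask;
 * rejecting such partial masks is left to callers (see ITEM_ETH
 * below). By contrast, masking a field absent from
 * mlx5_nl_flow_mask_supported, e.g. IPv4 "hdr.type_of_service", makes
 * this helper fail with ENOTSUP.
 */
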
/**
 * Transpose flow rule description to rtnetlink message.
 *
 * This function transposes a flow rule description to a traffic control
 * (TC) filter creation message ready to be sent over Netlink.
 *
 * Target interface is specified as the first entry of the @p ptoi table.
 * Subsequent entries enable this function to resolve other DPDK port IDs
 * found in the flow rule.
 *
 * @param[out] buf
 *   Output message buffer. May be NULL when @p size is 0.
 * @param size
 *   Size of @p buf. Message may be truncated if not large enough.
 * @param[in] ptoi
 *   DPDK port ID to network interface index translation table. This table
 *   is terminated by an entry with a zero ifindex value.
 * @param[in] attr
 *   Flow rule attributes.
 * @param[in] pattern
 *   Pattern specification.
 * @param[in] actions
 *   Associated actions.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   A positive value representing the exact size of the message in bytes
 *   regardless of the @p size parameter on success, a negative errno value
 *   otherwise and rte_errno is set.
 */
int
mlx5_nl_flow_transpose(void *buf,
                       size_t size,
                       const struct mlx5_nl_flow_ptoi *ptoi,
                       const struct rte_flow_attr *attr,
                       const struct rte_flow_item *pattern,
                       const struct rte_flow_action *actions,
                       struct rte_flow_error *error)
{
        alignas(struct nlmsghdr)
        uint8_t buf_tmp[mnl_nlmsg_size(sizeof(struct tcmsg) + 1024)];
        const struct rte_flow_item *item;
        const struct rte_flow_action *action;
        unsigned int n;
        uint32_t act_index_cur;
        bool eth_type_set;
        bool ip_proto_set;
        struct nlattr *na_flower;
        struct nlattr *na_flower_act;
        const enum mlx5_nl_flow_trans *trans;
        const enum mlx5_nl_flow_trans *back;

        if (!size)
                goto error_nobufs;
init:
        item = pattern;
        action = actions;
        n = 0;
        act_index_cur = 0;
        eth_type_set = false;
        ip_proto_set = false;
        na_flower = NULL;
        na_flower_act = NULL;
        trans = TRANS(ATTR);
        back = trans;
trans:
        switch (trans[n++]) {
                union {
                        const struct rte_flow_item_eth *eth;
                        const struct rte_flow_item_ipv4 *ipv4;
                        const struct rte_flow_item_ipv6 *ipv6;
                        const struct rte_flow_item_tcp *tcp;
                        const struct rte_flow_item_udp *udp;
                } spec, mask;
                union {
                        const struct rte_flow_action_port_id *port_id;
                } conf;
                struct nlmsghdr *nlh;
                struct tcmsg *tcm;
                struct nlattr *act_index;
                struct nlattr *act;
                unsigned int i;

        case INVALID:
                if (item->type)
                        return rte_flow_error_set
                                (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
                                 item, "unsupported pattern item combination");
                else if (action->type)
                        return rte_flow_error_set
                                (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
                                 action, "unsupported action combination");
                return rte_flow_error_set
                        (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
                         "flow rule lacks some kind of fate action");
        case BACK:
                trans = back;
                n = 0;
                goto trans;
        case ATTR:
                /*
                 * Supported attributes: no groups, some priorities and
                 * ingress only. Don't care about transfer as it is the
                 * caller's problem.
                 */
                if (attr->group)
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
                                 attr, "groups are not supported");
                if (attr->priority > 0xfffe)
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
                                 attr, "lowest priority level is 0xfffe");
                if (!attr->ingress)
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
                                 attr, "only ingress is supported");
                if (attr->egress)
                        return rte_flow_error_set
                                (error, ENOTSUP,
                                 RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
                                 attr, "egress is not supported");
                if (size < mnl_nlmsg_size(sizeof(*tcm)))
                        goto error_nobufs;
                nlh = mnl_nlmsg_put_header(buf);
                nlh->nlmsg_type = 0;
                nlh->nlmsg_flags = 0;
                nlh->nlmsg_seq = 0;
                tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
                tcm->tcm_family = AF_UNSPEC;
                tcm->tcm_ifindex = ptoi[0].ifindex;
                /*
                 * Let kernel pick a handle by default. A predictable handle
                 * can be set by the caller on the resulting buffer through
                 * mlx5_nl_flow_brand().
                 */
                tcm->tcm_handle = 0;
                tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
                /*
                 * Priority cannot be zero to prevent the kernel from
                 * picking one automatically.
                 */
                tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
                                          RTE_BE16(ETH_P_ALL));
                break;
        case PATTERN:
                if (!mnl_attr_put_strz_check(buf, size, TCA_KIND, "flower"))
                        goto error_nobufs;
                na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS);
                if (!na_flower)
                        goto error_nobufs;
                if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS,
                                            TCA_CLS_FLAGS_SKIP_SW))
                        goto error_nobufs;
                break;
        case ITEM_VOID:
                if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
                        goto trans;
                ++item;
                break;
        case ITEM_ETH:
                if (item->type != RTE_FLOW_ITEM_TYPE_ETH)
                        goto trans;
                mask.eth = mlx5_nl_flow_item_mask
                        (item, &rte_flow_item_eth_mask,
                         &mlx5_nl_flow_mask_supported.eth,
                         &mlx5_nl_flow_mask_empty.eth,
                         sizeof(mlx5_nl_flow_mask_supported.eth), error);
                if (!mask.eth)
                        return -rte_errno;
                if (mask.eth == &mlx5_nl_flow_mask_empty.eth) {
                        ++item;
                        break;
                }
                spec.eth = item->spec;
                if (mask.eth->type && mask.eth->type != RTE_BE16(0xffff))
                        return rte_flow_error_set
                                (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
                                 mask.eth,
                                 "no support for partial mask on"
                                 " \"type\" field");
                if (mask.eth->type) {
                        if (!mnl_attr_put_u16_check(buf, size,
                                                    TCA_FLOWER_KEY_ETH_TYPE,
                                                    spec.eth->type))
                                goto error_nobufs;
                        eth_type_set = true;
                }
                if ((!is_zero_ether_addr(&mask.eth->dst) &&
                     (!mnl_attr_put_check(buf, size,
                                          TCA_FLOWER_KEY_ETH_DST,
                                          ETHER_ADDR_LEN,
                                          spec.eth->dst.addr_bytes) ||
                      !mnl_attr_put_check(buf, size,
                                          TCA_FLOWER_KEY_ETH_DST_MASK,
                                          ETHER_ADDR_LEN,
                                          mask.eth->dst.addr_bytes))) ||
                    (!is_zero_ether_addr(&mask.eth->src) &&
                     (!mnl_attr_put_check(buf, size,
                                          TCA_FLOWER_KEY_ETH_SRC,
                                          ETHER_ADDR_LEN,
                                          spec.eth->src.addr_bytes) ||
                      !mnl_attr_put_check(buf, size,
                                          TCA_FLOWER_KEY_ETH_SRC_MASK,
                                          ETHER_ADDR_LEN,
                                          mask.eth->src.addr_bytes))))
                        goto error_nobufs;
                ++item;
                break;
        case ITEM_IPV4:
                if (item->type != RTE_FLOW_ITEM_TYPE_IPV4)
                        goto trans;
                mask.ipv4 = mlx5_nl_flow_item_mask
                        (item, &rte_flow_item_ipv4_mask,
                         &mlx5_nl_flow_mask_supported.ipv4,
                         &mlx5_nl_flow_mask_empty.ipv4,
                         sizeof(mlx5_nl_flow_mask_supported.ipv4), error);
                if (!mask.ipv4)
                        return -rte_errno;
                if (!eth_type_set &&
                    !mnl_attr_put_u16_check(buf, size,
                                            TCA_FLOWER_KEY_ETH_TYPE,
                                            RTE_BE16(ETH_P_IP)))
                        goto error_nobufs;
                eth_type_set = true;
                if (mask.ipv4 == &mlx5_nl_flow_mask_empty.ipv4) {
                        ++item;
                        break;
                }
                spec.ipv4 = item->spec;
                if (mask.ipv4->hdr.next_proto_id &&
                    mask.ipv4->hdr.next_proto_id != 0xff)
                        return rte_flow_error_set
                                (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
                                 mask.ipv4,
                                 "no support for partial mask on"
                                 " \"hdr.next_proto_id\" field");
                if (mask.ipv4->hdr.next_proto_id) {
                        if (!mnl_attr_put_u8_check
                            (buf, size, TCA_FLOWER_KEY_IP_PROTO,
                             spec.ipv4->hdr.next_proto_id))
                                goto error_nobufs;
                        ip_proto_set = true;
                }
                if ((mask.ipv4->hdr.src_addr &&
                     (!mnl_attr_put_u32_check(buf, size,
                                              TCA_FLOWER_KEY_IPV4_SRC,
                                              spec.ipv4->hdr.src_addr) ||
                      !mnl_attr_put_u32_check(buf, size,
                                              TCA_FLOWER_KEY_IPV4_SRC_MASK,
                                              mask.ipv4->hdr.src_addr))) ||
                    (mask.ipv4->hdr.dst_addr &&
                     (!mnl_attr_put_u32_check(buf, size,
                                              TCA_FLOWER_KEY_IPV4_DST,
                                              spec.ipv4->hdr.dst_addr) ||
                      !mnl_attr_put_u32_check(buf, size,
                                              TCA_FLOWER_KEY_IPV4_DST_MASK,
                                              mask.ipv4->hdr.dst_addr))))
                        goto error_nobufs;
                ++item;
                break;
        case ITEM_IPV6:
                if (item->type != RTE_FLOW_ITEM_TYPE_IPV6)
                        goto trans;
                mask.ipv6 = mlx5_nl_flow_item_mask
                        (item, &rte_flow_item_ipv6_mask,
                         &mlx5_nl_flow_mask_supported.ipv6,
                         &mlx5_nl_flow_mask_empty.ipv6,
                         sizeof(mlx5_nl_flow_mask_supported.ipv6), error);
                if (!mask.ipv6)
                        return -rte_errno;
                if (!eth_type_set &&
                    !mnl_attr_put_u16_check(buf, size,
                                            TCA_FLOWER_KEY_ETH_TYPE,
                                            RTE_BE16(ETH_P_IPV6)))
                        goto error_nobufs;
                eth_type_set = true;
                if (mask.ipv6 == &mlx5_nl_flow_mask_empty.ipv6) {
                        ++item;
                        break;
                }
                spec.ipv6 = item->spec;
                if (mask.ipv6->hdr.proto && mask.ipv6->hdr.proto != 0xff)
                        return rte_flow_error_set
                                (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
                                 mask.ipv6,
                                 "no support for partial mask on"
                                 " \"hdr.proto\" field");
                if (mask.ipv6->hdr.proto) {
                        if (!mnl_attr_put_u8_check
                            (buf, size, TCA_FLOWER_KEY_IP_PROTO,
                             spec.ipv6->hdr.proto))
                                goto error_nobufs;
                        ip_proto_set = true;
                }
                if ((!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr) &&
                     (!mnl_attr_put_check(buf, size,
                                          TCA_FLOWER_KEY_IPV6_SRC,
                                          sizeof(spec.ipv6->hdr.src_addr),
                                          spec.ipv6->hdr.src_addr) ||
                      !mnl_attr_put_check(buf, size,
                                          TCA_FLOWER_KEY_IPV6_SRC_MASK,
                                          sizeof(mask.ipv6->hdr.src_addr),
                                          mask.ipv6->hdr.src_addr))) ||
                    (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr) &&
                     (!mnl_attr_put_check(buf, size,
                                          TCA_FLOWER_KEY_IPV6_DST,
                                          sizeof(spec.ipv6->hdr.dst_addr),
                                          spec.ipv6->hdr.dst_addr) ||
                      !mnl_attr_put_check(buf, size,
                                          TCA_FLOWER_KEY_IPV6_DST_MASK,
                                          sizeof(mask.ipv6->hdr.dst_addr),
                                          mask.ipv6->hdr.dst_addr))))
                        goto error_nobufs;
                ++item;
                break;
        case ITEM_TCP:
                if (item->type != RTE_FLOW_ITEM_TYPE_TCP)
                        goto trans;
                mask.tcp = mlx5_nl_flow_item_mask
                        (item, &rte_flow_item_tcp_mask,
                         &mlx5_nl_flow_mask_supported.tcp,
                         &mlx5_nl_flow_mask_empty.tcp,
                         sizeof(mlx5_nl_flow_mask_supported.tcp), error);
                if (!mask.tcp)
                        return -rte_errno;
                if (!ip_proto_set &&
                    !mnl_attr_put_u8_check(buf, size,
                                           TCA_FLOWER_KEY_IP_PROTO,
                                           IPPROTO_TCP))
                        goto error_nobufs;
                if (mask.tcp == &mlx5_nl_flow_mask_empty.tcp) {
                        ++item;
                        break;
                }
                spec.tcp = item->spec;
                if ((mask.tcp->hdr.src_port &&
                     mask.tcp->hdr.src_port != RTE_BE16(0xffff)) ||
                    (mask.tcp->hdr.dst_port &&
                     mask.tcp->hdr.dst_port != RTE_BE16(0xffff)))
                        return rte_flow_error_set
                                (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
                                 mask.tcp,
                                 "no support for partial masks on"
                                 " \"hdr.src_port\" and \"hdr.dst_port\""
                                 " fields");
                if ((mask.tcp->hdr.src_port &&
                     (!mnl_attr_put_u16_check(buf, size,
                                              TCA_FLOWER_KEY_TCP_SRC,
                                              spec.tcp->hdr.src_port) ||
                      !mnl_attr_put_u16_check(buf, size,
                                              TCA_FLOWER_KEY_TCP_SRC_MASK,
                                              mask.tcp->hdr.src_port))) ||
                    (mask.tcp->hdr.dst_port &&
                     (!mnl_attr_put_u16_check(buf, size,
                                              TCA_FLOWER_KEY_TCP_DST,
                                              spec.tcp->hdr.dst_port) ||
                      !mnl_attr_put_u16_check(buf, size,
                                              TCA_FLOWER_KEY_TCP_DST_MASK,
                                              mask.tcp->hdr.dst_port))))
                        goto error_nobufs;
                ++item;
                break;
        case ITEM_UDP:
                if (item->type != RTE_FLOW_ITEM_TYPE_UDP)
                        goto trans;
                mask.udp = mlx5_nl_flow_item_mask
                        (item, &rte_flow_item_udp_mask,
                         &mlx5_nl_flow_mask_supported.udp,
                         &mlx5_nl_flow_mask_empty.udp,
                         sizeof(mlx5_nl_flow_mask_supported.udp), error);
                if (!mask.udp)
                        return -rte_errno;
                if (!ip_proto_set &&
                    !mnl_attr_put_u8_check(buf, size,
                                           TCA_FLOWER_KEY_IP_PROTO,
                                           IPPROTO_UDP))
                        goto error_nobufs;
                if (mask.udp == &mlx5_nl_flow_mask_empty.udp) {
                        ++item;
                        break;
                }
                spec.udp = item->spec;
                if ((mask.udp->hdr.src_port &&
                     mask.udp->hdr.src_port != RTE_BE16(0xffff)) ||
                    (mask.udp->hdr.dst_port &&
                     mask.udp->hdr.dst_port != RTE_BE16(0xffff)))
                        return rte_flow_error_set
                                (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
                                 mask.udp,
                                 "no support for partial masks on"
                                 " \"hdr.src_port\" and \"hdr.dst_port\""
                                 " fields");
                if ((mask.udp->hdr.src_port &&
                     (!mnl_attr_put_u16_check(buf, size,
                                              TCA_FLOWER_KEY_UDP_SRC,
                                              spec.udp->hdr.src_port) ||
                      !mnl_attr_put_u16_check(buf, size,
                                              TCA_FLOWER_KEY_UDP_SRC_MASK,
                                              mask.udp->hdr.src_port))) ||
                    (mask.udp->hdr.dst_port &&
                     (!mnl_attr_put_u16_check(buf, size,
                                              TCA_FLOWER_KEY_UDP_DST,
                                              spec.udp->hdr.dst_port) ||
                      !mnl_attr_put_u16_check(buf, size,
                                              TCA_FLOWER_KEY_UDP_DST_MASK,
                                              mask.udp->hdr.dst_port))))
                        goto error_nobufs;
                ++item;
                break;
        case ACTIONS:
                if (item->type != RTE_FLOW_ITEM_TYPE_END)
                        goto trans;
                assert(na_flower);
                assert(!na_flower_act);
                na_flower_act =
                        mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT);
                if (!na_flower_act)
                        goto error_nobufs;
                act_index_cur = 1;
                break;
        case ACTION_VOID:
                if (action->type != RTE_FLOW_ACTION_TYPE_VOID)
                        goto trans;
                ++action;
                break;
        case ACTION_PORT_ID:
                if (action->type != RTE_FLOW_ACTION_TYPE_PORT_ID)
                        goto trans;
                conf.port_id = action->conf;
                if (conf.port_id->original)
                        i = 0;
                else
                        for (i = 0; ptoi[i].ifindex; ++i)
                                if (ptoi[i].port_id == conf.port_id->id)
                                        break;
                if (!ptoi[i].ifindex)
                        return rte_flow_error_set
                                (error, ENODEV, RTE_FLOW_ERROR_TYPE_ACTION_CONF,
                                 conf.port_id,
                                 "missing data to convert port ID to ifindex");
                act_index =
                        mnl_attr_nest_start_check(buf, size, act_index_cur++);
                if (!act_index ||
                    !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "mirred"))
                        goto error_nobufs;
                act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS);
                if (!act)
                        goto error_nobufs;
                if (!mnl_attr_put_check(buf, size, TCA_MIRRED_PARMS,
                                        sizeof(struct tc_mirred),
                                        &(struct tc_mirred){
                                                .action = TC_ACT_STOLEN,
                                                .eaction = TCA_EGRESS_REDIR,
                                                .ifindex = ptoi[i].ifindex,
                                        }))
                        goto error_nobufs;
                mnl_attr_nest_end(buf, act);
                mnl_attr_nest_end(buf, act_index);
                ++action;
                break;
        case ACTION_DROP:
                if (action->type != RTE_FLOW_ACTION_TYPE_DROP)
                        goto trans;
                act_index =
                        mnl_attr_nest_start_check(buf, size, act_index_cur++);
                if (!act_index ||
                    !mnl_attr_put_strz_check(buf, size, TCA_ACT_KIND, "gact"))
                        goto error_nobufs;
                act = mnl_attr_nest_start_check(buf, size, TCA_ACT_OPTIONS);
                if (!act)
                        goto error_nobufs;
                if (!mnl_attr_put_check(buf, size, TCA_GACT_PARMS,
                                        sizeof(struct tc_gact),
                                        &(struct tc_gact){
                                                .action = TC_ACT_SHOT,
                                        }))
                        goto error_nobufs;
                mnl_attr_nest_end(buf, act);
                mnl_attr_nest_end(buf, act_index);
                ++action;
                break;
        case END:
                if (item->type != RTE_FLOW_ITEM_TYPE_END ||
                    action->type != RTE_FLOW_ACTION_TYPE_END)
                        goto trans;
                if (na_flower_act)
                        mnl_attr_nest_end(buf, na_flower_act);
                if (na_flower)
                        mnl_attr_nest_end(buf, na_flower);
                nlh = buf;
                return nlh->nlmsg_len;
        }
        back = trans;
        trans = mlx5_nl_flow_trans[trans[n - 1]];
        n = 0;
        goto trans;
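        /*
         * Reached when the output buffer runs out of space. If the
         * caller-provided buffer was in use, restart the whole
         * transposition in the temporary stack buffer: the caller's
         * buffer keeps the truncated attempt while the exact message
         * size can still be computed and returned, as documented in
         * the @return section above.
         */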
error_nobufs:
        if (buf != buf_tmp) {
                buf = buf_tmp;
                size = sizeof(buf_tmp);
                goto init;
        }
        return rte_flow_error_set
                (error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
                 "generated TC message is too large");
}

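/*
 * Illustrative two-pass use of mlx5_nl_flow_transpose() (assumed
 * caller-side code, not part of this file). Since the return value is
 * the exact message size regardless of the size argument, a caller can
 * measure first and allocate second; the last ptoi entry below is the
 * zero-ifindex terminator expected by the function:
 *
 *   struct mlx5_nl_flow_ptoi ptoi[] = {
 *           { .port_id = port_id, .ifindex = ifindex },
 *           { .ifindex = 0 },
 *   };
 *   struct rte_flow_error error;
 *   int len = mlx5_nl_flow_transpose(NULL, 0, ptoi, attr, pattern,
 *                                    actions, &error);
 *
 *   if (len > 0) {
 *           void *buf = malloc(len);
 *
 *           if (buf)
 *                   len = mlx5_nl_flow_transpose(buf, len, ptoi, attr,
 *                                                pattern, actions,
 *                                                &error);
 *   }
 */
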
/**
 * Brand rtnetlink buffer with unique handle.
 *
 * This handle should be unique for a given network interface to avoid
 * collisions.
 *
 * @param buf
 *   Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
 * @param handle
 *   Unique 32-bit handle to use.
 */
void
mlx5_nl_flow_brand(void *buf, uint32_t handle)
{
        struct tcmsg *tcm = mnl_nlmsg_get_payload(buf);

        tcm->tcm_handle = handle;
}

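/*
 * Continuing the sketch above (assumed caller-side code): a transposed
 * buffer is typically branded with a caller-chosen handle before
 * submission, so the same buffer can later destroy the rule without a
 * kernel lookup.
 *
 *   mlx5_nl_flow_brand(buf, handle);
 *   ret = mlx5_nl_flow_create(nl, buf, &error);
 */
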
/**
 * Send Netlink message with acknowledgment.
 *
 * @param nl
 *   Libmnl socket to use.
 * @param nlh
 *   Message to send. This function always raises the NLM_F_ACK flag before
 *   sending.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
{
        alignas(struct nlmsghdr)
        uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
                    nlh->nlmsg_len - sizeof(*nlh)];
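        /*
         * Note: the answer buffer above is sized for an error message
         * that echoes the entire request payload, the worst case for a
         * kernel acknowledgment; with NETLINK_CAP_ACK enabled on the
         * socket (see mlx5_nl_flow_socket_create() below) the kernel is
         * expected to omit that echo, making this a conservative upper
         * bound.
         */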
        uint32_t seq = random();
        int ret;

        nlh->nlmsg_flags |= NLM_F_ACK;
        nlh->nlmsg_seq = seq;
        ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
        if (ret != -1)
                ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
        if (ret != -1)
                ret = mnl_cb_run
                        (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
        if (!ret)
                return 0;
        rte_errno = errno;
        return -rte_errno;
}

/**
 * Create a Netlink flow rule.
 *
 * @param nl
 *   Libmnl socket to use.
 * @param buf
 *   Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_nl_flow_create(struct mnl_socket *nl, void *buf,
                    struct rte_flow_error *error)
{
        struct nlmsghdr *nlh = buf;

        nlh->nlmsg_type = RTM_NEWTFILTER;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
        if (!mlx5_nl_flow_nl_ack(nl, nlh))
                return 0;
        return rte_flow_error_set
                (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
                 "netlink: failed to create TC flow rule");
}

/**
 * Destroy a Netlink flow rule.
 *
 * @param nl
 *   Libmnl socket to use.
 * @param buf
 *   Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf,
                     struct rte_flow_error *error)
{
        struct nlmsghdr *nlh = buf;

        nlh->nlmsg_type = RTM_DELTFILTER;
        nlh->nlmsg_flags = NLM_F_REQUEST;
        if (!mlx5_nl_flow_nl_ack(nl, nlh))
                return 0;
        return rte_flow_error_set
                (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
                 "netlink: failed to destroy TC flow rule");
}
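
/*
 * Note that the two functions above reuse the very buffer produced by
 * mlx5_nl_flow_transpose(): only nlmsg_type and nlmsg_flags differ
 * between creation and destruction, while the handle set through
 * mlx5_nl_flow_brand() and the priority stored in the tcmsg header
 * identify the filter to delete.
 */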

/**
 * Initialize ingress qdisc of a given network interface.
 *
 * @param nl
 *   Libmnl socket of the @p NETLINK_ROUTE kind.
 * @param ifindex
 *   Index of network interface to initialize.
 * @param[out] error
 *   Perform verbose error reporting if not NULL.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
                  struct rte_flow_error *error)
{
        struct nlmsghdr *nlh;
        struct tcmsg *tcm;
        alignas(struct nlmsghdr)
        uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];

        /* Destroy existing ingress qdisc and everything attached to it. */
        nlh = mnl_nlmsg_put_header(buf);
        nlh->nlmsg_type = RTM_DELQDISC;
        nlh->nlmsg_flags = NLM_F_REQUEST;
        tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm_ifindex = ifindex;
        tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
        tcm->tcm_parent = TC_H_INGRESS;
        /* Ignore errors when qdisc is already absent. */
        if (mlx5_nl_flow_nl_ack(nl, nlh) &&
            rte_errno != EINVAL && rte_errno != ENOENT)
                return rte_flow_error_set
                        (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
                         NULL, "netlink: failed to remove ingress qdisc");
        /* Create fresh ingress qdisc. */
        nlh = mnl_nlmsg_put_header(buf);
        nlh->nlmsg_type = RTM_NEWQDISC;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
        tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm_ifindex = ifindex;
        tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
        tcm->tcm_parent = TC_H_INGRESS;
        mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
        if (mlx5_nl_flow_nl_ack(nl, nlh))
                return rte_flow_error_set
                        (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
                         NULL, "netlink: failed to create ingress qdisc");
        return 0;
}

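/*
 * The function above is roughly equivalent to running the following
 * tc(8) commands on the target interface ("ethX" is a placeholder):
 *
 *   tc qdisc del dev ethX ingress
 *   tc qdisc add dev ethX ingress
 *
 * Flower filters transposed by mlx5_nl_flow_transpose() are then
 * attached to this ingress qdisc.
 */
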
/**
 * Create and configure a libmnl socket for Netlink flow rules.
 *
 * @return
 *   A valid libmnl socket object pointer on success, NULL otherwise and
 *   rte_errno is set.
 */
struct mnl_socket *
mlx5_nl_flow_socket_create(void)
{
        struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);

        if (nl) {
                mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
                                      sizeof(int));
                if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
                        return nl;
        }
        rte_errno = errno;
        if (nl)
                mnl_socket_close(nl);
        return NULL;
}

/**
 * Destroy a libmnl socket.
 */
void
mlx5_nl_flow_socket_destroy(struct mnl_socket *nl)
{
        mnl_socket_close(nl);
}