net/mlx5: support e-switch TCP-flags flow filter
[dpdk.git] / drivers / net / mlx5 / mlx5_flow_tcf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5
6 #include <assert.h>
7 #include <errno.h>
8 #include <libmnl/libmnl.h>
9 #include <linux/if_ether.h>
10 #include <linux/netlink.h>
11 #include <linux/pkt_cls.h>
12 #include <linux/pkt_sched.h>
13 #include <linux/rtnetlink.h>
14 #include <linux/tc_act/tc_gact.h>
15 #include <linux/tc_act/tc_mirred.h>
16 #include <netinet/in.h>
17 #include <stdalign.h>
18 #include <stdbool.h>
19 #include <stddef.h>
20 #include <stdint.h>
21 #include <stdlib.h>
22 #include <sys/socket.h>
23
24 #include <rte_byteorder.h>
25 #include <rte_errno.h>
26 #include <rte_ether.h>
27 #include <rte_flow.h>
28 #include <rte_malloc.h>
29
30 #include "mlx5.h"
31 #include "mlx5_flow.h"
32 #include "mlx5_autoconf.h"
33
#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

/*
 * Fallback definitions for build environments whose kernel headers lack
 * linux/tc_act/tc_vlan.h.  Values replicate the kernel UAPI and must not
 * be changed.
 */
#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

/* Mirror of struct tc_vlan from the kernel header. */
struct tc_vlan {
	tc_gen;
	int v_action;
};

#endif /* HAVE_TC_ACT_VLAN */
55
/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/*
 * Normally found in linux/pkt_cls.h.
 *
 * The TCA_FLOWER_* attribute values below match the kernel UAPI; they are
 * defined here only when the local headers are too old to provide them
 * (detection is done at configure time via the HAVE_* macros).
 */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
#define TCA_FLOWER_KEY_TCP_FLAGS 71
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
#define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
#endif

/* Length in bytes of an IPv6 address. */
#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif
161
/**
 * Empty masks for known item types.
 *
 * Zero-initialized by static storage duration; flow_tcf_item_mask()
 * returns the relevant member when an item carries no spec at all.
 */
static const union {
	struct rte_flow_item_port_id port_id;
	struct rte_flow_item_eth eth;
	struct rte_flow_item_vlan vlan;
	struct rte_flow_item_ipv4 ipv4;
	struct rte_flow_item_ipv6 ipv6;
	struct rte_flow_item_tcp tcp;
	struct rte_flow_item_udp udp;
} flow_tcf_mask_empty;
172
/**
 * Supported masks for known item types.
 *
 * Any mask bit set outside these fields is rejected by
 * flow_tcf_item_mask() with ENOTSUP.
 */
static const struct {
	struct rte_flow_item_port_id port_id;
	struct rte_flow_item_eth eth;
	struct rte_flow_item_vlan vlan;
	struct rte_flow_item_ipv4 ipv4;
	struct rte_flow_item_ipv6 ipv6;
	struct rte_flow_item_tcp tcp;
	struct rte_flow_item_udp udp;
} flow_tcf_mask_supported = {
	.port_id = {
		.id = 0xffffffff,
	},
	.eth = {
		.type = RTE_BE16(0xffff),
		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
		.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
	},
	.vlan = {
		/* PCP and VID only, no DEI. */
		.tci = RTE_BE16(0xefff),
		.inner_type = RTE_BE16(0xffff),
	},
	.ipv4.hdr = {
		.next_proto_id = 0xff,
		.src_addr = RTE_BE32(0xffffffff),
		.dst_addr = RTE_BE32(0xffffffff),
	},
	.ipv6.hdr = {
		.proto = 0xff,
		.src_addr =
			"\xff\xff\xff\xff\xff\xff\xff\xff"
			"\xff\xff\xff\xff\xff\xff\xff\xff",
		.dst_addr =
			"\xff\xff\xff\xff\xff\xff\xff\xff"
			"\xff\xff\xff\xff\xff\xff\xff\xff",
	},
	.tcp.hdr = {
		.src_port = RTE_BE16(0xffff),
		.dst_port = RTE_BE16(0xffff),
		/* Whole TCP flags byte is matchable (TCP-flags filter). */
		.tcp_flags = 0xff,
	},
	.udp.hdr = {
		.src_port = RTE_BE16(0xffff),
		.dst_port = RTE_BE16(0xffff),
	},
};
220
/* Aligned size of a netlink attribute header. */
#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
/* A nesting attribute contributes only its own header. */
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
/* Aligned size of an attribute carrying a payload of @p len bytes. */
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
/* Aligned size of an attribute carrying an object of type @p typ. */
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
/* Aligned size of an attribute carrying NUL-terminated string @p str. */
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)

/*
 * Two extra slots on top of the port count: one in case the device has
 * no switch domain (a single own-port entry is synthesized) and one for
 * the zero-ifindex terminator written by flow_tcf_build_ptoi_table().
 */
#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
	uint16_t port_id; /**< DPDK port ID. */
	unsigned int ifindex; /**< Network interface index. */
};

/* Fate actions: validation allows at most one of these per flow rule. */
#define MLX5_TCF_FATE_ACTIONS (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID)
/* All VLAN manipulation actions handled through TC flower. */
#define MLX5_TCF_VLAN_ACTIONS \
	(MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
	 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
239
240 /**
241  * Retrieve mask for pattern item.
242  *
243  * This function does basic sanity checks on a pattern item in order to
244  * return the most appropriate mask for it.
245  *
246  * @param[in] item
247  *   Item specification.
248  * @param[in] mask_default
249  *   Default mask for pattern item as specified by the flow API.
250  * @param[in] mask_supported
251  *   Mask fields supported by the implementation.
252  * @param[in] mask_empty
253  *   Empty mask to return when there is no specification.
254  * @param[out] error
255  *   Perform verbose error reporting if not NULL.
256  *
257  * @return
258  *   Either @p item->mask or one of the mask parameters on success, NULL
259  *   otherwise and rte_errno is set.
260  */
261 static const void *
262 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
263                    const void *mask_supported, const void *mask_empty,
264                    size_t mask_size, struct rte_flow_error *error)
265 {
266         const uint8_t *mask;
267         size_t i;
268
269         /* item->last and item->mask cannot exist without item->spec. */
270         if (!item->spec && (item->mask || item->last)) {
271                 rte_flow_error_set(error, EINVAL,
272                                    RTE_FLOW_ERROR_TYPE_ITEM, item,
273                                    "\"mask\" or \"last\" field provided without"
274                                    " a corresponding \"spec\"");
275                 return NULL;
276         }
277         /* No spec, no mask, no problem. */
278         if (!item->spec)
279                 return mask_empty;
280         mask = item->mask ? item->mask : mask_default;
281         assert(mask);
282         /*
283          * Single-pass check to make sure that:
284          * - Mask is supported, no bits are set outside mask_supported.
285          * - Both item->spec and item->last are included in mask.
286          */
287         for (i = 0; i != mask_size; ++i) {
288                 if (!mask[i])
289                         continue;
290                 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
291                     ((const uint8_t *)mask_supported)[i]) {
292                         rte_flow_error_set(error, ENOTSUP,
293                                            RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
294                                            "unsupported field found"
295                                            " in \"mask\"");
296                         return NULL;
297                 }
298                 if (item->last &&
299                     (((const uint8_t *)item->spec)[i] & mask[i]) !=
300                     (((const uint8_t *)item->last)[i] & mask[i])) {
301                         rte_flow_error_set(error, EINVAL,
302                                            RTE_FLOW_ERROR_TYPE_ITEM_LAST,
303                                            item->last,
304                                            "range between \"spec\" and \"last\""
305                                            " not comprised in \"mask\"");
306                         return NULL;
307                 }
308         }
309         return mask;
310 }
311
312 /**
313  * Build a conversion table between port ID and ifindex.
314  *
315  * @param[in] dev
316  *   Pointer to Ethernet device.
317  * @param[out] ptoi
318  *   Pointer to ptoi table.
319  * @param[in] len
320  *   Size of ptoi table provided.
321  *
322  * @return
323  *   Size of ptoi table filled.
324  */
325 static unsigned int
326 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
327                           unsigned int len)
328 {
329         unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
330         uint16_t port_id[n + 1];
331         unsigned int i;
332         unsigned int own = 0;
333
334         /* At least one port is needed when no switch domain is present. */
335         if (!n) {
336                 n = 1;
337                 port_id[0] = dev->data->port_id;
338         } else {
339                 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
340         }
341         if (n > len)
342                 return 0;
343         for (i = 0; i != n; ++i) {
344                 struct rte_eth_dev_info dev_info;
345
346                 rte_eth_dev_info_get(port_id[i], &dev_info);
347                 if (port_id[i] == dev->data->port_id)
348                         own = i;
349                 ptoi[i].port_id = port_id[i];
350                 ptoi[i].ifindex = dev_info.if_index;
351         }
352         /* Ensure first entry of ptoi[] is the current device. */
353         if (own) {
354                 ptoi[n] = ptoi[0];
355                 ptoi[0] = ptoi[own];
356                 ptoi[own] = ptoi[n];
357         }
358         /* An entry with zero ifindex terminates ptoi[]. */
359         ptoi[n].port_id = 0;
360         ptoi[n].ifindex = 0;
361         return n;
362 }
363
364 /**
365  * Verify the @p attr will be correctly understood by the E-switch.
366  *
367  * @param[in] attr
368  *   Pointer to flow attributes
369  * @param[out] error
370  *   Pointer to error structure.
371  *
372  * @return
373  *   0 on success, a negative errno value otherwise and rte_errno is set.
374  */
375 static int
376 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
377                              struct rte_flow_error *error)
378 {
379         /*
380          * Supported attributes: no groups, some priorities and ingress only.
381          * Don't care about transfer as it is the caller's problem.
382          */
383         if (attr->group)
384                 return rte_flow_error_set(error, ENOTSUP,
385                                           RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
386                                           "groups are not supported");
387         if (attr->priority > 0xfffe)
388                 return rte_flow_error_set(error, ENOTSUP,
389                                           RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
390                                           attr,
391                                           "lowest priority level is 0xfffe");
392         if (!attr->ingress)
393                 return rte_flow_error_set(error, EINVAL,
394                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
395                                           attr, "only ingress is supported");
396         if (attr->egress)
397                 return rte_flow_error_set(error, ENOTSUP,
398                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
399                                           attr, "egress is not supported");
400         return 0;
401 }
402
403 /**
404  * Validate flow for E-Switch.
405  *
406  * @param[in] priv
407  *   Pointer to the priv structure.
408  * @param[in] attr
409  *   Pointer to the flow attributes.
410  * @param[in] items
411  *   Pointer to the list of items.
412  * @param[in] actions
413  *   Pointer to the list of actions.
414  * @param[out] error
415  *   Pointer to the error structure.
416  *
417  * @return
418  *   0 on success, a negative errno value otherwise and rte_ernno is set.
419  */
420 static int
421 flow_tcf_validate(struct rte_eth_dev *dev,
422                   const struct rte_flow_attr *attr,
423                   const struct rte_flow_item items[],
424                   const struct rte_flow_action actions[],
425                   struct rte_flow_error *error)
426 {
427         union {
428                 const struct rte_flow_item_port_id *port_id;
429                 const struct rte_flow_item_eth *eth;
430                 const struct rte_flow_item_vlan *vlan;
431                 const struct rte_flow_item_ipv4 *ipv4;
432                 const struct rte_flow_item_ipv6 *ipv6;
433                 const struct rte_flow_item_tcp *tcp;
434                 const struct rte_flow_item_udp *udp;
435         } spec, mask;
436         union {
437                 const struct rte_flow_action_port_id *port_id;
438                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
439                 const struct rte_flow_action_of_set_vlan_vid *
440                         of_set_vlan_vid;
441                 const struct rte_flow_action_of_set_vlan_pcp *
442                         of_set_vlan_pcp;
443         } conf;
444         uint32_t item_flags = 0;
445         uint32_t action_flags = 0;
446         uint8_t next_protocol = -1;
447         unsigned int tcm_ifindex = 0;
448         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
449         struct rte_eth_dev *port_id_dev = NULL;
450         bool in_port_id_set;
451         int ret;
452
453         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
454                                                 PTOI_TABLE_SZ_MAX(dev)));
455         ret = flow_tcf_validate_attributes(attr, error);
456         if (ret < 0)
457                 return ret;
458         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
459                 unsigned int i;
460
461                 switch (items->type) {
462                 case RTE_FLOW_ITEM_TYPE_VOID:
463                         break;
464                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
465                         mask.port_id = flow_tcf_item_mask
466                                 (items, &rte_flow_item_port_id_mask,
467                                  &flow_tcf_mask_supported.port_id,
468                                  &flow_tcf_mask_empty.port_id,
469                                  sizeof(flow_tcf_mask_supported.port_id),
470                                  error);
471                         if (!mask.port_id)
472                                 return -rte_errno;
473                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
474                                 in_port_id_set = 1;
475                                 break;
476                         }
477                         spec.port_id = items->spec;
478                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
479                                 return rte_flow_error_set
480                                         (error, ENOTSUP,
481                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
482                                          mask.port_id,
483                                          "no support for partial mask on"
484                                          " \"id\" field");
485                         if (!mask.port_id->id)
486                                 i = 0;
487                         else
488                                 for (i = 0; ptoi[i].ifindex; ++i)
489                                         if (ptoi[i].port_id == spec.port_id->id)
490                                                 break;
491                         if (!ptoi[i].ifindex)
492                                 return rte_flow_error_set
493                                         (error, ENODEV,
494                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
495                                          spec.port_id,
496                                          "missing data to convert port ID to"
497                                          " ifindex");
498                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
499                                 return rte_flow_error_set
500                                         (error, ENOTSUP,
501                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
502                                          spec.port_id,
503                                          "cannot match traffic for"
504                                          " several port IDs through"
505                                          " a single flow rule");
506                         tcm_ifindex = ptoi[i].ifindex;
507                         in_port_id_set = 1;
508                         break;
509                 case RTE_FLOW_ITEM_TYPE_ETH:
510                         ret = mlx5_flow_validate_item_eth(items, item_flags,
511                                                           error);
512                         if (ret < 0)
513                                 return ret;
514                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
515                         /* TODO:
516                          * Redundant check due to different supported mask.
517                          * Same for the rest of items.
518                          */
519                         mask.eth = flow_tcf_item_mask
520                                 (items, &rte_flow_item_eth_mask,
521                                  &flow_tcf_mask_supported.eth,
522                                  &flow_tcf_mask_empty.eth,
523                                  sizeof(flow_tcf_mask_supported.eth),
524                                  error);
525                         if (!mask.eth)
526                                 return -rte_errno;
527                         if (mask.eth->type && mask.eth->type !=
528                             RTE_BE16(0xffff))
529                                 return rte_flow_error_set
530                                         (error, ENOTSUP,
531                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
532                                          mask.eth,
533                                          "no support for partial mask on"
534                                          " \"type\" field");
535                         break;
536                 case RTE_FLOW_ITEM_TYPE_VLAN:
537                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
538                                                            error);
539                         if (ret < 0)
540                                 return ret;
541                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
542                         mask.vlan = flow_tcf_item_mask
543                                 (items, &rte_flow_item_vlan_mask,
544                                  &flow_tcf_mask_supported.vlan,
545                                  &flow_tcf_mask_empty.vlan,
546                                  sizeof(flow_tcf_mask_supported.vlan),
547                                  error);
548                         if (!mask.vlan)
549                                 return -rte_errno;
550                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
551                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
552                               RTE_BE16(0xe000)) ||
553                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
554                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
555                               RTE_BE16(0x0fff)) ||
556                             (mask.vlan->inner_type &&
557                              mask.vlan->inner_type != RTE_BE16(0xffff)))
558                                 return rte_flow_error_set
559                                         (error, ENOTSUP,
560                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
561                                          mask.vlan,
562                                          "no support for partial masks on"
563                                          " \"tci\" (PCP and VID parts) and"
564                                          " \"inner_type\" fields");
565                         break;
566                 case RTE_FLOW_ITEM_TYPE_IPV4:
567                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
568                                                            error);
569                         if (ret < 0)
570                                 return ret;
571                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
572                         mask.ipv4 = flow_tcf_item_mask
573                                 (items, &rte_flow_item_ipv4_mask,
574                                  &flow_tcf_mask_supported.ipv4,
575                                  &flow_tcf_mask_empty.ipv4,
576                                  sizeof(flow_tcf_mask_supported.ipv4),
577                                  error);
578                         if (!mask.ipv4)
579                                 return -rte_errno;
580                         if (mask.ipv4->hdr.next_proto_id &&
581                             mask.ipv4->hdr.next_proto_id != 0xff)
582                                 return rte_flow_error_set
583                                         (error, ENOTSUP,
584                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
585                                          mask.ipv4,
586                                          "no support for partial mask on"
587                                          " \"hdr.next_proto_id\" field");
588                         else if (mask.ipv4->hdr.next_proto_id)
589                                 next_protocol =
590                                         ((const struct rte_flow_item_ipv4 *)
591                                          (items->spec))->hdr.next_proto_id;
592                         break;
593                 case RTE_FLOW_ITEM_TYPE_IPV6:
594                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
595                                                            error);
596                         if (ret < 0)
597                                 return ret;
598                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
599                         mask.ipv6 = flow_tcf_item_mask
600                                 (items, &rte_flow_item_ipv6_mask,
601                                  &flow_tcf_mask_supported.ipv6,
602                                  &flow_tcf_mask_empty.ipv6,
603                                  sizeof(flow_tcf_mask_supported.ipv6),
604                                  error);
605                         if (!mask.ipv6)
606                                 return -rte_errno;
607                         if (mask.ipv6->hdr.proto &&
608                             mask.ipv6->hdr.proto != 0xff)
609                                 return rte_flow_error_set
610                                         (error, ENOTSUP,
611                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
612                                          mask.ipv6,
613                                          "no support for partial mask on"
614                                          " \"hdr.proto\" field");
615                         else if (mask.ipv6->hdr.proto)
616                                 next_protocol =
617                                         ((const struct rte_flow_item_ipv6 *)
618                                          (items->spec))->hdr.proto;
619                         break;
620                 case RTE_FLOW_ITEM_TYPE_UDP:
621                         ret = mlx5_flow_validate_item_udp(items, item_flags,
622                                                           next_protocol, error);
623                         if (ret < 0)
624                                 return ret;
625                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
626                         mask.udp = flow_tcf_item_mask
627                                 (items, &rte_flow_item_udp_mask,
628                                  &flow_tcf_mask_supported.udp,
629                                  &flow_tcf_mask_empty.udp,
630                                  sizeof(flow_tcf_mask_supported.udp),
631                                  error);
632                         if (!mask.udp)
633                                 return -rte_errno;
634                         break;
635                 case RTE_FLOW_ITEM_TYPE_TCP:
636                         ret = mlx5_flow_validate_item_tcp
637                                              (items, item_flags,
638                                               next_protocol,
639                                               &flow_tcf_mask_supported.tcp,
640                                               error);
641                         if (ret < 0)
642                                 return ret;
643                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
644                         mask.tcp = flow_tcf_item_mask
645                                 (items, &rte_flow_item_tcp_mask,
646                                  &flow_tcf_mask_supported.tcp,
647                                  &flow_tcf_mask_empty.tcp,
648                                  sizeof(flow_tcf_mask_supported.tcp),
649                                  error);
650                         if (!mask.tcp)
651                                 return -rte_errno;
652                         break;
653                 default:
654                         return rte_flow_error_set(error, ENOTSUP,
655                                                   RTE_FLOW_ERROR_TYPE_ITEM,
656                                                   NULL, "item not supported");
657                 }
658         }
659         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
660                 unsigned int i;
661
662                 switch (actions->type) {
663                 case RTE_FLOW_ACTION_TYPE_VOID:
664                         break;
665                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
666                         if (action_flags & MLX5_TCF_FATE_ACTIONS)
667                                 return rte_flow_error_set
668                                         (error, EINVAL,
669                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
670                                          "can't have multiple fate actions");
671                         conf.port_id = actions->conf;
672                         if (conf.port_id->original)
673                                 i = 0;
674                         else
675                                 for (i = 0; ptoi[i].ifindex; ++i)
676                                         if (ptoi[i].port_id == conf.port_id->id)
677                                                 break;
678                         if (!ptoi[i].ifindex)
679                                 return rte_flow_error_set
680                                         (error, ENODEV,
681                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
682                                          conf.port_id,
683                                          "missing data to convert port ID to"
684                                          " ifindex");
685                         action_flags |= MLX5_FLOW_ACTION_PORT_ID;
686                         port_id_dev = &rte_eth_devices[conf.port_id->id];
687                         break;
688                 case RTE_FLOW_ACTION_TYPE_DROP:
689                         if (action_flags & MLX5_TCF_FATE_ACTIONS)
690                                 return rte_flow_error_set
691                                         (error, EINVAL,
692                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
693                                          "can't have multiple fate actions");
694                         action_flags |= MLX5_FLOW_ACTION_DROP;
695                         break;
696                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
697                         action_flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
698                         break;
699                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
700                         action_flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
701                         break;
702                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
703                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
704                                 return rte_flow_error_set
705                                         (error, ENOTSUP,
706                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
707                                          "vlan modify is not supported,"
708                                          " set action must follow push action");
709                         action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
710                         break;
711                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
712                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
713                                 return rte_flow_error_set
714                                         (error, ENOTSUP,
715                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
716                                          "vlan modify is not supported,"
717                                          " set action must follow push action");
718                         action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
719                         break;
720                 default:
721                         return rte_flow_error_set(error, ENOTSUP,
722                                                   RTE_FLOW_ERROR_TYPE_ACTION,
723                                                   actions,
724                                                   "action not supported");
725                 }
726         }
727         /*
728          * FW syndrome (0xA9C090):
729          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
730          *     forward to the uplink.
731          */
732         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
733             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
734             ((struct priv *)port_id_dev->data->dev_private)->representor)
735                 return rte_flow_error_set(error, ENOTSUP,
736                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
737                                           "vlan push can only be applied"
738                                           " when forwarding to uplink port");
739         /*
740          * FW syndrome (0x294609):
741          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
742          *     are supported only while forwarding to vport.
743          */
744         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
745             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
746                 return rte_flow_error_set(error, ENOTSUP,
747                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
748                                           "vlan actions are supported"
749                                           " only with port_id action");
750         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
751                 return rte_flow_error_set(error, EINVAL,
752                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
753                                           "no fate action is found");
754         return 0;
755 }
756
757 /**
758  * Calculate maximum size of memory for flow items of Linux TC flower and
759  * extract specified items.
760  *
761  * @param[in] items
762  *   Pointer to the list of items.
763  * @param[out] item_flags
764  *   Pointer to the detected items.
765  *
766  * @return
767  *   Maximum size of memory for items.
768  */
769 static int
770 flow_tcf_get_items_and_size(const struct rte_flow_item items[],
771                             uint64_t *item_flags)
772 {
773         int size = 0;
774         uint64_t flags = 0;
775
776         size += SZ_NLATTR_STRZ_OF("flower") +
777                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
778                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
779         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
780                 switch (items->type) {
781                 case RTE_FLOW_ITEM_TYPE_VOID:
782                         break;
783                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
784                         break;
785                 case RTE_FLOW_ITEM_TYPE_ETH:
786                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
787                                 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
788                                 /* dst/src MAC addr and mask. */
789                         flags |= MLX5_FLOW_LAYER_OUTER_L2;
790                         break;
791                 case RTE_FLOW_ITEM_TYPE_VLAN:
792                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
793                                 SZ_NLATTR_TYPE_OF(uint16_t) +
794                                 /* VLAN Ether type. */
795                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
796                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
797                         flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
798                         break;
799                 case RTE_FLOW_ITEM_TYPE_IPV4:
800                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
801                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
802                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
803                                 /* dst/src IP addr and mask. */
804                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
805                         break;
806                 case RTE_FLOW_ITEM_TYPE_IPV6:
807                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
808                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
809                                 SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 4;
810                                 /* dst/src IP addr and mask. */
811                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
812                         break;
813                 case RTE_FLOW_ITEM_TYPE_UDP:
814                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
815                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
816                                 /* dst/src port and mask. */
817                         flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
818                         break;
819                 case RTE_FLOW_ITEM_TYPE_TCP:
820                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
821                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
822                                 /* dst/src port and mask. */
823                         flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
824                         break;
825                 default:
826                         DRV_LOG(WARNING,
827                                 "unsupported item %p type %d,"
828                                 " items must be validated before flow creation",
829                                 (const void *)items, items->type);
830                         break;
831                 }
832         }
833         *item_flags = flags;
834         return size;
835 }
836
837 /**
838  * Calculate maximum size of memory for flow actions of Linux TC flower and
839  * extract specified actions.
840  *
841  * @param[in] actions
842  *   Pointer to the list of actions.
843  * @param[out] action_flags
844  *   Pointer to the detected actions.
845  *
846  * @return
847  *   Maximum size of memory for actions.
848  */
849 static int
850 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
851                               uint64_t *action_flags)
852 {
853         int size = 0;
854         uint64_t flags = 0;
855
856         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
857         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
858                 switch (actions->type) {
859                 case RTE_FLOW_ACTION_TYPE_VOID:
860                         break;
861                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
862                         size += SZ_NLATTR_NEST + /* na_act_index. */
863                                 SZ_NLATTR_STRZ_OF("mirred") +
864                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
865                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
866                         flags |= MLX5_FLOW_ACTION_PORT_ID;
867                         break;
868                 case RTE_FLOW_ACTION_TYPE_DROP:
869                         size += SZ_NLATTR_NEST + /* na_act_index. */
870                                 SZ_NLATTR_STRZ_OF("gact") +
871                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
872                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
873                         flags |= MLX5_FLOW_ACTION_DROP;
874                         break;
875                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
876                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
877                         goto action_of_vlan;
878                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
879                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
880                         goto action_of_vlan;
881                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
882                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
883                         goto action_of_vlan;
884                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
885                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
886                         goto action_of_vlan;
887 action_of_vlan:
888                         size += SZ_NLATTR_NEST + /* na_act_index. */
889                                 SZ_NLATTR_STRZ_OF("vlan") +
890                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
891                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
892                                 SZ_NLATTR_TYPE_OF(uint16_t) +
893                                 /* VLAN protocol. */
894                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
895                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
896                         break;
897                 default:
898                         DRV_LOG(WARNING,
899                                 "unsupported action %p type %d,"
900                                 " items must be validated before flow creation",
901                                 (const void *)actions, actions->type);
902                         break;
903                 }
904         }
905         *action_flags = flags;
906         return size;
907 }
908
909 /**
910  * Brand rtnetlink buffer with unique handle.
911  *
912  * This handle should be unique for a given network interface to avoid
913  * collisions.
914  *
915  * @param nlh
916  *   Pointer to Netlink message.
917  * @param handle
918  *   Unique 32-bit handle to use.
919  */
920 static void
921 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
922 {
923         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
924
925         tcm->tcm_handle = handle;
926         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
927                 (void *)nlh, handle);
928 }
929
930 /**
931  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
932  * memory required, allocates the memory, initializes Netlink message headers
933  * and set unique TC message handle.
934  *
935  * @param[in] attr
936  *   Pointer to the flow attributes.
937  * @param[in] items
938  *   Pointer to the list of items.
939  * @param[in] actions
940  *   Pointer to the list of actions.
941  * @param[out] item_flags
942  *   Pointer to bit mask of all items detected.
943  * @param[out] action_flags
944  *   Pointer to bit mask of all actions detected.
945  * @param[out] error
946  *   Pointer to the error structure.
947  *
948  * @return
949  *   Pointer to mlx5_flow object on success,
950  *   otherwise NULL and rte_ernno is set.
951  */
952 static struct mlx5_flow *
953 flow_tcf_prepare(const struct rte_flow_attr *attr __rte_unused,
954                  const struct rte_flow_item items[],
955                  const struct rte_flow_action actions[],
956                  uint64_t *item_flags, uint64_t *action_flags,
957                  struct rte_flow_error *error)
958 {
959         size_t size = sizeof(struct mlx5_flow) +
960                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
961                       MNL_ALIGN(sizeof(struct tcmsg));
962         struct mlx5_flow *dev_flow;
963         struct nlmsghdr *nlh;
964         struct tcmsg *tcm;
965
966         size += flow_tcf_get_items_and_size(items, item_flags);
967         size += flow_tcf_get_actions_and_size(actions, action_flags);
968         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
969         if (!dev_flow) {
970                 rte_flow_error_set(error, ENOMEM,
971                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
972                                    "not enough memory to create E-Switch flow");
973                 return NULL;
974         }
975         nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
976         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
977         *dev_flow = (struct mlx5_flow){
978                 .tcf = (struct mlx5_flow_tcf){
979                         .nlh = nlh,
980                         .tcm = tcm,
981                 },
982         };
983         /*
984          * Generate a reasonably unique handle based on the address of the
985          * target buffer.
986          *
987          * This is straightforward on 32-bit systems where the flow pointer can
988          * be used directly. Otherwise, its least significant part is taken
989          * after shifting it by the previous power of two of the pointed buffer
990          * size.
991          */
992         if (sizeof(dev_flow) <= 4)
993                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
994         else
995                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
996                                        rte_log2_u32(rte_align32prevpow2(size)));
997         return dev_flow;
998 }
999
1000 /**
1001  * Translate flow for Linux TC flower and construct Netlink message.
1002  *
1003  * @param[in] dev
1004  *   Pointer to the Ethernet device structure.
1005  * @param[in, out] dev_flow
1006  *   Pointer to the sub flow.
1007  * @param[in] attr
1008  *   Pointer to the flow attributes.
1009  * @param[in] items
1010  *   Pointer to the list of items.
1011  * @param[in] actions
1012  *   Pointer to the list of actions.
1013  * @param[out] error
1014  *   Pointer to the error structure.
1015  *
1016  * @return
1017  *   0 on success, a negative errno value otherwise and rte_errno is set.
1018  */
1019 static int
1020 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
1021                    const struct rte_flow_attr *attr,
1022                    const struct rte_flow_item items[],
1023                    const struct rte_flow_action actions[],
1024                    struct rte_flow_error *error)
1025 {
1026         union {
1027                 const struct rte_flow_item_port_id *port_id;
1028                 const struct rte_flow_item_eth *eth;
1029                 const struct rte_flow_item_vlan *vlan;
1030                 const struct rte_flow_item_ipv4 *ipv4;
1031                 const struct rte_flow_item_ipv6 *ipv6;
1032                 const struct rte_flow_item_tcp *tcp;
1033                 const struct rte_flow_item_udp *udp;
1034         } spec, mask;
1035         union {
1036                 const struct rte_flow_action_port_id *port_id;
1037                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1038                 const struct rte_flow_action_of_set_vlan_vid *
1039                         of_set_vlan_vid;
1040                 const struct rte_flow_action_of_set_vlan_pcp *
1041                         of_set_vlan_pcp;
1042         } conf;
1043         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1044         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
1045         struct tcmsg *tcm = dev_flow->tcf.tcm;
1046         uint32_t na_act_index_cur;
1047         bool eth_type_set = 0;
1048         bool vlan_present = 0;
1049         bool vlan_eth_type_set = 0;
1050         bool ip_proto_set = 0;
1051         struct nlattr *na_flower;
1052         struct nlattr *na_flower_act;
1053         struct nlattr *na_vlan_id = NULL;
1054         struct nlattr *na_vlan_priority = NULL;
1055
1056         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1057                                                 PTOI_TABLE_SZ_MAX(dev)));
1058         nlh = dev_flow->tcf.nlh;
1059         tcm = dev_flow->tcf.tcm;
1060         /* Prepare API must have been called beforehand. */
1061         assert(nlh != NULL && tcm != NULL);
1062         tcm->tcm_family = AF_UNSPEC;
1063         tcm->tcm_ifindex = ptoi[0].ifindex;
1064         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
1065         /*
1066          * Priority cannot be zero to prevent the kernel from picking one
1067          * automatically.
1068          */
1069         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
1070                                   RTE_BE16(ETH_P_ALL));
1071         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
1072         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
1073         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, TCA_CLS_FLAGS_SKIP_SW);
1074         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1075                 unsigned int i;
1076
1077                 switch (items->type) {
1078                 case RTE_FLOW_ITEM_TYPE_VOID:
1079                         break;
1080                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1081                         mask.port_id = flow_tcf_item_mask
1082                                 (items, &rte_flow_item_port_id_mask,
1083                                  &flow_tcf_mask_supported.port_id,
1084                                  &flow_tcf_mask_empty.port_id,
1085                                  sizeof(flow_tcf_mask_supported.port_id),
1086                                  error);
1087                         assert(mask.port_id);
1088                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
1089                                 break;
1090                         spec.port_id = items->spec;
1091                         if (!mask.port_id->id)
1092                                 i = 0;
1093                         else
1094                                 for (i = 0; ptoi[i].ifindex; ++i)
1095                                         if (ptoi[i].port_id == spec.port_id->id)
1096                                                 break;
1097                         assert(ptoi[i].ifindex);
1098                         tcm->tcm_ifindex = ptoi[i].ifindex;
1099                         break;
1100                 case RTE_FLOW_ITEM_TYPE_ETH:
1101                         mask.eth = flow_tcf_item_mask
1102                                 (items, &rte_flow_item_eth_mask,
1103                                  &flow_tcf_mask_supported.eth,
1104                                  &flow_tcf_mask_empty.eth,
1105                                  sizeof(flow_tcf_mask_supported.eth),
1106                                  error);
1107                         assert(mask.eth);
1108                         if (mask.eth == &flow_tcf_mask_empty.eth)
1109                                 break;
1110                         spec.eth = items->spec;
1111                         if (mask.eth->type) {
1112                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
1113                                                  spec.eth->type);
1114                                 eth_type_set = 1;
1115                         }
1116                         if (!is_zero_ether_addr(&mask.eth->dst)) {
1117                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
1118                                              ETHER_ADDR_LEN,
1119                                              spec.eth->dst.addr_bytes);
1120                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
1121                                              ETHER_ADDR_LEN,
1122                                              mask.eth->dst.addr_bytes);
1123                         }
1124                         if (!is_zero_ether_addr(&mask.eth->src)) {
1125                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
1126                                              ETHER_ADDR_LEN,
1127                                              spec.eth->src.addr_bytes);
1128                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
1129                                              ETHER_ADDR_LEN,
1130                                              mask.eth->src.addr_bytes);
1131                         }
1132                         break;
1133                 case RTE_FLOW_ITEM_TYPE_VLAN:
1134                         mask.vlan = flow_tcf_item_mask
1135                                 (items, &rte_flow_item_vlan_mask,
1136                                  &flow_tcf_mask_supported.vlan,
1137                                  &flow_tcf_mask_empty.vlan,
1138                                  sizeof(flow_tcf_mask_supported.vlan),
1139                                  error);
1140                         assert(mask.vlan);
1141                         if (!eth_type_set)
1142                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
1143                                                  RTE_BE16(ETH_P_8021Q));
1144                         eth_type_set = 1;
1145                         vlan_present = 1;
1146                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
1147                                 break;
1148                         spec.vlan = items->spec;
1149                         if (mask.vlan->inner_type) {
1150                                 mnl_attr_put_u16(nlh,
1151                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
1152                                                  spec.vlan->inner_type);
1153                                 vlan_eth_type_set = 1;
1154                         }
1155                         if (mask.vlan->tci & RTE_BE16(0xe000))
1156                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
1157                                                 (rte_be_to_cpu_16
1158                                                  (spec.vlan->tci) >> 13) & 0x7);
1159                         if (mask.vlan->tci & RTE_BE16(0x0fff))
1160                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
1161                                                  rte_be_to_cpu_16
1162                                                  (spec.vlan->tci &
1163                                                   RTE_BE16(0x0fff)));
1164                         break;
1165                 case RTE_FLOW_ITEM_TYPE_IPV4:
1166                         mask.ipv4 = flow_tcf_item_mask
1167                                 (items, &rte_flow_item_ipv4_mask,
1168                                  &flow_tcf_mask_supported.ipv4,
1169                                  &flow_tcf_mask_empty.ipv4,
1170                                  sizeof(flow_tcf_mask_supported.ipv4),
1171                                  error);
1172                         assert(mask.ipv4);
1173                         if (!eth_type_set || !vlan_eth_type_set)
1174                                 mnl_attr_put_u16(nlh,
1175                                                  vlan_present ?
1176                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
1177                                                  TCA_FLOWER_KEY_ETH_TYPE,
1178                                                  RTE_BE16(ETH_P_IP));
1179                         eth_type_set = 1;
1180                         vlan_eth_type_set = 1;
1181                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
1182                                 break;
1183                         spec.ipv4 = items->spec;
1184                         if (mask.ipv4->hdr.next_proto_id) {
1185                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
1186                                                 spec.ipv4->hdr.next_proto_id);
1187                                 ip_proto_set = 1;
1188                         }
1189                         if (mask.ipv4->hdr.src_addr) {
1190                                 mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_SRC,
1191                                                  spec.ipv4->hdr.src_addr);
1192                                 mnl_attr_put_u32(nlh,
1193                                                  TCA_FLOWER_KEY_IPV4_SRC_MASK,
1194                                                  mask.ipv4->hdr.src_addr);
1195                         }
1196                         if (mask.ipv4->hdr.dst_addr) {
1197                                 mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_DST,
1198                                                  spec.ipv4->hdr.dst_addr);
1199                                 mnl_attr_put_u32(nlh,
1200                                                  TCA_FLOWER_KEY_IPV4_DST_MASK,
1201                                                  mask.ipv4->hdr.dst_addr);
1202                         }
1203                         break;
1204                 case RTE_FLOW_ITEM_TYPE_IPV6:
1205                         mask.ipv6 = flow_tcf_item_mask
1206                                 (items, &rte_flow_item_ipv6_mask,
1207                                  &flow_tcf_mask_supported.ipv6,
1208                                  &flow_tcf_mask_empty.ipv6,
1209                                  sizeof(flow_tcf_mask_supported.ipv6),
1210                                  error);
1211                         assert(mask.ipv6);
1212                         if (!eth_type_set || !vlan_eth_type_set)
1213                                 mnl_attr_put_u16(nlh,
1214                                                  vlan_present ?
1215                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
1216                                                  TCA_FLOWER_KEY_ETH_TYPE,
1217                                                  RTE_BE16(ETH_P_IPV6));
1218                         eth_type_set = 1;
1219                         vlan_eth_type_set = 1;
1220                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
1221                                 break;
1222                         spec.ipv6 = items->spec;
1223                         if (mask.ipv6->hdr.proto) {
1224                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
1225                                                 spec.ipv6->hdr.proto);
1226                                 ip_proto_set = 1;
1227                         }
1228                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
1229                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC,
1230                                              sizeof(spec.ipv6->hdr.src_addr),
1231                                              spec.ipv6->hdr.src_addr);
1232                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
1233                                              sizeof(mask.ipv6->hdr.src_addr),
1234                                              mask.ipv6->hdr.src_addr);
1235                         }
1236                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
1237                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST,
1238                                              sizeof(spec.ipv6->hdr.dst_addr),
1239                                              spec.ipv6->hdr.dst_addr);
1240                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST_MASK,
1241                                              sizeof(mask.ipv6->hdr.dst_addr),
1242                                              mask.ipv6->hdr.dst_addr);
1243                         }
1244                         break;
1245                 case RTE_FLOW_ITEM_TYPE_UDP:
1246                         mask.udp = flow_tcf_item_mask
1247                                 (items, &rte_flow_item_udp_mask,
1248                                  &flow_tcf_mask_supported.udp,
1249                                  &flow_tcf_mask_empty.udp,
1250                                  sizeof(flow_tcf_mask_supported.udp),
1251                                  error);
1252                         assert(mask.udp);
1253                         if (!ip_proto_set)
1254                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
1255                                                 IPPROTO_UDP);
1256                         if (mask.udp == &flow_tcf_mask_empty.udp)
1257                                 break;
1258                         spec.udp = items->spec;
1259                         if (mask.udp->hdr.src_port) {
1260                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_SRC,
1261                                                  spec.udp->hdr.src_port);
1262                                 mnl_attr_put_u16(nlh,
1263                                                  TCA_FLOWER_KEY_UDP_SRC_MASK,
1264                                                  mask.udp->hdr.src_port);
1265                         }
1266                         if (mask.udp->hdr.dst_port) {
1267                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_DST,
1268                                                  spec.udp->hdr.dst_port);
1269                                 mnl_attr_put_u16(nlh,
1270                                                  TCA_FLOWER_KEY_UDP_DST_MASK,
1271                                                  mask.udp->hdr.dst_port);
1272                         }
1273                         break;
1274                 case RTE_FLOW_ITEM_TYPE_TCP:
1275                         mask.tcp = flow_tcf_item_mask
1276                                 (items, &rte_flow_item_tcp_mask,
1277                                  &flow_tcf_mask_supported.tcp,
1278                                  &flow_tcf_mask_empty.tcp,
1279                                  sizeof(flow_tcf_mask_supported.tcp),
1280                                  error);
1281                         assert(mask.tcp);
1282                         if (!ip_proto_set)
1283                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
1284                                                 IPPROTO_TCP);
1285                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
1286                                 break;
1287                         spec.tcp = items->spec;
1288                         if (mask.tcp->hdr.src_port) {
1289                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
1290                                                  spec.tcp->hdr.src_port);
1291                                 mnl_attr_put_u16(nlh,
1292                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
1293                                                  mask.tcp->hdr.src_port);
1294                         }
1295                         if (mask.tcp->hdr.dst_port) {
1296                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
1297                                                  spec.tcp->hdr.dst_port);
1298                                 mnl_attr_put_u16(nlh,
1299                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
1300                                                  mask.tcp->hdr.dst_port);
1301                         }
1302                         if (mask.tcp->hdr.tcp_flags) {
1303                                 mnl_attr_put_u16
1304                                         (nlh,
1305                                          TCA_FLOWER_KEY_TCP_FLAGS,
1306                                          rte_cpu_to_be_16
1307                                                 (spec.tcp->hdr.tcp_flags));
1308                                 mnl_attr_put_u16
1309                                         (nlh,
1310                                          TCA_FLOWER_KEY_TCP_FLAGS_MASK,
1311                                          rte_cpu_to_be_16
1312                                                 (mask.tcp->hdr.tcp_flags));
1313                         }
1314                         break;
1315                 default:
1316                         return rte_flow_error_set(error, ENOTSUP,
1317                                                   RTE_FLOW_ERROR_TYPE_ITEM,
1318                                                   NULL, "item not supported");
1319                 }
1320         }
1321         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
1322         na_act_index_cur = 1;
1323         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1324                 struct nlattr *na_act_index;
1325                 struct nlattr *na_act;
1326                 unsigned int vlan_act;
1327                 unsigned int i;
1328
1329                 switch (actions->type) {
1330                 case RTE_FLOW_ACTION_TYPE_VOID:
1331                         break;
1332                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1333                         conf.port_id = actions->conf;
1334                         if (conf.port_id->original)
1335                                 i = 0;
1336                         else
1337                                 for (i = 0; ptoi[i].ifindex; ++i)
1338                                         if (ptoi[i].port_id == conf.port_id->id)
1339                                                 break;
1340                         assert(ptoi[i].ifindex);
1341                         na_act_index =
1342                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
1343                         assert(na_act_index);
1344                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
1345                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
1346                         assert(na_act);
1347                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
1348                                      sizeof(struct tc_mirred),
1349                                      &(struct tc_mirred){
1350                                         .action = TC_ACT_STOLEN,
1351                                         .eaction = TCA_EGRESS_REDIR,
1352                                         .ifindex = ptoi[i].ifindex,
1353                                      });
1354                         mnl_attr_nest_end(nlh, na_act);
1355                         mnl_attr_nest_end(nlh, na_act_index);
1356                         break;
1357                 case RTE_FLOW_ACTION_TYPE_DROP:
1358                         na_act_index =
1359                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
1360                         assert(na_act_index);
1361                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
1362                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
1363                         assert(na_act);
1364                         mnl_attr_put(nlh, TCA_GACT_PARMS,
1365                                      sizeof(struct tc_gact),
1366                                      &(struct tc_gact){
1367                                         .action = TC_ACT_SHOT,
1368                                      });
1369                         mnl_attr_nest_end(nlh, na_act);
1370                         mnl_attr_nest_end(nlh, na_act_index);
1371                         break;
1372                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1373                         conf.of_push_vlan = NULL;
1374                         vlan_act = TCA_VLAN_ACT_POP;
1375                         goto action_of_vlan;
1376                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
1377                         conf.of_push_vlan = actions->conf;
1378                         vlan_act = TCA_VLAN_ACT_PUSH;
1379                         goto action_of_vlan;
1380                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1381                         conf.of_set_vlan_vid = actions->conf;
1382                         if (na_vlan_id)
1383                                 goto override_na_vlan_id;
1384                         vlan_act = TCA_VLAN_ACT_MODIFY;
1385                         goto action_of_vlan;
1386                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1387                         conf.of_set_vlan_pcp = actions->conf;
1388                         if (na_vlan_priority)
1389                                 goto override_na_vlan_priority;
1390                         vlan_act = TCA_VLAN_ACT_MODIFY;
1391                         goto action_of_vlan;
1392 action_of_vlan:
1393                         na_act_index =
1394                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
1395                         assert(na_act_index);
1396                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
1397                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
1398                         assert(na_act);
1399                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
1400                                      sizeof(struct tc_vlan),
1401                                      &(struct tc_vlan){
1402                                         .action = TC_ACT_PIPE,
1403                                         .v_action = vlan_act,
1404                                      });
1405                         if (vlan_act == TCA_VLAN_ACT_POP) {
1406                                 mnl_attr_nest_end(nlh, na_act);
1407                                 mnl_attr_nest_end(nlh, na_act_index);
1408                                 break;
1409                         }
1410                         if (vlan_act == TCA_VLAN_ACT_PUSH)
1411                                 mnl_attr_put_u16(nlh,
1412                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
1413                                                  conf.of_push_vlan->ethertype);
1414                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
1415                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
1416                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
1417                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
1418                         mnl_attr_nest_end(nlh, na_act);
1419                         mnl_attr_nest_end(nlh, na_act_index);
1420                         if (actions->type ==
1421                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
1422 override_na_vlan_id:
1423                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
1424                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
1425                                         rte_be_to_cpu_16
1426                                         (conf.of_set_vlan_vid->vlan_vid);
1427                         } else if (actions->type ==
1428                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
1429 override_na_vlan_priority:
1430                                 na_vlan_priority->nla_type =
1431                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
1432                                 *(uint8_t *)mnl_attr_get_payload
1433                                         (na_vlan_priority) =
1434                                         conf.of_set_vlan_pcp->vlan_pcp;
1435                         }
1436                         break;
1437                 default:
1438                         return rte_flow_error_set(error, ENOTSUP,
1439                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1440                                                   actions,
1441                                                   "action not supported");
1442                 }
1443         }
1444         assert(na_flower);
1445         assert(na_flower_act);
1446         mnl_attr_nest_end(nlh, na_flower_act);
1447         mnl_attr_nest_end(nlh, na_flower);
1448         return 0;
1449 }
1450
1451 /**
1452  * Send Netlink message with acknowledgment.
1453  *
1454  * @param nl
1455  *   Libmnl socket to use.
1456  * @param nlh
1457  *   Message to send. This function always raises the NLM_F_ACK flag before
1458  *   sending.
1459  *
1460  * @return
1461  *   0 on success, a negative errno value otherwise and rte_errno is set.
1462  */
static int
flow_tcf_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
{
	/*
	 * Buffer for the kernel reply. It must hold an nlmsgerr header plus
	 * a copy of the request payload, since with NLM_F_ACK the kernel
	 * echoes the offending request back inside the error message.
	 */
	alignas(struct nlmsghdr)
	uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
		    nlh->nlmsg_len - sizeof(*nlh)];
	/* Random sequence number used to match the reply to this request. */
	uint32_t seq = random();
	int ret;

	nlh->nlmsg_flags |= NLM_F_ACK;
	nlh->nlmsg_seq = seq;
	ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
	if (ret != -1)
		ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
	if (ret != -1)
		/* Parse the reply; a kernel NACK makes this return <= 0. */
		ret = mnl_cb_run
			(ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
	if (ret > 0)
		return 0;
	/* NOTE(review): relies on errno having been set by whichever call
	 * above failed (send, receive or mnl_cb_run decoding the kernel
	 * error) — confirm libmnl sets errno on all these paths. */
	rte_errno = errno;
	return -rte_errno;
}
1485
1486 /**
1487  * Apply flow to E-Switch by sending Netlink message.
1488  *
1489  * @param[in] dev
1490  *   Pointer to Ethernet device.
1491  * @param[in, out] flow
1492  *   Pointer to the sub flow.
1493  * @param[out] error
1494  *   Pointer to the error structure.
1495  *
1496  * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
1498  */
1499 static int
1500 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
1501                struct rte_flow_error *error)
1502 {
1503         struct priv *priv = dev->data->dev_private;
1504         struct mnl_socket *nl = priv->mnl_socket;
1505         struct mlx5_flow *dev_flow;
1506         struct nlmsghdr *nlh;
1507
1508         dev_flow = LIST_FIRST(&flow->dev_flows);
1509         /* E-Switch flow can't be expanded. */
1510         assert(!LIST_NEXT(dev_flow, next));
1511         nlh = dev_flow->tcf.nlh;
1512         nlh->nlmsg_type = RTM_NEWTFILTER;
1513         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
1514         if (!flow_tcf_nl_ack(nl, nlh))
1515                 return 0;
1516         return rte_flow_error_set(error, rte_errno,
1517                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1518                                   "netlink: failed to create TC flow rule");
1519 }
1520
1521 /**
1522  * Remove flow from E-Switch by sending Netlink message.
1523  *
1524  * @param[in] dev
1525  *   Pointer to Ethernet device.
1526  * @param[in, out] flow
1527  *   Pointer to the sub flow.
1528  */
1529 static void
1530 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
1531 {
1532         struct priv *priv = dev->data->dev_private;
1533         struct mnl_socket *nl = priv->mnl_socket;
1534         struct mlx5_flow *dev_flow;
1535         struct nlmsghdr *nlh;
1536
1537         if (!flow)
1538                 return;
1539         dev_flow = LIST_FIRST(&flow->dev_flows);
1540         if (!dev_flow)
1541                 return;
1542         /* E-Switch flow can't be expanded. */
1543         assert(!LIST_NEXT(dev_flow, next));
1544         nlh = dev_flow->tcf.nlh;
1545         nlh->nlmsg_type = RTM_DELTFILTER;
1546         nlh->nlmsg_flags = NLM_F_REQUEST;
1547         flow_tcf_nl_ack(nl, nlh);
1548 }
1549
1550 /**
1551  * Remove flow from E-Switch and release resources of the device flow.
1552  *
1553  * @param[in] dev
1554  *   Pointer to Ethernet device.
1555  * @param[in, out] flow
1556  *   Pointer to the sub flow.
1557  */
1558 static void
1559 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
1560 {
1561         struct mlx5_flow *dev_flow;
1562
1563         if (!flow)
1564                 return;
1565         flow_tcf_remove(dev, flow);
1566         dev_flow = LIST_FIRST(&flow->dev_flows);
1567         if (!dev_flow)
1568                 return;
1569         /* E-Switch flow can't be expanded. */
1570         assert(!LIST_NEXT(dev_flow, next));
1571         LIST_REMOVE(dev_flow, next);
1572         rte_free(dev_flow);
1573 }
1574
/* E-Switch (TC flower over netlink) flow engine callbacks registered with
 * the generic mlx5 flow layer. */
const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
	.validate = flow_tcf_validate,
	.prepare = flow_tcf_prepare,
	.translate = flow_tcf_translate,
	.apply = flow_tcf_apply,
	.remove = flow_tcf_remove,
	.destroy = flow_tcf_destroy,
};
1583
1584 /**
1585  * Initialize ingress qdisc of a given network interface.
1586  *
1587  * @param nl
1588  *   Libmnl socket of the @p NETLINK_ROUTE kind.
1589  * @param ifindex
1590  *   Index of network interface to initialize.
1591  * @param[out] error
1592  *   Perform verbose error reporting if not NULL.
1593  *
1594  * @return
1595  *   0 on success, a negative errno value otherwise and rte_errno is set.
1596  */
int
mlx5_flow_tcf_init(struct mnl_socket *nl, unsigned int ifindex,
		   struct rte_flow_error *error)
{
	struct nlmsghdr *nlh;
	struct tcmsg *tcm;
	/* Shared scratch buffer, reused for both netlink requests below;
	 * 128 extra bytes leave room for the TCA_KIND attribute. */
	alignas(struct nlmsghdr)
	uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];

	/* Destroy existing ingress qdisc and everything attached to it. */
	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = RTM_DELQDISC;
	nlh->nlmsg_flags = NLM_F_REQUEST;
	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = ifindex;
	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
	tcm->tcm_parent = TC_H_INGRESS;
	/* Ignore errors when qdisc is already absent. */
	if (flow_tcf_nl_ack(nl, nlh) &&
	    rte_errno != EINVAL && rte_errno != ENOENT)
		return rte_flow_error_set(error, rte_errno,
					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
					  "netlink: failed to remove ingress"
					  " qdisc");
	/* Create fresh ingress qdisc. */
	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = RTM_NEWQDISC;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = ifindex;
	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
	tcm->tcm_parent = TC_H_INGRESS;
	mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
	if (flow_tcf_nl_ack(nl, nlh))
		return rte_flow_error_set(error, rte_errno,
					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
					  "netlink: failed to create ingress"
					  " qdisc");
	return 0;
}
1639
1640 /**
1641  * Create and configure a libmnl socket for Netlink flow rules.
1642  *
1643  * @return
1644  *   A valid libmnl socket object pointer on success, NULL otherwise and
1645  *   rte_errno is set.
1646  */
1647 struct mnl_socket *
1648 mlx5_flow_tcf_socket_create(void)
1649 {
1650         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
1651
1652         if (nl) {
1653                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
1654                                       sizeof(int));
1655                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
1656                         return nl;
1657         }
1658         rte_errno = errno;
1659         if (nl)
1660                 mnl_socket_close(nl);
1661         return NULL;
1662 }
1663
1664 /**
1665  * Destroy a libmnl socket.
1666  *
1667  * @param nl
1668  *   Libmnl socket of the @p NETLINK_ROUTE kind.
1669  */
void
mlx5_flow_tcf_socket_destroy(struct mnl_socket *nl)
{
	/* Releases the socket opened by mlx5_flow_tcf_socket_create(). */
	mnl_socket_close(nl);
}