net/mlx5: add missing VLAN action constraints
[dpdk.git] / drivers / net / mlx5 / mlx5_flow_tcf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018 6WIND S.A.
3  * Copyright 2018 Mellanox Technologies, Ltd
4  */
5
6 #include <assert.h>
7 #include <errno.h>
8 #include <libmnl/libmnl.h>
9 #include <linux/if_ether.h>
10 #include <linux/netlink.h>
11 #include <linux/pkt_cls.h>
12 #include <linux/pkt_sched.h>
13 #include <linux/rtnetlink.h>
14 #include <linux/tc_act/tc_gact.h>
15 #include <linux/tc_act/tc_mirred.h>
16 #include <netinet/in.h>
17 #include <stdalign.h>
18 #include <stdbool.h>
19 #include <stddef.h>
20 #include <stdint.h>
21 #include <stdlib.h>
22 #include <sys/socket.h>
23
24 #include <rte_byteorder.h>
25 #include <rte_errno.h>
26 #include <rte_ether.h>
27 #include <rte_flow.h>
28 #include <rte_malloc.h>
29
30 #include "mlx5.h"
31 #include "mlx5_flow.h"
32 #include "mlx5_autoconf.h"
33
#ifdef HAVE_TC_ACT_VLAN

#include <linux/tc_act/tc_vlan.h>

#else /* HAVE_TC_ACT_VLAN */

/*
 * Kernel headers lack TC VLAN action support: provide the attribute
 * values and structure normally found in linux/tc_act/tc_vlan.h.
 */
#define TCA_VLAN_ACT_POP 1
#define TCA_VLAN_ACT_PUSH 2
#define TCA_VLAN_ACT_MODIFY 3
#define TCA_VLAN_PARMS 2
#define TCA_VLAN_PUSH_VLAN_ID 3
#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
#define TCA_VLAN_PAD 5
#define TCA_VLAN_PUSH_VLAN_PRIORITY 6

/* Minimal substitute for struct tc_vlan from linux/tc_act/tc_vlan.h. */
struct tc_vlan {
	tc_gen; /* Generic TC action fields. */
	int v_action; /* One of the TCA_VLAN_ACT_* values above. */
};

#endif /* HAVE_TC_ACT_VLAN */
55
/* Normally found in linux/netlink.h. */
#ifndef NETLINK_CAP_ACK
#define NETLINK_CAP_ACK 10
#endif

/* Normally found in linux/pkt_sched.h. */
#ifndef TC_H_MIN_INGRESS
#define TC_H_MIN_INGRESS 0xfff2u
#endif

/* Normally found in linux/pkt_cls.h. */
#ifndef TCA_CLS_FLAGS_SKIP_SW
#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
#endif
/*
 * TCA_FLOWER_* attribute values used by this driver; each is defined
 * only when the installed kernel headers do not already provide it.
 * Numeric values mirror the kernel's flower enum in linux/pkt_cls.h.
 */
#ifndef HAVE_TCA_FLOWER_ACT
#define TCA_FLOWER_ACT 3
#endif
#ifndef HAVE_TCA_FLOWER_FLAGS
#define TCA_FLOWER_FLAGS 22
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
#define TCA_FLOWER_KEY_ETH_TYPE 8
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
#define TCA_FLOWER_KEY_ETH_DST 4
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
#define TCA_FLOWER_KEY_ETH_DST_MASK 5
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
#define TCA_FLOWER_KEY_ETH_SRC 6
#endif
#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
#define TCA_FLOWER_KEY_IP_PROTO 9
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
#define TCA_FLOWER_KEY_IPV4_SRC 10
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
#define TCA_FLOWER_KEY_IPV4_DST 12
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
#define TCA_FLOWER_KEY_IPV6_SRC 14
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
#define TCA_FLOWER_KEY_IPV6_DST 16
#endif
#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
#define TCA_FLOWER_KEY_TCP_SRC 18
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
#define TCA_FLOWER_KEY_TCP_DST 19
#endif
#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
#define TCA_FLOWER_KEY_TCP_DST_MASK 36
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
#define TCA_FLOWER_KEY_UDP_SRC 20
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
#define TCA_FLOWER_KEY_UDP_DST 21
#endif
#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
#define TCA_FLOWER_KEY_UDP_DST_MASK 38
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
#define TCA_FLOWER_KEY_VLAN_ID 23
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
#define TCA_FLOWER_KEY_VLAN_PRIO 24
#endif
#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
#endif

/* IPv6 address length in bytes. */
#ifndef IPV6_ADDR_LEN
#define IPV6_ADDR_LEN 16
#endif
155
/**
 * Empty masks for known item types.
 *
 * Static storage duration zero-initializes every member;
 * flow_tcf_item_mask() returns a pointer to the relevant member when an
 * item carries no specification at all.
 */
static const union {
	struct rte_flow_item_port_id port_id;
	struct rte_flow_item_eth eth;
	struct rte_flow_item_vlan vlan;
	struct rte_flow_item_ipv4 ipv4;
	struct rte_flow_item_ipv6 ipv6;
	struct rte_flow_item_tcp tcp;
	struct rte_flow_item_udp udp;
} flow_tcf_mask_empty;
166
167 /** Supported masks for known item types. */
168 static const struct {
169         struct rte_flow_item_port_id port_id;
170         struct rte_flow_item_eth eth;
171         struct rte_flow_item_vlan vlan;
172         struct rte_flow_item_ipv4 ipv4;
173         struct rte_flow_item_ipv6 ipv6;
174         struct rte_flow_item_tcp tcp;
175         struct rte_flow_item_udp udp;
176 } flow_tcf_mask_supported = {
177         .port_id = {
178                 .id = 0xffffffff,
179         },
180         .eth = {
181                 .type = RTE_BE16(0xffff),
182                 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
183                 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
184         },
185         .vlan = {
186                 /* PCP and VID only, no DEI. */
187                 .tci = RTE_BE16(0xefff),
188                 .inner_type = RTE_BE16(0xffff),
189         },
190         .ipv4.hdr = {
191                 .next_proto_id = 0xff,
192                 .src_addr = RTE_BE32(0xffffffff),
193                 .dst_addr = RTE_BE32(0xffffffff),
194         },
195         .ipv6.hdr = {
196                 .proto = 0xff,
197                 .src_addr =
198                         "\xff\xff\xff\xff\xff\xff\xff\xff"
199                         "\xff\xff\xff\xff\xff\xff\xff\xff",
200                 .dst_addr =
201                         "\xff\xff\xff\xff\xff\xff\xff\xff"
202                         "\xff\xff\xff\xff\xff\xff\xff\xff",
203         },
204         .tcp.hdr = {
205                 .src_port = RTE_BE16(0xffff),
206                 .dst_port = RTE_BE16(0xffff),
207         },
208         .udp.hdr = {
209                 .src_port = RTE_BE16(0xffff),
210                 .dst_port = RTE_BE16(0xffff),
211         },
212 };
213
/* Aligned size of a netlink attribute header. */
#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
/* A nesting attribute carries no payload of its own, only a header. */
#define SZ_NLATTR_NEST SZ_NLATTR_HDR
/* Aligned size of an attribute with a len-byte payload. */
#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
/* Aligned size of an attribute holding a value of type typ. */
#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
/* Aligned size of an attribute holding string str (incl. NUL byte). */
#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)

/*
 * Upper bound on flow_tcf_ptoi entries needed for a device: the ports
 * reported for its switch domain plus room for the device itself (when
 * no domain is present) and a terminating entry.
 */
#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)

/** DPDK port to network interface index (ifindex) conversion. */
struct flow_tcf_ptoi {
	uint16_t port_id; /**< DPDK port ID. */
	unsigned int ifindex; /**< Network interface index. */
};

/* Fate actions are mutually exclusive within a single flow rule. */
#define MLX5_TCF_FATE_ACTIONS (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID)
/* All VLAN manipulation actions handled by this driver. */
#define MLX5_TCF_VLAN_ACTIONS \
	(MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
	 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
232
233 /**
234  * Retrieve mask for pattern item.
235  *
236  * This function does basic sanity checks on a pattern item in order to
237  * return the most appropriate mask for it.
238  *
239  * @param[in] item
240  *   Item specification.
241  * @param[in] mask_default
242  *   Default mask for pattern item as specified by the flow API.
243  * @param[in] mask_supported
244  *   Mask fields supported by the implementation.
245  * @param[in] mask_empty
246  *   Empty mask to return when there is no specification.
247  * @param[out] error
248  *   Perform verbose error reporting if not NULL.
249  *
250  * @return
251  *   Either @p item->mask or one of the mask parameters on success, NULL
252  *   otherwise and rte_errno is set.
253  */
254 static const void *
255 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
256                    const void *mask_supported, const void *mask_empty,
257                    size_t mask_size, struct rte_flow_error *error)
258 {
259         const uint8_t *mask;
260         size_t i;
261
262         /* item->last and item->mask cannot exist without item->spec. */
263         if (!item->spec && (item->mask || item->last)) {
264                 rte_flow_error_set(error, EINVAL,
265                                    RTE_FLOW_ERROR_TYPE_ITEM, item,
266                                    "\"mask\" or \"last\" field provided without"
267                                    " a corresponding \"spec\"");
268                 return NULL;
269         }
270         /* No spec, no mask, no problem. */
271         if (!item->spec)
272                 return mask_empty;
273         mask = item->mask ? item->mask : mask_default;
274         assert(mask);
275         /*
276          * Single-pass check to make sure that:
277          * - Mask is supported, no bits are set outside mask_supported.
278          * - Both item->spec and item->last are included in mask.
279          */
280         for (i = 0; i != mask_size; ++i) {
281                 if (!mask[i])
282                         continue;
283                 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
284                     ((const uint8_t *)mask_supported)[i]) {
285                         rte_flow_error_set(error, ENOTSUP,
286                                            RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
287                                            "unsupported field found"
288                                            " in \"mask\"");
289                         return NULL;
290                 }
291                 if (item->last &&
292                     (((const uint8_t *)item->spec)[i] & mask[i]) !=
293                     (((const uint8_t *)item->last)[i] & mask[i])) {
294                         rte_flow_error_set(error, EINVAL,
295                                            RTE_FLOW_ERROR_TYPE_ITEM_LAST,
296                                            item->last,
297                                            "range between \"spec\" and \"last\""
298                                            " not comprised in \"mask\"");
299                         return NULL;
300                 }
301         }
302         return mask;
303 }
304
305 /**
306  * Build a conversion table between port ID and ifindex.
307  *
308  * @param[in] dev
309  *   Pointer to Ethernet device.
310  * @param[out] ptoi
311  *   Pointer to ptoi table.
312  * @param[in] len
313  *   Size of ptoi table provided.
314  *
315  * @return
316  *   Size of ptoi table filled.
317  */
318 static unsigned int
319 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
320                           unsigned int len)
321 {
322         unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
323         uint16_t port_id[n + 1];
324         unsigned int i;
325         unsigned int own = 0;
326
327         /* At least one port is needed when no switch domain is present. */
328         if (!n) {
329                 n = 1;
330                 port_id[0] = dev->data->port_id;
331         } else {
332                 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
333         }
334         if (n > len)
335                 return 0;
336         for (i = 0; i != n; ++i) {
337                 struct rte_eth_dev_info dev_info;
338
339                 rte_eth_dev_info_get(port_id[i], &dev_info);
340                 if (port_id[i] == dev->data->port_id)
341                         own = i;
342                 ptoi[i].port_id = port_id[i];
343                 ptoi[i].ifindex = dev_info.if_index;
344         }
345         /* Ensure first entry of ptoi[] is the current device. */
346         if (own) {
347                 ptoi[n] = ptoi[0];
348                 ptoi[0] = ptoi[own];
349                 ptoi[own] = ptoi[n];
350         }
351         /* An entry with zero ifindex terminates ptoi[]. */
352         ptoi[n].port_id = 0;
353         ptoi[n].ifindex = 0;
354         return n;
355 }
356
357 /**
358  * Verify the @p attr will be correctly understood by the E-switch.
359  *
360  * @param[in] attr
361  *   Pointer to flow attributes
362  * @param[out] error
363  *   Pointer to error structure.
364  *
365  * @return
366  *   0 on success, a negative errno value otherwise and rte_errno is set.
367  */
368 static int
369 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
370                              struct rte_flow_error *error)
371 {
372         /*
373          * Supported attributes: no groups, some priorities and ingress only.
374          * Don't care about transfer as it is the caller's problem.
375          */
376         if (attr->group)
377                 return rte_flow_error_set(error, ENOTSUP,
378                                           RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
379                                           "groups are not supported");
380         if (attr->priority > 0xfffe)
381                 return rte_flow_error_set(error, ENOTSUP,
382                                           RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
383                                           attr,
384                                           "lowest priority level is 0xfffe");
385         if (!attr->ingress)
386                 return rte_flow_error_set(error, EINVAL,
387                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
388                                           attr, "only ingress is supported");
389         if (attr->egress)
390                 return rte_flow_error_set(error, ENOTSUP,
391                                           RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
392                                           attr, "egress is not supported");
393         return 0;
394 }
395
396 /**
397  * Validate flow for E-Switch.
398  *
399  * @param[in] priv
400  *   Pointer to the priv structure.
401  * @param[in] attr
402  *   Pointer to the flow attributes.
403  * @param[in] items
404  *   Pointer to the list of items.
405  * @param[in] actions
406  *   Pointer to the list of actions.
407  * @param[out] error
408  *   Pointer to the error structure.
409  *
410  * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
412  */
413 static int
414 flow_tcf_validate(struct rte_eth_dev *dev,
415                   const struct rte_flow_attr *attr,
416                   const struct rte_flow_item items[],
417                   const struct rte_flow_action actions[],
418                   struct rte_flow_error *error)
419 {
420         union {
421                 const struct rte_flow_item_port_id *port_id;
422                 const struct rte_flow_item_eth *eth;
423                 const struct rte_flow_item_vlan *vlan;
424                 const struct rte_flow_item_ipv4 *ipv4;
425                 const struct rte_flow_item_ipv6 *ipv6;
426                 const struct rte_flow_item_tcp *tcp;
427                 const struct rte_flow_item_udp *udp;
428         } spec, mask;
429         union {
430                 const struct rte_flow_action_port_id *port_id;
431                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
432                 const struct rte_flow_action_of_set_vlan_vid *
433                         of_set_vlan_vid;
434                 const struct rte_flow_action_of_set_vlan_pcp *
435                         of_set_vlan_pcp;
436         } conf;
437         uint32_t item_flags = 0;
438         uint32_t action_flags = 0;
439         uint8_t next_protocol = -1;
440         unsigned int tcm_ifindex = 0;
441         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
442         struct rte_eth_dev *port_id_dev = NULL;
443         bool in_port_id_set;
444         int ret;
445
446         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
447                                                 PTOI_TABLE_SZ_MAX(dev)));
448         ret = flow_tcf_validate_attributes(attr, error);
449         if (ret < 0)
450                 return ret;
451         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
452                 unsigned int i;
453
454                 switch (items->type) {
455                 case RTE_FLOW_ITEM_TYPE_VOID:
456                         break;
457                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
458                         mask.port_id = flow_tcf_item_mask
459                                 (items, &rte_flow_item_port_id_mask,
460                                  &flow_tcf_mask_supported.port_id,
461                                  &flow_tcf_mask_empty.port_id,
462                                  sizeof(flow_tcf_mask_supported.port_id),
463                                  error);
464                         if (!mask.port_id)
465                                 return -rte_errno;
466                         if (mask.port_id == &flow_tcf_mask_empty.port_id) {
467                                 in_port_id_set = 1;
468                                 break;
469                         }
470                         spec.port_id = items->spec;
471                         if (mask.port_id->id && mask.port_id->id != 0xffffffff)
472                                 return rte_flow_error_set
473                                         (error, ENOTSUP,
474                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
475                                          mask.port_id,
476                                          "no support for partial mask on"
477                                          " \"id\" field");
478                         if (!mask.port_id->id)
479                                 i = 0;
480                         else
481                                 for (i = 0; ptoi[i].ifindex; ++i)
482                                         if (ptoi[i].port_id == spec.port_id->id)
483                                                 break;
484                         if (!ptoi[i].ifindex)
485                                 return rte_flow_error_set
486                                         (error, ENODEV,
487                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
488                                          spec.port_id,
489                                          "missing data to convert port ID to"
490                                          " ifindex");
491                         if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
492                                 return rte_flow_error_set
493                                         (error, ENOTSUP,
494                                          RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
495                                          spec.port_id,
496                                          "cannot match traffic for"
497                                          " several port IDs through"
498                                          " a single flow rule");
499                         tcm_ifindex = ptoi[i].ifindex;
500                         in_port_id_set = 1;
501                         break;
502                 case RTE_FLOW_ITEM_TYPE_ETH:
503                         ret = mlx5_flow_validate_item_eth(items, item_flags,
504                                                           error);
505                         if (ret < 0)
506                                 return ret;
507                         item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
508                         /* TODO:
509                          * Redundant check due to different supported mask.
510                          * Same for the rest of items.
511                          */
512                         mask.eth = flow_tcf_item_mask
513                                 (items, &rte_flow_item_eth_mask,
514                                  &flow_tcf_mask_supported.eth,
515                                  &flow_tcf_mask_empty.eth,
516                                  sizeof(flow_tcf_mask_supported.eth),
517                                  error);
518                         if (!mask.eth)
519                                 return -rte_errno;
520                         if (mask.eth->type && mask.eth->type !=
521                             RTE_BE16(0xffff))
522                                 return rte_flow_error_set
523                                         (error, ENOTSUP,
524                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
525                                          mask.eth,
526                                          "no support for partial mask on"
527                                          " \"type\" field");
528                         break;
529                 case RTE_FLOW_ITEM_TYPE_VLAN:
530                         ret = mlx5_flow_validate_item_vlan(items, item_flags,
531                                                            error);
532                         if (ret < 0)
533                                 return ret;
534                         item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
535                         mask.vlan = flow_tcf_item_mask
536                                 (items, &rte_flow_item_vlan_mask,
537                                  &flow_tcf_mask_supported.vlan,
538                                  &flow_tcf_mask_empty.vlan,
539                                  sizeof(flow_tcf_mask_supported.vlan),
540                                  error);
541                         if (!mask.vlan)
542                                 return -rte_errno;
543                         if ((mask.vlan->tci & RTE_BE16(0xe000) &&
544                              (mask.vlan->tci & RTE_BE16(0xe000)) !=
545                               RTE_BE16(0xe000)) ||
546                             (mask.vlan->tci & RTE_BE16(0x0fff) &&
547                              (mask.vlan->tci & RTE_BE16(0x0fff)) !=
548                               RTE_BE16(0x0fff)) ||
549                             (mask.vlan->inner_type &&
550                              mask.vlan->inner_type != RTE_BE16(0xffff)))
551                                 return rte_flow_error_set
552                                         (error, ENOTSUP,
553                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
554                                          mask.vlan,
555                                          "no support for partial masks on"
556                                          " \"tci\" (PCP and VID parts) and"
557                                          " \"inner_type\" fields");
558                         break;
559                 case RTE_FLOW_ITEM_TYPE_IPV4:
560                         ret = mlx5_flow_validate_item_ipv4(items, item_flags,
561                                                            error);
562                         if (ret < 0)
563                                 return ret;
564                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
565                         mask.ipv4 = flow_tcf_item_mask
566                                 (items, &rte_flow_item_ipv4_mask,
567                                  &flow_tcf_mask_supported.ipv4,
568                                  &flow_tcf_mask_empty.ipv4,
569                                  sizeof(flow_tcf_mask_supported.ipv4),
570                                  error);
571                         if (!mask.ipv4)
572                                 return -rte_errno;
573                         if (mask.ipv4->hdr.next_proto_id &&
574                             mask.ipv4->hdr.next_proto_id != 0xff)
575                                 return rte_flow_error_set
576                                         (error, ENOTSUP,
577                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
578                                          mask.ipv4,
579                                          "no support for partial mask on"
580                                          " \"hdr.next_proto_id\" field");
581                         else if (mask.ipv4->hdr.next_proto_id)
582                                 next_protocol =
583                                         ((const struct rte_flow_item_ipv4 *)
584                                          (items->spec))->hdr.next_proto_id;
585                         break;
586                 case RTE_FLOW_ITEM_TYPE_IPV6:
587                         ret = mlx5_flow_validate_item_ipv6(items, item_flags,
588                                                            error);
589                         if (ret < 0)
590                                 return ret;
591                         item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
592                         mask.ipv6 = flow_tcf_item_mask
593                                 (items, &rte_flow_item_ipv6_mask,
594                                  &flow_tcf_mask_supported.ipv6,
595                                  &flow_tcf_mask_empty.ipv6,
596                                  sizeof(flow_tcf_mask_supported.ipv6),
597                                  error);
598                         if (!mask.ipv6)
599                                 return -rte_errno;
600                         if (mask.ipv6->hdr.proto &&
601                             mask.ipv6->hdr.proto != 0xff)
602                                 return rte_flow_error_set
603                                         (error, ENOTSUP,
604                                          RTE_FLOW_ERROR_TYPE_ITEM_MASK,
605                                          mask.ipv6,
606                                          "no support for partial mask on"
607                                          " \"hdr.proto\" field");
608                         else if (mask.ipv6->hdr.proto)
609                                 next_protocol =
610                                         ((const struct rte_flow_item_ipv6 *)
611                                          (items->spec))->hdr.proto;
612                         break;
613                 case RTE_FLOW_ITEM_TYPE_UDP:
614                         ret = mlx5_flow_validate_item_udp(items, item_flags,
615                                                           next_protocol, error);
616                         if (ret < 0)
617                                 return ret;
618                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
619                         mask.udp = flow_tcf_item_mask
620                                 (items, &rte_flow_item_udp_mask,
621                                  &flow_tcf_mask_supported.udp,
622                                  &flow_tcf_mask_empty.udp,
623                                  sizeof(flow_tcf_mask_supported.udp),
624                                  error);
625                         if (!mask.udp)
626                                 return -rte_errno;
627                         break;
628                 case RTE_FLOW_ITEM_TYPE_TCP:
629                         ret = mlx5_flow_validate_item_tcp(items, item_flags,
630                                                           next_protocol, error);
631                         if (ret < 0)
632                                 return ret;
633                         item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
634                         mask.tcp = flow_tcf_item_mask
635                                 (items, &rte_flow_item_tcp_mask,
636                                  &flow_tcf_mask_supported.tcp,
637                                  &flow_tcf_mask_empty.tcp,
638                                  sizeof(flow_tcf_mask_supported.tcp),
639                                  error);
640                         if (!mask.tcp)
641                                 return -rte_errno;
642                         break;
643                 default:
644                         return rte_flow_error_set(error, ENOTSUP,
645                                                   RTE_FLOW_ERROR_TYPE_ITEM,
646                                                   NULL, "item not supported");
647                 }
648         }
649         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
650                 unsigned int i;
651
652                 switch (actions->type) {
653                 case RTE_FLOW_ACTION_TYPE_VOID:
654                         break;
655                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
656                         if (action_flags & MLX5_TCF_FATE_ACTIONS)
657                                 return rte_flow_error_set
658                                         (error, EINVAL,
659                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
660                                          "can't have multiple fate actions");
661                         conf.port_id = actions->conf;
662                         if (conf.port_id->original)
663                                 i = 0;
664                         else
665                                 for (i = 0; ptoi[i].ifindex; ++i)
666                                         if (ptoi[i].port_id == conf.port_id->id)
667                                                 break;
668                         if (!ptoi[i].ifindex)
669                                 return rte_flow_error_set
670                                         (error, ENODEV,
671                                          RTE_FLOW_ERROR_TYPE_ACTION_CONF,
672                                          conf.port_id,
673                                          "missing data to convert port ID to"
674                                          " ifindex");
675                         action_flags |= MLX5_FLOW_ACTION_PORT_ID;
676                         port_id_dev = &rte_eth_devices[conf.port_id->id];
677                         break;
678                 case RTE_FLOW_ACTION_TYPE_DROP:
679                         if (action_flags & MLX5_TCF_FATE_ACTIONS)
680                                 return rte_flow_error_set
681                                         (error, EINVAL,
682                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
683                                          "can't have multiple fate actions");
684                         action_flags |= MLX5_FLOW_ACTION_DROP;
685                         break;
686                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
687                         action_flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
688                         break;
689                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
690                         action_flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
691                         break;
692                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
693                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
694                                 return rte_flow_error_set
695                                         (error, ENOTSUP,
696                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
697                                          "vlan modify is not supported,"
698                                          " set action must follow push action");
699                         action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
700                         break;
701                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
702                         if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
703                                 return rte_flow_error_set
704                                         (error, ENOTSUP,
705                                          RTE_FLOW_ERROR_TYPE_ACTION, actions,
706                                          "vlan modify is not supported,"
707                                          " set action must follow push action");
708                         action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
709                         break;
710                 default:
711                         return rte_flow_error_set(error, ENOTSUP,
712                                                   RTE_FLOW_ERROR_TYPE_ACTION,
713                                                   actions,
714                                                   "action not supported");
715                 }
716         }
717         /*
718          * FW syndrome (0xA9C090):
719          *     set_flow_table_entry: push vlan action fte in fdb can ONLY be
720          *     forward to the uplink.
721          */
722         if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
723             (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
724             ((struct priv *)port_id_dev->data->dev_private)->representor)
725                 return rte_flow_error_set(error, ENOTSUP,
726                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
727                                           "vlan push can only be applied"
728                                           " when forwarding to uplink port");
729         /*
730          * FW syndrome (0x294609):
731          *     set_flow_table_entry: modify/pop/push actions in fdb flow table
732          *     are supported only while forwarding to vport.
733          */
734         if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
735             !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
736                 return rte_flow_error_set(error, ENOTSUP,
737                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
738                                           "vlan actions are supported"
739                                           " only with port_id action");
740         if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
741                 return rte_flow_error_set(error, EINVAL,
742                                           RTE_FLOW_ERROR_TYPE_ACTION, actions,
743                                           "no fate action is found");
744         return 0;
745 }
746
747 /**
748  * Calculate maximum size of memory for flow items of Linux TC flower and
749  * extract specified items.
750  *
751  * @param[in] items
752  *   Pointer to the list of items.
753  * @param[out] item_flags
754  *   Pointer to the detected items.
755  *
756  * @return
757  *   Maximum size of memory for items.
758  */
759 static int
760 flow_tcf_get_items_and_size(const struct rte_flow_item items[],
761                             uint64_t *item_flags)
762 {
763         int size = 0;
764         uint64_t flags = 0;
765
766         size += SZ_NLATTR_STRZ_OF("flower") +
767                 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
768                 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
769         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
770                 switch (items->type) {
771                 case RTE_FLOW_ITEM_TYPE_VOID:
772                         break;
773                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
774                         break;
775                 case RTE_FLOW_ITEM_TYPE_ETH:
776                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
777                                 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
778                                 /* dst/src MAC addr and mask. */
779                         flags |= MLX5_FLOW_LAYER_OUTER_L2;
780                         break;
781                 case RTE_FLOW_ITEM_TYPE_VLAN:
782                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
783                                 SZ_NLATTR_TYPE_OF(uint16_t) +
784                                 /* VLAN Ether type. */
785                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
786                                 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
787                         flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
788                         break;
789                 case RTE_FLOW_ITEM_TYPE_IPV4:
790                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
791                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
792                                 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
793                                 /* dst/src IP addr and mask. */
794                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
795                         break;
796                 case RTE_FLOW_ITEM_TYPE_IPV6:
797                         size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
798                                 SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
799                                 SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 4;
800                                 /* dst/src IP addr and mask. */
801                         flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
802                         break;
803                 case RTE_FLOW_ITEM_TYPE_UDP:
804                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
805                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
806                                 /* dst/src port and mask. */
807                         flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
808                         break;
809                 case RTE_FLOW_ITEM_TYPE_TCP:
810                         size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
811                                 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
812                                 /* dst/src port and mask. */
813                         flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
814                         break;
815                 default:
816                         DRV_LOG(WARNING,
817                                 "unsupported item %p type %d,"
818                                 " items must be validated before flow creation",
819                                 (const void *)items, items->type);
820                         break;
821                 }
822         }
823         *item_flags = flags;
824         return size;
825 }
826
827 /**
828  * Calculate maximum size of memory for flow actions of Linux TC flower and
829  * extract specified actions.
830  *
831  * @param[in] actions
832  *   Pointer to the list of actions.
833  * @param[out] action_flags
834  *   Pointer to the detected actions.
835  *
836  * @return
837  *   Maximum size of memory for actions.
838  */
839 static int
840 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
841                               uint64_t *action_flags)
842 {
843         int size = 0;
844         uint64_t flags = 0;
845
846         size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
847         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
848                 switch (actions->type) {
849                 case RTE_FLOW_ACTION_TYPE_VOID:
850                         break;
851                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
852                         size += SZ_NLATTR_NEST + /* na_act_index. */
853                                 SZ_NLATTR_STRZ_OF("mirred") +
854                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
855                                 SZ_NLATTR_TYPE_OF(struct tc_mirred);
856                         flags |= MLX5_FLOW_ACTION_PORT_ID;
857                         break;
858                 case RTE_FLOW_ACTION_TYPE_DROP:
859                         size += SZ_NLATTR_NEST + /* na_act_index. */
860                                 SZ_NLATTR_STRZ_OF("gact") +
861                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
862                                 SZ_NLATTR_TYPE_OF(struct tc_gact);
863                         flags |= MLX5_FLOW_ACTION_DROP;
864                         break;
865                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
866                         flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
867                         goto action_of_vlan;
868                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
869                         flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
870                         goto action_of_vlan;
871                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
872                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
873                         goto action_of_vlan;
874                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
875                         flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
876                         goto action_of_vlan;
877 action_of_vlan:
878                         size += SZ_NLATTR_NEST + /* na_act_index. */
879                                 SZ_NLATTR_STRZ_OF("vlan") +
880                                 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
881                                 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
882                                 SZ_NLATTR_TYPE_OF(uint16_t) +
883                                 /* VLAN protocol. */
884                                 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
885                                 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
886                         break;
887                 default:
888                         DRV_LOG(WARNING,
889                                 "unsupported action %p type %d,"
890                                 " items must be validated before flow creation",
891                                 (const void *)actions, actions->type);
892                         break;
893                 }
894         }
895         *action_flags = flags;
896         return size;
897 }
898
899 /**
900  * Brand rtnetlink buffer with unique handle.
901  *
902  * This handle should be unique for a given network interface to avoid
903  * collisions.
904  *
905  * @param nlh
906  *   Pointer to Netlink message.
907  * @param handle
908  *   Unique 32-bit handle to use.
909  */
910 static void
911 flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
912 {
913         struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
914
915         tcm->tcm_handle = handle;
916         DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
917                 (void *)nlh, handle);
918 }
919
920 /**
921  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
922  * memory required, allocates the memory, initializes Netlink message headers
923  * and set unique TC message handle.
924  *
925  * @param[in] attr
926  *   Pointer to the flow attributes.
927  * @param[in] items
928  *   Pointer to the list of items.
929  * @param[in] actions
930  *   Pointer to the list of actions.
931  * @param[out] item_flags
932  *   Pointer to bit mask of all items detected.
933  * @param[out] action_flags
934  *   Pointer to bit mask of all actions detected.
935  * @param[out] error
936  *   Pointer to the error structure.
937  *
938  * @return
939  *   Pointer to mlx5_flow object on success,
 *   otherwise NULL and rte_errno is set.
941  */
942 static struct mlx5_flow *
943 flow_tcf_prepare(const struct rte_flow_attr *attr __rte_unused,
944                  const struct rte_flow_item items[],
945                  const struct rte_flow_action actions[],
946                  uint64_t *item_flags, uint64_t *action_flags,
947                  struct rte_flow_error *error)
948 {
949         size_t size = sizeof(struct mlx5_flow) +
950                       MNL_ALIGN(sizeof(struct nlmsghdr)) +
951                       MNL_ALIGN(sizeof(struct tcmsg));
952         struct mlx5_flow *dev_flow;
953         struct nlmsghdr *nlh;
954         struct tcmsg *tcm;
955
956         size += flow_tcf_get_items_and_size(items, item_flags);
957         size += flow_tcf_get_actions_and_size(actions, action_flags);
958         dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
959         if (!dev_flow) {
960                 rte_flow_error_set(error, ENOMEM,
961                                    RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
962                                    "not enough memory to create E-Switch flow");
963                 return NULL;
964         }
965         nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
966         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
967         *dev_flow = (struct mlx5_flow){
968                 .tcf = (struct mlx5_flow_tcf){
969                         .nlh = nlh,
970                         .tcm = tcm,
971                 },
972         };
973         /*
974          * Generate a reasonably unique handle based on the address of the
975          * target buffer.
976          *
977          * This is straightforward on 32-bit systems where the flow pointer can
978          * be used directly. Otherwise, its least significant part is taken
979          * after shifting it by the previous power of two of the pointed buffer
980          * size.
981          */
982         if (sizeof(dev_flow) <= 4)
983                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
984         else
985                 flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
986                                        rte_log2_u32(rte_align32prevpow2(size)));
987         return dev_flow;
988 }
989
990 /**
991  * Translate flow for Linux TC flower and construct Netlink message.
992  *
 * @param[in] dev
 *   Pointer to the Ethernet device structure.
995  * @param[in, out] flow
996  *   Pointer to the sub flow.
997  * @param[in] attr
998  *   Pointer to the flow attributes.
999  * @param[in] items
1000  *   Pointer to the list of items.
1001  * @param[in] actions
1002  *   Pointer to the list of actions.
1003  * @param[out] error
1004  *   Pointer to the error structure.
1005  *
1006  * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
1008  */
1009 static int
1010 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
1011                    const struct rte_flow_attr *attr,
1012                    const struct rte_flow_item items[],
1013                    const struct rte_flow_action actions[],
1014                    struct rte_flow_error *error)
1015 {
1016         union {
1017                 const struct rte_flow_item_port_id *port_id;
1018                 const struct rte_flow_item_eth *eth;
1019                 const struct rte_flow_item_vlan *vlan;
1020                 const struct rte_flow_item_ipv4 *ipv4;
1021                 const struct rte_flow_item_ipv6 *ipv6;
1022                 const struct rte_flow_item_tcp *tcp;
1023                 const struct rte_flow_item_udp *udp;
1024         } spec, mask;
1025         union {
1026                 const struct rte_flow_action_port_id *port_id;
1027                 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1028                 const struct rte_flow_action_of_set_vlan_vid *
1029                         of_set_vlan_vid;
1030                 const struct rte_flow_action_of_set_vlan_pcp *
1031                         of_set_vlan_pcp;
1032         } conf;
1033         struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1034         struct nlmsghdr *nlh = dev_flow->tcf.nlh;
1035         struct tcmsg *tcm = dev_flow->tcf.tcm;
1036         uint32_t na_act_index_cur;
1037         bool eth_type_set = 0;
1038         bool vlan_present = 0;
1039         bool vlan_eth_type_set = 0;
1040         bool ip_proto_set = 0;
1041         struct nlattr *na_flower;
1042         struct nlattr *na_flower_act;
1043         struct nlattr *na_vlan_id = NULL;
1044         struct nlattr *na_vlan_priority = NULL;
1045
1046         claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1047                                                 PTOI_TABLE_SZ_MAX(dev)));
1048         nlh = dev_flow->tcf.nlh;
1049         tcm = dev_flow->tcf.tcm;
1050         /* Prepare API must have been called beforehand. */
1051         assert(nlh != NULL && tcm != NULL);
1052         tcm->tcm_family = AF_UNSPEC;
1053         tcm->tcm_ifindex = ptoi[0].ifindex;
1054         tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
1055         /*
1056          * Priority cannot be zero to prevent the kernel from picking one
1057          * automatically.
1058          */
1059         tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
1060                                   RTE_BE16(ETH_P_ALL));
1061         mnl_attr_put_strz(nlh, TCA_KIND, "flower");
1062         na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
1063         mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, TCA_CLS_FLAGS_SKIP_SW);
1064         for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1065                 unsigned int i;
1066
1067                 switch (items->type) {
1068                 case RTE_FLOW_ITEM_TYPE_VOID:
1069                         break;
1070                 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1071                         mask.port_id = flow_tcf_item_mask
1072                                 (items, &rte_flow_item_port_id_mask,
1073                                  &flow_tcf_mask_supported.port_id,
1074                                  &flow_tcf_mask_empty.port_id,
1075                                  sizeof(flow_tcf_mask_supported.port_id),
1076                                  error);
1077                         assert(mask.port_id);
1078                         if (mask.port_id == &flow_tcf_mask_empty.port_id)
1079                                 break;
1080                         spec.port_id = items->spec;
1081                         if (!mask.port_id->id)
1082                                 i = 0;
1083                         else
1084                                 for (i = 0; ptoi[i].ifindex; ++i)
1085                                         if (ptoi[i].port_id == spec.port_id->id)
1086                                                 break;
1087                         assert(ptoi[i].ifindex);
1088                         tcm->tcm_ifindex = ptoi[i].ifindex;
1089                         break;
1090                 case RTE_FLOW_ITEM_TYPE_ETH:
1091                         mask.eth = flow_tcf_item_mask
1092                                 (items, &rte_flow_item_eth_mask,
1093                                  &flow_tcf_mask_supported.eth,
1094                                  &flow_tcf_mask_empty.eth,
1095                                  sizeof(flow_tcf_mask_supported.eth),
1096                                  error);
1097                         assert(mask.eth);
1098                         if (mask.eth == &flow_tcf_mask_empty.eth)
1099                                 break;
1100                         spec.eth = items->spec;
1101                         if (mask.eth->type) {
1102                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
1103                                                  spec.eth->type);
1104                                 eth_type_set = 1;
1105                         }
1106                         if (!is_zero_ether_addr(&mask.eth->dst)) {
1107                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
1108                                              ETHER_ADDR_LEN,
1109                                              spec.eth->dst.addr_bytes);
1110                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
1111                                              ETHER_ADDR_LEN,
1112                                              mask.eth->dst.addr_bytes);
1113                         }
1114                         if (!is_zero_ether_addr(&mask.eth->src)) {
1115                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
1116                                              ETHER_ADDR_LEN,
1117                                              spec.eth->src.addr_bytes);
1118                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
1119                                              ETHER_ADDR_LEN,
1120                                              mask.eth->src.addr_bytes);
1121                         }
1122                         break;
1123                 case RTE_FLOW_ITEM_TYPE_VLAN:
1124                         mask.vlan = flow_tcf_item_mask
1125                                 (items, &rte_flow_item_vlan_mask,
1126                                  &flow_tcf_mask_supported.vlan,
1127                                  &flow_tcf_mask_empty.vlan,
1128                                  sizeof(flow_tcf_mask_supported.vlan),
1129                                  error);
1130                         assert(mask.vlan);
1131                         if (!eth_type_set)
1132                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
1133                                                  RTE_BE16(ETH_P_8021Q));
1134                         eth_type_set = 1;
1135                         vlan_present = 1;
1136                         if (mask.vlan == &flow_tcf_mask_empty.vlan)
1137                                 break;
1138                         spec.vlan = items->spec;
1139                         if (mask.vlan->inner_type) {
1140                                 mnl_attr_put_u16(nlh,
1141                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
1142                                                  spec.vlan->inner_type);
1143                                 vlan_eth_type_set = 1;
1144                         }
1145                         if (mask.vlan->tci & RTE_BE16(0xe000))
1146                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
1147                                                 (rte_be_to_cpu_16
1148                                                  (spec.vlan->tci) >> 13) & 0x7);
1149                         if (mask.vlan->tci & RTE_BE16(0x0fff))
1150                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
1151                                                  rte_be_to_cpu_16
1152                                                  (spec.vlan->tci &
1153                                                   RTE_BE16(0x0fff)));
1154                         break;
1155                 case RTE_FLOW_ITEM_TYPE_IPV4:
1156                         mask.ipv4 = flow_tcf_item_mask
1157                                 (items, &rte_flow_item_ipv4_mask,
1158                                  &flow_tcf_mask_supported.ipv4,
1159                                  &flow_tcf_mask_empty.ipv4,
1160                                  sizeof(flow_tcf_mask_supported.ipv4),
1161                                  error);
1162                         assert(mask.ipv4);
1163                         if (!eth_type_set || !vlan_eth_type_set)
1164                                 mnl_attr_put_u16(nlh,
1165                                                  vlan_present ?
1166                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
1167                                                  TCA_FLOWER_KEY_ETH_TYPE,
1168                                                  RTE_BE16(ETH_P_IP));
1169                         eth_type_set = 1;
1170                         vlan_eth_type_set = 1;
1171                         if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
1172                                 break;
1173                         spec.ipv4 = items->spec;
1174                         if (mask.ipv4->hdr.next_proto_id) {
1175                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
1176                                                 spec.ipv4->hdr.next_proto_id);
1177                                 ip_proto_set = 1;
1178                         }
1179                         if (mask.ipv4->hdr.src_addr) {
1180                                 mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_SRC,
1181                                                  spec.ipv4->hdr.src_addr);
1182                                 mnl_attr_put_u32(nlh,
1183                                                  TCA_FLOWER_KEY_IPV4_SRC_MASK,
1184                                                  mask.ipv4->hdr.src_addr);
1185                         }
1186                         if (mask.ipv4->hdr.dst_addr) {
1187                                 mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_DST,
1188                                                  spec.ipv4->hdr.dst_addr);
1189                                 mnl_attr_put_u32(nlh,
1190                                                  TCA_FLOWER_KEY_IPV4_DST_MASK,
1191                                                  mask.ipv4->hdr.dst_addr);
1192                         }
1193                         break;
1194                 case RTE_FLOW_ITEM_TYPE_IPV6:
1195                         mask.ipv6 = flow_tcf_item_mask
1196                                 (items, &rte_flow_item_ipv6_mask,
1197                                  &flow_tcf_mask_supported.ipv6,
1198                                  &flow_tcf_mask_empty.ipv6,
1199                                  sizeof(flow_tcf_mask_supported.ipv6),
1200                                  error);
1201                         assert(mask.ipv6);
1202                         if (!eth_type_set || !vlan_eth_type_set)
1203                                 mnl_attr_put_u16(nlh,
1204                                                  vlan_present ?
1205                                                  TCA_FLOWER_KEY_VLAN_ETH_TYPE :
1206                                                  TCA_FLOWER_KEY_ETH_TYPE,
1207                                                  RTE_BE16(ETH_P_IPV6));
1208                         eth_type_set = 1;
1209                         vlan_eth_type_set = 1;
1210                         if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
1211                                 break;
1212                         spec.ipv6 = items->spec;
1213                         if (mask.ipv6->hdr.proto) {
1214                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
1215                                                 spec.ipv6->hdr.proto);
1216                                 ip_proto_set = 1;
1217                         }
1218                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
1219                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC,
1220                                              sizeof(spec.ipv6->hdr.src_addr),
1221                                              spec.ipv6->hdr.src_addr);
1222                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
1223                                              sizeof(mask.ipv6->hdr.src_addr),
1224                                              mask.ipv6->hdr.src_addr);
1225                         }
1226                         if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
1227                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST,
1228                                              sizeof(spec.ipv6->hdr.dst_addr),
1229                                              spec.ipv6->hdr.dst_addr);
1230                                 mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST_MASK,
1231                                              sizeof(mask.ipv6->hdr.dst_addr),
1232                                              mask.ipv6->hdr.dst_addr);
1233                         }
1234                         break;
1235                 case RTE_FLOW_ITEM_TYPE_UDP:
1236                         mask.udp = flow_tcf_item_mask
1237                                 (items, &rte_flow_item_udp_mask,
1238                                  &flow_tcf_mask_supported.udp,
1239                                  &flow_tcf_mask_empty.udp,
1240                                  sizeof(flow_tcf_mask_supported.udp),
1241                                  error);
1242                         assert(mask.udp);
1243                         if (!ip_proto_set)
1244                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
1245                                                 IPPROTO_UDP);
1246                         if (mask.udp == &flow_tcf_mask_empty.udp)
1247                                 break;
1248                         spec.udp = items->spec;
1249                         if (mask.udp->hdr.src_port) {
1250                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_SRC,
1251                                                  spec.udp->hdr.src_port);
1252                                 mnl_attr_put_u16(nlh,
1253                                                  TCA_FLOWER_KEY_UDP_SRC_MASK,
1254                                                  mask.udp->hdr.src_port);
1255                         }
1256                         if (mask.udp->hdr.dst_port) {
1257                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_DST,
1258                                                  spec.udp->hdr.dst_port);
1259                                 mnl_attr_put_u16(nlh,
1260                                                  TCA_FLOWER_KEY_UDP_DST_MASK,
1261                                                  mask.udp->hdr.dst_port);
1262                         }
1263                         break;
1264                 case RTE_FLOW_ITEM_TYPE_TCP:
1265                         mask.tcp = flow_tcf_item_mask
1266                                 (items, &rte_flow_item_tcp_mask,
1267                                  &flow_tcf_mask_supported.tcp,
1268                                  &flow_tcf_mask_empty.tcp,
1269                                  sizeof(flow_tcf_mask_supported.tcp),
1270                                  error);
1271                         assert(mask.tcp);
1272                         if (!ip_proto_set)
1273                                 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
1274                                                 IPPROTO_TCP);
1275                         if (mask.tcp == &flow_tcf_mask_empty.tcp)
1276                                 break;
1277                         spec.tcp = items->spec;
1278                         if (mask.tcp->hdr.src_port) {
1279                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
1280                                                  spec.tcp->hdr.src_port);
1281                                 mnl_attr_put_u16(nlh,
1282                                                  TCA_FLOWER_KEY_TCP_SRC_MASK,
1283                                                  mask.tcp->hdr.src_port);
1284                         }
1285                         if (mask.tcp->hdr.dst_port) {
1286                                 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
1287                                                  spec.tcp->hdr.dst_port);
1288                                 mnl_attr_put_u16(nlh,
1289                                                  TCA_FLOWER_KEY_TCP_DST_MASK,
1290                                                  mask.tcp->hdr.dst_port);
1291                         }
1292                         break;
1293                 default:
1294                         return rte_flow_error_set(error, ENOTSUP,
1295                                                   RTE_FLOW_ERROR_TYPE_ITEM,
1296                                                   NULL, "item not supported");
1297                 }
1298         }
1299         na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
1300         na_act_index_cur = 1;
1301         for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1302                 struct nlattr *na_act_index;
1303                 struct nlattr *na_act;
1304                 unsigned int vlan_act;
1305                 unsigned int i;
1306
1307                 switch (actions->type) {
1308                 case RTE_FLOW_ACTION_TYPE_VOID:
1309                         break;
1310                 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1311                         conf.port_id = actions->conf;
1312                         if (conf.port_id->original)
1313                                 i = 0;
1314                         else
1315                                 for (i = 0; ptoi[i].ifindex; ++i)
1316                                         if (ptoi[i].port_id == conf.port_id->id)
1317                                                 break;
1318                         assert(ptoi[i].ifindex);
1319                         na_act_index =
1320                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
1321                         assert(na_act_index);
1322                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
1323                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
1324                         assert(na_act);
1325                         mnl_attr_put(nlh, TCA_MIRRED_PARMS,
1326                                      sizeof(struct tc_mirred),
1327                                      &(struct tc_mirred){
1328                                         .action = TC_ACT_STOLEN,
1329                                         .eaction = TCA_EGRESS_REDIR,
1330                                         .ifindex = ptoi[i].ifindex,
1331                                      });
1332                         mnl_attr_nest_end(nlh, na_act);
1333                         mnl_attr_nest_end(nlh, na_act_index);
1334                         break;
1335                 case RTE_FLOW_ACTION_TYPE_DROP:
1336                         na_act_index =
1337                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
1338                         assert(na_act_index);
1339                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
1340                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
1341                         assert(na_act);
1342                         mnl_attr_put(nlh, TCA_GACT_PARMS,
1343                                      sizeof(struct tc_gact),
1344                                      &(struct tc_gact){
1345                                         .action = TC_ACT_SHOT,
1346                                      });
1347                         mnl_attr_nest_end(nlh, na_act);
1348                         mnl_attr_nest_end(nlh, na_act_index);
1349                         break;
1350                 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1351                         conf.of_push_vlan = NULL;
1352                         vlan_act = TCA_VLAN_ACT_POP;
1353                         goto action_of_vlan;
1354                 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
1355                         conf.of_push_vlan = actions->conf;
1356                         vlan_act = TCA_VLAN_ACT_PUSH;
1357                         goto action_of_vlan;
1358                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1359                         conf.of_set_vlan_vid = actions->conf;
1360                         if (na_vlan_id)
1361                                 goto override_na_vlan_id;
1362                         vlan_act = TCA_VLAN_ACT_MODIFY;
1363                         goto action_of_vlan;
1364                 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1365                         conf.of_set_vlan_pcp = actions->conf;
1366                         if (na_vlan_priority)
1367                                 goto override_na_vlan_priority;
1368                         vlan_act = TCA_VLAN_ACT_MODIFY;
1369                         goto action_of_vlan;
1370 action_of_vlan:
1371                         na_act_index =
1372                                 mnl_attr_nest_start(nlh, na_act_index_cur++);
1373                         assert(na_act_index);
1374                         mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
1375                         na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
1376                         assert(na_act);
1377                         mnl_attr_put(nlh, TCA_VLAN_PARMS,
1378                                      sizeof(struct tc_vlan),
1379                                      &(struct tc_vlan){
1380                                         .action = TC_ACT_PIPE,
1381                                         .v_action = vlan_act,
1382                                      });
1383                         if (vlan_act == TCA_VLAN_ACT_POP) {
1384                                 mnl_attr_nest_end(nlh, na_act);
1385                                 mnl_attr_nest_end(nlh, na_act_index);
1386                                 break;
1387                         }
1388                         if (vlan_act == TCA_VLAN_ACT_PUSH)
1389                                 mnl_attr_put_u16(nlh,
1390                                                  TCA_VLAN_PUSH_VLAN_PROTOCOL,
1391                                                  conf.of_push_vlan->ethertype);
1392                         na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
1393                         mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
1394                         na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
1395                         mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
1396                         mnl_attr_nest_end(nlh, na_act);
1397                         mnl_attr_nest_end(nlh, na_act_index);
1398                         if (actions->type ==
1399                             RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
1400 override_na_vlan_id:
1401                                 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
1402                                 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
1403                                         rte_be_to_cpu_16
1404                                         (conf.of_set_vlan_vid->vlan_vid);
1405                         } else if (actions->type ==
1406                                    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
1407 override_na_vlan_priority:
1408                                 na_vlan_priority->nla_type =
1409                                         TCA_VLAN_PUSH_VLAN_PRIORITY;
1410                                 *(uint8_t *)mnl_attr_get_payload
1411                                         (na_vlan_priority) =
1412                                         conf.of_set_vlan_pcp->vlan_pcp;
1413                         }
1414                         break;
1415                 default:
1416                         return rte_flow_error_set(error, ENOTSUP,
1417                                                   RTE_FLOW_ERROR_TYPE_ACTION,
1418                                                   actions,
1419                                                   "action not supported");
1420                 }
1421         }
1422         assert(na_flower);
1423         assert(na_flower_act);
1424         mnl_attr_nest_end(nlh, na_flower_act);
1425         mnl_attr_nest_end(nlh, na_flower);
1426         return 0;
1427 }
1428
1429 /**
1430  * Send Netlink message with acknowledgment.
1431  *
1432  * @param nl
1433  *   Libmnl socket to use.
1434  * @param nlh
1435  *   Message to send. This function always raises the NLM_F_ACK flag before
1436  *   sending.
1437  *
1438  * @return
1439  *   0 on success, a negative errno value otherwise and rte_errno is set.
1440  */
1441 static int
1442 flow_tcf_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
1443 {
1444         alignas(struct nlmsghdr)
1445         uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
1446                     nlh->nlmsg_len - sizeof(*nlh)];
1447         uint32_t seq = random();
1448         int ret;
1449
1450         nlh->nlmsg_flags |= NLM_F_ACK;
1451         nlh->nlmsg_seq = seq;
1452         ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
1453         if (ret != -1)
1454                 ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
1455         if (ret != -1)
1456                 ret = mnl_cb_run
1457                         (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
1458         if (ret > 0)
1459                 return 0;
1460         rte_errno = errno;
1461         return -rte_errno;
1462 }
1463
1464 /**
1465  * Apply flow to E-Switch by sending Netlink message.
1466  *
1467  * @param[in] dev
1468  *   Pointer to Ethernet device.
1469  * @param[in, out] flow
1470  *   Pointer to the sub flow.
1471  * @param[out] error
1472  *   Pointer to the error structure.
1473  *
1474  * @return
1475  *   0 on success, a negative errno value otherwise and rte_ernno is set.
1476  */
1477 static int
1478 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
1479                struct rte_flow_error *error)
1480 {
1481         struct priv *priv = dev->data->dev_private;
1482         struct mnl_socket *nl = priv->mnl_socket;
1483         struct mlx5_flow *dev_flow;
1484         struct nlmsghdr *nlh;
1485
1486         dev_flow = LIST_FIRST(&flow->dev_flows);
1487         /* E-Switch flow can't be expanded. */
1488         assert(!LIST_NEXT(dev_flow, next));
1489         nlh = dev_flow->tcf.nlh;
1490         nlh->nlmsg_type = RTM_NEWTFILTER;
1491         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
1492         if (!flow_tcf_nl_ack(nl, nlh))
1493                 return 0;
1494         return rte_flow_error_set(error, rte_errno,
1495                                   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1496                                   "netlink: failed to create TC flow rule");
1497 }
1498
1499 /**
1500  * Remove flow from E-Switch by sending Netlink message.
1501  *
1502  * @param[in] dev
1503  *   Pointer to Ethernet device.
1504  * @param[in, out] flow
1505  *   Pointer to the sub flow.
1506  */
1507 static void
1508 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
1509 {
1510         struct priv *priv = dev->data->dev_private;
1511         struct mnl_socket *nl = priv->mnl_socket;
1512         struct mlx5_flow *dev_flow;
1513         struct nlmsghdr *nlh;
1514
1515         if (!flow)
1516                 return;
1517         dev_flow = LIST_FIRST(&flow->dev_flows);
1518         if (!dev_flow)
1519                 return;
1520         /* E-Switch flow can't be expanded. */
1521         assert(!LIST_NEXT(dev_flow, next));
1522         nlh = dev_flow->tcf.nlh;
1523         nlh->nlmsg_type = RTM_DELTFILTER;
1524         nlh->nlmsg_flags = NLM_F_REQUEST;
1525         flow_tcf_nl_ack(nl, nlh);
1526 }
1527
1528 /**
1529  * Remove flow from E-Switch and release resources of the device flow.
1530  *
1531  * @param[in] dev
1532  *   Pointer to Ethernet device.
1533  * @param[in, out] flow
1534  *   Pointer to the sub flow.
1535  */
1536 static void
1537 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
1538 {
1539         struct mlx5_flow *dev_flow;
1540
1541         if (!flow)
1542                 return;
1543         flow_tcf_remove(dev, flow);
1544         dev_flow = LIST_FIRST(&flow->dev_flows);
1545         if (!dev_flow)
1546                 return;
1547         /* E-Switch flow can't be expanded. */
1548         assert(!LIST_NEXT(dev_flow, next));
1549         LIST_REMOVE(dev_flow, next);
1550         rte_free(dev_flow);
1551 }
1552
/* Driver callbacks implementing rte_flow on top of TC flower (E-Switch). */
const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
	.validate = flow_tcf_validate,
	.prepare = flow_tcf_prepare,
	.translate = flow_tcf_translate,
	.apply = flow_tcf_apply,
	.remove = flow_tcf_remove,
	.destroy = flow_tcf_destroy,
};
1561
1562 /**
1563  * Initialize ingress qdisc of a given network interface.
1564  *
1565  * @param nl
1566  *   Libmnl socket of the @p NETLINK_ROUTE kind.
1567  * @param ifindex
1568  *   Index of network interface to initialize.
1569  * @param[out] error
1570  *   Perform verbose error reporting if not NULL.
1571  *
1572  * @return
1573  *   0 on success, a negative errno value otherwise and rte_errno is set.
1574  */
1575 int
1576 mlx5_flow_tcf_init(struct mnl_socket *nl, unsigned int ifindex,
1577                    struct rte_flow_error *error)
1578 {
1579         struct nlmsghdr *nlh;
1580         struct tcmsg *tcm;
1581         alignas(struct nlmsghdr)
1582         uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];
1583
1584         /* Destroy existing ingress qdisc and everything attached to it. */
1585         nlh = mnl_nlmsg_put_header(buf);
1586         nlh->nlmsg_type = RTM_DELQDISC;
1587         nlh->nlmsg_flags = NLM_F_REQUEST;
1588         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
1589         tcm->tcm_family = AF_UNSPEC;
1590         tcm->tcm_ifindex = ifindex;
1591         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
1592         tcm->tcm_parent = TC_H_INGRESS;
1593         /* Ignore errors when qdisc is already absent. */
1594         if (flow_tcf_nl_ack(nl, nlh) &&
1595             rte_errno != EINVAL && rte_errno != ENOENT)
1596                 return rte_flow_error_set(error, rte_errno,
1597                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1598                                           "netlink: failed to remove ingress"
1599                                           " qdisc");
1600         /* Create fresh ingress qdisc. */
1601         nlh = mnl_nlmsg_put_header(buf);
1602         nlh->nlmsg_type = RTM_NEWQDISC;
1603         nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
1604         tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
1605         tcm->tcm_family = AF_UNSPEC;
1606         tcm->tcm_ifindex = ifindex;
1607         tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
1608         tcm->tcm_parent = TC_H_INGRESS;
1609         mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
1610         if (flow_tcf_nl_ack(nl, nlh))
1611                 return rte_flow_error_set(error, rte_errno,
1612                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
1613                                           "netlink: failed to create ingress"
1614                                           " qdisc");
1615         return 0;
1616 }
1617
1618 /**
1619  * Create and configure a libmnl socket for Netlink flow rules.
1620  *
1621  * @return
1622  *   A valid libmnl socket object pointer on success, NULL otherwise and
1623  *   rte_errno is set.
1624  */
1625 struct mnl_socket *
1626 mlx5_flow_tcf_socket_create(void)
1627 {
1628         struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
1629
1630         if (nl) {
1631                 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
1632                                       sizeof(int));
1633                 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
1634                         return nl;
1635         }
1636         rte_errno = errno;
1637         if (nl)
1638                 mnl_socket_close(nl);
1639         return NULL;
1640 }
1641
/**
 * Destroy a libmnl socket.
 *
 * @param nl
 *   Libmnl socket of the @p NETLINK_ROUTE kind.
 */
void
mlx5_flow_tcf_socket_destroy(struct mnl_socket *nl)
{
	/* NOTE(review): no NULL guard here — callers presumably pass a valid
	 * socket; confirm mnl_socket_close() tolerance before relying on it. */
	mnl_socket_close(nl);
}