net/bonding: support flow API
[dpdk.git] drivers/net/bonding/rte_eth_bond_pmd.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 #include <stdlib.h>
5 #include <netinet/in.h>
6
7 #include <rte_mbuf.h>
8 #include <rte_malloc.h>
9 #include <rte_ethdev_driver.h>
10 #include <rte_ethdev_vdev.h>
11 #include <rte_tcp.h>
12 #include <rte_udp.h>
13 #include <rte_ip.h>
14 #include <rte_ip_frag.h>
15 #include <rte_devargs.h>
16 #include <rte_kvargs.h>
17 #include <rte_bus_vdev.h>
18 #include <rte_alarm.h>
19 #include <rte_cycles.h>
20 #include <rte_string_fns.h>
21
22 #include "rte_eth_bond.h"
23 #include "rte_eth_bond_private.h"
24 #include "rte_eth_bond_8023ad_private.h"
25
26 #define REORDER_PERIOD_MS 10
27 #define DEFAULT_POLLING_INTERVAL_10_MS (10)
28
29 #define HASH_L4_PORTS(h) ((h)->src_port ^ (h)->dst_port)
30
31 /* Table for statistics in mode 5 TLB */
32 static uint64_t tlb_last_obytets[RTE_MAX_ETHPORTS];
33
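/*
 * Skip up to two stacked VLAN headers: return the byte offset from the end of
 * the Ethernet header to the encapsulated payload and update *proto to the
 * inner EtherType.
 */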
34 static inline size_t
35 get_vlan_offset(struct ether_hdr *eth_hdr, uint16_t *proto)
36 {
37         size_t vlan_offset = 0;
38
39         if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
40                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
41
42                 vlan_offset = sizeof(struct vlan_hdr);
43                 *proto = vlan_hdr->eth_proto;
44
45                 if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
46                         vlan_hdr = vlan_hdr + 1;
47                         *proto = vlan_hdr->eth_proto;
48                         vlan_offset += sizeof(struct vlan_hdr);
49                 }
50         }
51         return vlan_offset;
52 }
53
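/*
 * Basic receive burst: poll each active slave in turn, appending the packets
 * it returns to bufs, until the request is filled or all slaves are polled.
 */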
54 static uint16_t
55 bond_ethdev_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
56 {
57         struct bond_dev_private *internals;
58
59         uint16_t num_rx_slave = 0;
60         uint16_t num_rx_total = 0;
61
62         int i;
63
64         /* Cast to structure containing the bonded device's port id and queue id */
65         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
66
67         internals = bd_rx_q->dev_private;
68
69
70         for (i = 0; i < internals->active_slave_count && nb_pkts; i++) {
71                 /* Offset of pointer to *bufs increases as packets are received
72                  * from other slaves */
73                 num_rx_slave = rte_eth_rx_burst(internals->active_slaves[i],
74                                 bd_rx_q->queue_id, bufs + num_rx_total, nb_pkts);
75                 if (num_rx_slave) {
76                         num_rx_total += num_rx_slave;
77                         nb_pkts -= num_rx_slave;
78                 }
79         }
80
81         return num_rx_total;
82 }
83
84 static uint16_t
85 bond_ethdev_rx_burst_active_backup(void *queue, struct rte_mbuf **bufs,
86                 uint16_t nb_pkts)
87 {
88         struct bond_dev_private *internals;
89
90         /* Cast to structure containing the bonded device's port id and queue id */
91         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
92
93         internals = bd_rx_q->dev_private;
94
95         return rte_eth_rx_burst(internals->current_primary_port,
96                         bd_rx_q->queue_id, bufs, nb_pkts);
97 }
98
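/*
 * Return non-zero for untagged slow protocol frames (LACP or marker PDUs),
 * which are consumed by the bonding device rather than the application.
 */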
99 static inline uint8_t
100 is_lacp_packets(uint16_t ethertype, uint8_t subtype, struct rte_mbuf *mbuf)
101 {
102         const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
103
104         return !((mbuf->ol_flags & PKT_RX_VLAN) ? mbuf->vlan_tci : 0) &&
105                 (ethertype == ether_type_slow_be &&
106                 (subtype == SLOW_SUBTYPE_MARKER || subtype == SLOW_SUBTYPE_LACP));
107 }
108
109 /*****************************************************************************
110  * Flow director's setup for mode 4 optimization
111  */
112
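/*
 * Flow rule pattern matching the slow protocols EtherType (ETHER_TYPE_SLOW,
 * 0x8809) so slaves can steer LACP frames to a dedicated receive queue.
 */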
113 static struct rte_flow_item_eth flow_item_eth_type_8023ad = {
114         .dst.addr_bytes = { 0 },
115         .src.addr_bytes = { 0 },
116         .type = RTE_BE16(ETHER_TYPE_SLOW),
117 };
118
119 static struct rte_flow_item_eth flow_item_eth_mask_type_8023ad = {
120         .dst.addr_bytes = { 0 },
121         .src.addr_bytes = { 0 },
122         .type = 0xFFFF,
123 };
124
125 static struct rte_flow_item flow_item_8023ad[] = {
126         {
127                 .type = RTE_FLOW_ITEM_TYPE_ETH,
128                 .spec = &flow_item_eth_type_8023ad,
129                 .last = NULL,
130                 .mask = &flow_item_eth_mask_type_8023ad,
131         },
132         {
133                 .type = RTE_FLOW_ITEM_TYPE_END,
134                 .spec = NULL,
135                 .last = NULL,
136                 .mask = NULL,
137         }
138 };
139
140 const struct rte_flow_attr flow_attr_8023ad = {
141         .group = 0,
142         .priority = 0,
143         .ingress = 1,
144         .egress = 0,
145         .reserved = 0,
146 };
147
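/*
 * Check that a slave can accept the LACP steering flow rule and has enough
 * queue capacity for the extra dedicated rx/tx queues.
 */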
148 int
149 bond_ethdev_8023ad_flow_verify(struct rte_eth_dev *bond_dev,
150                 uint16_t slave_port) {
151         struct rte_eth_dev_info slave_info;
152         struct rte_flow_error error;
153         struct bond_dev_private *internals = (struct bond_dev_private *)
154                         (bond_dev->data->dev_private);
155
156         const struct rte_flow_action_queue lacp_queue_conf = {
157                 .index = 0,
158         };
159
160         const struct rte_flow_action actions[] = {
161                 {
162                         .type = RTE_FLOW_ACTION_TYPE_QUEUE,
163                         .conf = &lacp_queue_conf
164                 },
165                 {
166                         .type = RTE_FLOW_ACTION_TYPE_END,
167                 }
168         };
169
170         int ret = rte_flow_validate(slave_port, &flow_attr_8023ad,
171                         flow_item_8023ad, actions, &error);
172         if (ret < 0) {
173                 RTE_BOND_LOG(ERR, "%s: %s (slave_port=%d queue_id=%d)",
174                                 __func__, error.message, slave_port,
175                                 internals->mode4.dedicated_queues.rx_qid);
176                 return -1;
177         }
178
179         rte_eth_dev_info_get(slave_port, &slave_info);
180         if (slave_info.max_rx_queues < bond_dev->data->nb_rx_queues ||
181                         slave_info.max_tx_queues < bond_dev->data->nb_tx_queues) {
182                 RTE_BOND_LOG(ERR,
183                         "%s: Slave %d capabilities don't allow allocating additional queues",
184                         __func__, slave_port);
185                 return -1;
186         }
187
188         return 0;
189 }
190
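/*
 * Check whether every slave of the bonding device can filter slow frames in
 * hardware, as required for the mode 4 dedicated queues.
 */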
191 int
192 bond_8023ad_slow_pkt_hw_filter_supported(uint16_t port_id) {
193         struct rte_eth_dev *bond_dev = &rte_eth_devices[port_id];
194         struct bond_dev_private *internals = (struct bond_dev_private *)
195                         (bond_dev->data->dev_private);
196         struct rte_eth_dev_info bond_info;
197         uint16_t idx;
198
199         /* Verify that all slaves in the bonding support the 802.3ad flow rules and have enough queues */
200         if (internals->slave_count > 0) {
201                 rte_eth_dev_info_get(bond_dev->data->port_id, &bond_info);
202
203                 internals->mode4.dedicated_queues.rx_qid = bond_info.nb_rx_queues;
204                 internals->mode4.dedicated_queues.tx_qid = bond_info.nb_tx_queues;
205
206                 for (idx = 0; idx < internals->slave_count; idx++) {
207                         if (bond_ethdev_8023ad_flow_verify(bond_dev,
208                                         internals->slaves[idx].port_id) != 0)
209                                 return -1;
210                 }
211         }
212
213         return 0;
214 }
215
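/*
 * Create the LACP steering flow rule on a slave so that slow frames are
 * delivered to the dedicated receive queue.
 */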
216 int
217 bond_ethdev_8023ad_flow_set(struct rte_eth_dev *bond_dev, uint16_t slave_port) {
218
219         struct rte_flow_error error;
220         struct bond_dev_private *internals = (struct bond_dev_private *)
221                         (bond_dev->data->dev_private);
222
223         struct rte_flow_action_queue lacp_queue_conf = {
224                 .index = internals->mode4.dedicated_queues.rx_qid,
225         };
226
227         const struct rte_flow_action actions[] = {
228                 {
229                         .type = RTE_FLOW_ACTION_TYPE_QUEUE,
230                         .conf = &lacp_queue_conf
231                 },
232                 {
233                         .type = RTE_FLOW_ACTION_TYPE_END,
234                 }
235         };
236
237         internals->mode4.dedicated_queues.flow[slave_port] = rte_flow_create(slave_port,
238                         &flow_attr_8023ad, flow_item_8023ad, actions, &error);
239         if (internals->mode4.dedicated_queues.flow[slave_port] == NULL) {
240                 RTE_BOND_LOG(ERR, "bond_ethdev_8023ad_flow_set: %s "
241                                 "(slave_port=%d queue_id=%d)",
242                                 error.message, slave_port,
243                                 internals->mode4.dedicated_queues.rx_qid);
244                 return -1;
245         }
246
247         return 0;
248 }
249
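/*
 * Mode 4 receive burst with dedicated queues: slow frames are filtered out in
 * hardware, so packets are simply read from the active slaves round-robin.
 */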
250 static uint16_t
251 bond_ethdev_rx_burst_8023ad_fast_queue(void *queue, struct rte_mbuf **bufs,
252                 uint16_t nb_pkts)
253 {
254         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
255         struct bond_dev_private *internals = bd_rx_q->dev_private;
256         uint16_t num_rx_total = 0;      /* Total number of received packets */
257         uint16_t slaves[RTE_MAX_ETHPORTS];
258         uint16_t slave_count;
259
260         uint16_t i, idx;
261
262         /* Copy slave list to protect against slave up/down changes during rx
263          * bursting */
264         slave_count = internals->active_slave_count;
265         memcpy(slaves, internals->active_slaves,
266                         sizeof(internals->active_slaves[0]) * slave_count);
267
268         for (i = 0, idx = internals->active_slave;
269                         i < slave_count && num_rx_total < nb_pkts; i++, idx++) {
270                 idx = idx % slave_count;
271
272                 /* Read packets from this slave */
273                 num_rx_total += rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id,
274                                 &bufs[num_rx_total], nb_pkts - num_rx_total);
275         }
276
277         internals->active_slave = idx;
278
279         return num_rx_total;
280 }
281
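/*
 * Mode 4 transmit burst with dedicated queues: distribute the packets over
 * the slaves in DISTRIBUTING state using the configured transmit hash.
 */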
282 static uint16_t
283 bond_ethdev_tx_burst_8023ad_fast_queue(void *queue, struct rte_mbuf **bufs,
284                 uint16_t nb_bufs)
285 {
286         struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
287         struct bond_dev_private *internals = bd_tx_q->dev_private;
288
289         uint16_t slave_port_ids[RTE_MAX_ETHPORTS];
290         uint16_t slave_count;
291
292         uint16_t dist_slave_port_ids[RTE_MAX_ETHPORTS];
293         uint16_t dist_slave_count;
294
295         /* 2-D array to sort mbufs for transmission on each slave into */
296         struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_bufs];
297         /* Number of mbufs for transmission on each slave */
298         uint16_t slave_nb_bufs[RTE_MAX_ETHPORTS] = { 0 };
299         /* Mapping array generated by hash function to map mbufs to slaves */
300         uint16_t bufs_slave_port_idxs[nb_bufs];
301
302         uint16_t slave_tx_count, slave_tx_fail_count[RTE_MAX_ETHPORTS] = { 0 };
303         uint16_t total_tx_count = 0, total_tx_fail_count = 0;
304
305         uint16_t i, j;
306
307         if (unlikely(nb_bufs == 0))
308                 return 0;
309
310         /* Copy slave list to protect against slave up/down changes during tx
311          * bursting */
312         slave_count = internals->active_slave_count;
313         if (unlikely(slave_count < 1))
314                 return 0;
315
316         memcpy(slave_port_ids, internals->active_slaves,
317                         sizeof(slave_port_ids[0]) * slave_count);
318
319
320         dist_slave_count = 0;
321         for (i = 0; i < slave_count; i++) {
322                 struct port *port = &mode_8023ad_ports[slave_port_ids[i]];
323
324                 if (ACTOR_STATE(port, DISTRIBUTING))
325                         dist_slave_port_ids[dist_slave_count++] =
326                                         slave_port_ids[i];
327         }
328
329         if (unlikely(dist_slave_count < 1))
330                 return 0;
331
332         /*
333          * Populate the slave mbuf arrays with the packets to be sent on each
334          * slave, selecting the output slave using a hash based on the xmit policy
335          */
336         internals->burst_xmit_hash(bufs, nb_bufs, dist_slave_count,
337                         bufs_slave_port_idxs);
338
339         for (i = 0; i < nb_bufs; i++) {
340                 /* Populate slave mbuf arrays with mbufs for that slave. */
341                 uint8_t slave_idx = bufs_slave_port_idxs[i];
342
343                 slave_bufs[slave_idx][slave_nb_bufs[slave_idx]++] = bufs[i];
344         }
345
346
347         /* Send packet burst on each slave device */
348         for (i = 0; i < dist_slave_count; i++) {
349                 if (slave_nb_bufs[i] == 0)
350                         continue;
351
352                 slave_tx_count = rte_eth_tx_burst(dist_slave_port_ids[i],
353                                 bd_tx_q->queue_id, slave_bufs[i],
354                                 slave_nb_bufs[i]);
355
356                 total_tx_count += slave_tx_count;
357
358                 /* If tx burst fails move packets to end of bufs */
359                 if (unlikely(slave_tx_count < slave_nb_bufs[i])) {
360                         slave_tx_fail_count[i] = slave_nb_bufs[i] -
361                                         slave_tx_count;
362                         total_tx_fail_count += slave_tx_fail_count[i];
363
364                         /*
365                          * Shift bufs to beginning of array to allow reordering
366                          * later
367                          */
368                         for (j = 0; j < slave_tx_fail_count[i]; j++) {
369                                 slave_bufs[i][j] =
370                                         slave_bufs[i][slave_tx_count + j];
371                         }
372                 }
373         }
374
375         /*
376          * If there were tx burst failures we move the failed packets to the end
377          * of bufs to preserve the expected PMD behaviour that all packets which
378          * failed transmission are at the end of the input mbuf array
379          */
380         if (unlikely(total_tx_fail_count > 0)) {
381                 int bufs_idx = nb_bufs - total_tx_fail_count;
382
383                 for (i = 0; i < slave_count; i++) {
384                         if (slave_tx_fail_count[i] > 0) {
385                                 for (j = 0; j < slave_tx_fail_count[i]; j++)
386                                         bufs[bufs_idx++] = slave_bufs[i][j];
387                         }
388                 }
389         }
390
391         return total_tx_count;
392 }
393
394
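/*
 * Mode 4 receive burst without dedicated queues: packets are read from the
 * slaves round-robin; LACP and marker frames are handed to the mode 4 state
 * machines, and frames are dropped when the slave is not collecting or, in
 * non-promiscuous mode, the destination MAC matches neither the bond address
 * nor a multicast address.
 */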
395 static uint16_t
396 bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
397                 uint16_t nb_pkts)
398 {
399         /* Cast to structure containing the bonded device's port id and queue id */
400         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
401         struct bond_dev_private *internals = bd_rx_q->dev_private;
402         struct ether_addr bond_mac;
403
404         struct ether_hdr *hdr;
405
406         const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
407         uint16_t num_rx_total = 0;      /* Total number of received packets */
408         uint16_t slaves[RTE_MAX_ETHPORTS];
409         uint16_t slave_count, idx;
410
411         uint8_t collecting;  /* current slave collecting status */
412         const uint8_t promisc = internals->promiscuous_en;
413         uint8_t i, j, k;
414         uint8_t subtype;
415
416         rte_eth_macaddr_get(internals->port_id, &bond_mac);
417         /* Copy slave list to protect against slave up/down changes during rx
418          * bursting */
419         slave_count = internals->active_slave_count;
420         memcpy(slaves, internals->active_slaves,
421                         sizeof(internals->active_slaves[0]) * slave_count);
422
423         idx = internals->active_slave;
424         if (idx >= slave_count) {
425                 internals->active_slave = 0;
426                 idx = 0;
427         }
428         for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
429                 j = num_rx_total;
430                 collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
431                                          COLLECTING);
432
433                 /* Read packets from this slave */
434                 num_rx_total += rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id,
435                                 &bufs[num_rx_total], nb_pkts - num_rx_total);
436
437                 for (k = j; k < 2 && k < num_rx_total; k++)
438                         rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));
439
440                 /* Handle slow protocol packets. */
441                 while (j < num_rx_total) {
442
443                         /* Skip packets whose type is known to be more than pure L2;
                         * they cannot be slow frames */
444                         if ((bufs[j]->packet_type & ~RTE_PTYPE_L2_ETHER) != 0) {
445                                 j++;
446                                 continue;
447                         }
448
449                         if (j + 3 < num_rx_total)
450                                 rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *));
451
452                         hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
453                         subtype = ((struct slow_protocol_frame *)hdr)->slow_protocol.subtype;
454
455                         /* Remove the packet from the array if it is a slow packet, the slave
456                          * is not in collecting state, or the bonding interface is not in
457                          * promiscuous mode and the destination address does not match. */
458                         if (unlikely(is_lacp_packets(hdr->ether_type, subtype, bufs[j]) ||
459                                 !collecting || (!promisc &&
460                                         !is_multicast_ether_addr(&hdr->d_addr) &&
461                                         !is_same_ether_addr(&bond_mac, &hdr->d_addr)))) {
462
463                                 if (hdr->ether_type == ether_type_slow_be) {
464                                         bond_mode_8023ad_handle_slow_pkt(
465                                             internals, slaves[idx], bufs[j]);
466                                 } else
467                                         rte_pktmbuf_free(bufs[j]);
468
469                                 /* Packet is managed by mode 4 or dropped, shift the array */
470                                 num_rx_total--;
471                                 if (j < num_rx_total) {
472                                         memmove(&bufs[j], &bufs[j + 1], sizeof(bufs[0]) *
473                                                 (num_rx_total - j));
474                                 }
475                         } else
476                                 j++;
477                 }
478                 if (unlikely(++idx == slave_count))
479                         idx = 0;
480         }
481
482         internals->active_slave = idx;
483         return num_rx_total;
484 }
485
486 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
487 uint32_t burstnumberRX;
488 uint32_t burstnumberTX;
489
490 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
491
492 static void
493 arp_op_name(uint16_t arp_op, char *buf)
494 {
495         switch (arp_op) {
496         case ARP_OP_REQUEST:
497                 snprintf(buf, sizeof("ARP Request"), "%s", "ARP Request");
498                 return;
499         case ARP_OP_REPLY:
500                 snprintf(buf, sizeof("ARP Reply"), "%s", "ARP Reply");
501                 return;
502         case ARP_OP_REVREQUEST:
503                 snprintf(buf, sizeof("Reverse ARP Request"), "%s",
504                                 "Reverse ARP Request");
505                 return;
506         case ARP_OP_REVREPLY:
507                 snprintf(buf, sizeof("Reverse ARP Reply"), "%s",
508                                 "Reverse ARP Reply");
509                 return;
510         case ARP_OP_INVREQUEST:
511                 snprintf(buf, sizeof("Peer Identify Request"), "%s",
512                                 "Peer Identify Request");
513                 return;
514         case ARP_OP_INVREPLY:
515                 snprintf(buf, sizeof("Peer Identify Reply"), "%s",
516                                 "Peer Identify Reply");
517                 return;
518         default:
519                 break;
520         }
521         snprintf(buf, sizeof("Unknown"), "%s", "Unknown");
522         return;
523 }
524 #endif
525 #define MaxIPv4String   16
526 static void
527 ipv4_addr_to_dot(uint32_t be_ipv4_addr, char *buf, uint8_t buf_size)
528 {
529         uint32_t ipv4_addr;
530
531         ipv4_addr = rte_be_to_cpu_32(be_ipv4_addr);
532         snprintf(buf, buf_size, "%d.%d.%d.%d", (ipv4_addr >> 24) & 0xFF,
533                 (ipv4_addr >> 16) & 0xFF, (ipv4_addr >> 8) & 0xFF,
534                 ipv4_addr & 0xFF);
535 }
536
537 #define MAX_CLIENTS_NUMBER      128
538 uint8_t active_clients;
539 struct client_stats_t {
540         uint16_t port;
541         uint32_t ipv4_addr;
542         uint32_t ipv4_rx_packets;
543         uint32_t ipv4_tx_packets;
544 };
545 struct client_stats_t client_stats[MAX_CLIENTS_NUMBER];
546
547 static void
548 update_client_stats(uint32_t addr, uint16_t port, uint32_t *TXorRXindicator)
549 {
550         int i = 0;
551
552         for (; i < MAX_CLIENTS_NUMBER; i++)     {
553                 if ((client_stats[i].ipv4_addr == addr) && (client_stats[i].port == port))      {
554                         /* Just update the RX or TX packet count for this client */
555                         if (TXorRXindicator == &burstnumberRX)
556                                 client_stats[i].ipv4_rx_packets++;
557                         else
558                                 client_stats[i].ipv4_tx_packets++;
559                         return;
560                 }
561         }
562         /* We have a new client. Add it to the table and update its stats */
563         if (TXorRXindicator == &burstnumberRX)
564                 client_stats[active_clients].ipv4_rx_packets++;
565         else
566                 client_stats[active_clients].ipv4_tx_packets++;
567         client_stats[active_clients].ipv4_addr = addr;
568         client_stats[active_clients].port = port;
569         active_clients++;
570
571 }
572
573 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
574 #define MODE6_DEBUG(info, src_ip, dst_ip, eth_h, arp_op, port, burstnumber)     \
575                 RTE_LOG(DEBUG, PMD, \
576                 "%s " \
577                 "port:%d " \
578                 "SrcMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
579                 "SrcIP:%s " \
580                 "DstMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
581                 "DstIP:%s " \
582                 "%s " \
583                 "%d\n", \
584                 info, \
585                 port, \
586                 eth_h->s_addr.addr_bytes[0], \
587                 eth_h->s_addr.addr_bytes[1], \
588                 eth_h->s_addr.addr_bytes[2], \
589                 eth_h->s_addr.addr_bytes[3], \
590                 eth_h->s_addr.addr_bytes[4], \
591                 eth_h->s_addr.addr_bytes[5], \
592                 src_ip, \
593                 eth_h->d_addr.addr_bytes[0], \
594                 eth_h->d_addr.addr_bytes[1], \
595                 eth_h->d_addr.addr_bytes[2], \
596                 eth_h->d_addr.addr_bytes[3], \
597                 eth_h->d_addr.addr_bytes[4], \
598                 eth_h->d_addr.addr_bytes[5], \
599                 dst_ip, \
600                 arp_op, \
601                 ++burstnumber)
602 #endif
603
604 static void
605 mode6_debug(const char __attribute__((unused)) *info, struct ether_hdr *eth_h,
606                 uint16_t port, uint32_t __attribute__((unused)) *burstnumber)
607 {
608         struct ipv4_hdr *ipv4_h;
609 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
610         struct arp_hdr *arp_h;
611         char dst_ip[16];
612         char ArpOp[24];
613         char buf[16];
614 #endif
615         char src_ip[16];
616
617         uint16_t ether_type = eth_h->ether_type;
618         uint16_t offset = get_vlan_offset(eth_h, &ether_type);
619
620 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
621         strlcpy(buf, info, 16);
622 #endif
623
624         if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
625                 ipv4_h = (struct ipv4_hdr *)((char *)(eth_h + 1) + offset);
626                 ipv4_addr_to_dot(ipv4_h->src_addr, src_ip, MaxIPv4String);
627 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
628                 ipv4_addr_to_dot(ipv4_h->dst_addr, dst_ip, MaxIPv4String);
629                 MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, "", port, *burstnumber);
630 #endif
631                 update_client_stats(ipv4_h->src_addr, port, burstnumber);
632         }
633 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
634         else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
635                 arp_h = (struct arp_hdr *)((char *)(eth_h + 1) + offset);
636                 ipv4_addr_to_dot(arp_h->arp_data.arp_sip, src_ip, MaxIPv4String);
637                 ipv4_addr_to_dot(arp_h->arp_data.arp_tip, dst_ip, MaxIPv4String);
638                 arp_op_name(rte_be_to_cpu_16(arp_h->arp_op), ArpOp);
639                 MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, ArpOp, port, *burstnumber);
640         }
641 #endif
642 }
643 #endif
644
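/*
 * Mode 6 (ALB) receive burst: received ARP packets are additionally passed to
 * the ALB logic so the client table stays up to date.
 */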
645 static uint16_t
646 bond_ethdev_rx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
647 {
648         struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
649         struct bond_dev_private *internals = bd_tx_q->dev_private;
650         struct ether_hdr *eth_h;
651         uint16_t ether_type, offset;
652         uint16_t nb_recv_pkts;
653         int i;
654
655         nb_recv_pkts = bond_ethdev_rx_burst(queue, bufs, nb_pkts);
656
657         for (i = 0; i < nb_recv_pkts; i++) {
658                 eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
659                 ether_type = eth_h->ether_type;
660                 offset = get_vlan_offset(eth_h, &ether_type);
661
662                 if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
663 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
664                         mode6_debug("RX ARP:", eth_h, bufs[i]->port, &burstnumberRX);
665 #endif
666                         bond_mode_alb_arp_recv(eth_h, offset, internals);
667                 }
668 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
669                 else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4))
670                         mode6_debug("RX IPv4:", eth_h, bufs[i]->port, &burstnumberRX);
671 #endif
672         }
673
674         return nb_recv_pkts;
675 }
676
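/*
 * Round-robin transmit burst: spread the packets evenly over the active
 * slaves, continuing from the slave following the one used on the last call.
 */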
677 static uint16_t
678 bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs,
679                 uint16_t nb_pkts)
680 {
681         struct bond_dev_private *internals;
682         struct bond_tx_queue *bd_tx_q;
683
684         struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
685         uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
686
687         uint16_t num_of_slaves;
688         uint16_t slaves[RTE_MAX_ETHPORTS];
689
690         uint16_t num_tx_total = 0, num_tx_slave;
691
692         static int slave_idx = 0;
693         int i, cslave_idx = 0, tx_fail_total = 0;
694
695         bd_tx_q = (struct bond_tx_queue *)queue;
696         internals = bd_tx_q->dev_private;
697
698         /* Copy slave list to protect against slave up/down changes during tx
699          * bursting */
700         num_of_slaves = internals->active_slave_count;
701         memcpy(slaves, internals->active_slaves,
702                         sizeof(internals->active_slaves[0]) * num_of_slaves);
703
704         if (num_of_slaves < 1)
705                 return num_tx_total;
706
707         /* Distribute the packets among the slave tx buffers in round-robin order */
708         for (i = 0; i < nb_pkts; i++) {
709                 cslave_idx = (slave_idx + i) % num_of_slaves;
710                 slave_bufs[cslave_idx][(slave_nb_pkts[cslave_idx])++] = bufs[i];
711         }
712
713         /* increment current slave index so the next call to tx burst starts on the
714          * next slave */
715         slave_idx = ++cslave_idx;
716
717         /* Send packet burst on each slave device */
718         for (i = 0; i < num_of_slaves; i++) {
719                 if (slave_nb_pkts[i] > 0) {
720                         num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
721                                         slave_bufs[i], slave_nb_pkts[i]);
722
723                         /* if tx burst fails move packets to end of bufs */
724                         if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
725                                 int tx_fail_slave = slave_nb_pkts[i] - num_tx_slave;
726
727                                 tx_fail_total += tx_fail_slave;
728
729                                 memcpy(&bufs[nb_pkts - tx_fail_total],
730                                                 &slave_bufs[i][num_tx_slave],
731                                                 tx_fail_slave * sizeof(bufs[0]));
732                         }
733                         num_tx_total += num_tx_slave;
734                 }
735         }
736
737         return num_tx_total;
738 }
739
740 static uint16_t
741 bond_ethdev_tx_burst_active_backup(void *queue,
742                 struct rte_mbuf **bufs, uint16_t nb_pkts)
743 {
744         struct bond_dev_private *internals;
745         struct bond_tx_queue *bd_tx_q;
746
747         bd_tx_q = (struct bond_tx_queue *)queue;
748         internals = bd_tx_q->dev_private;
749
750         if (internals->active_slave_count < 1)
751                 return 0;
752
753         return rte_eth_tx_burst(internals->current_primary_port, bd_tx_q->queue_id,
754                         bufs, nb_pkts);
755 }
756
757 static inline uint16_t
758 ether_hash(struct ether_hdr *eth_hdr)
759 {
760         unaligned_uint16_t *word_src_addr =
761                 (unaligned_uint16_t *)eth_hdr->s_addr.addr_bytes;
762         unaligned_uint16_t *word_dst_addr =
763                 (unaligned_uint16_t *)eth_hdr->d_addr.addr_bytes;
764
765         return (word_src_addr[0] ^ word_dst_addr[0]) ^
766                         (word_src_addr[1] ^ word_dst_addr[1]) ^
767                         (word_src_addr[2] ^ word_dst_addr[2]);
768 }
769
770 static inline uint32_t
771 ipv4_hash(struct ipv4_hdr *ipv4_hdr)
772 {
773         return ipv4_hdr->src_addr ^ ipv4_hdr->dst_addr;
774 }
775
776 static inline uint32_t
777 ipv6_hash(struct ipv6_hdr *ipv6_hdr)
778 {
779         unaligned_uint32_t *word_src_addr =
780                 (unaligned_uint32_t *)&(ipv6_hdr->src_addr[0]);
781         unaligned_uint32_t *word_dst_addr =
782                 (unaligned_uint32_t *)&(ipv6_hdr->dst_addr[0]);
783
784         return (word_src_addr[0] ^ word_dst_addr[0]) ^
785                         (word_src_addr[1] ^ word_dst_addr[1]) ^
786                         (word_src_addr[2] ^ word_dst_addr[2]) ^
787                         (word_src_addr[3] ^ word_dst_addr[3]);
788 }
789
790
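/*
 * Level 2 transmit hash: map each mbuf to a slave index using only the
 * Ethernet source and destination addresses.
 */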
791 void
792 burst_xmit_l2_hash(struct rte_mbuf **buf, uint16_t nb_pkts,
793                 uint8_t slave_count, uint16_t *slaves)
794 {
795         struct ether_hdr *eth_hdr;
796         uint32_t hash;
797         int i;
798
799         for (i = 0; i < nb_pkts; i++) {
800                 eth_hdr = rte_pktmbuf_mtod(buf[i], struct ether_hdr *);
801
802                 hash = ether_hash(eth_hdr);
803
804                 slaves[i] = (hash ^= hash >> 8) % slave_count;
805         }
806 }
807
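/*
 * Level 2+3 transmit hash: combine the Ethernet address hash with an
 * IPv4/IPv6 address hash when an IP header is present.
 */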
808 void
809 burst_xmit_l23_hash(struct rte_mbuf **buf, uint16_t nb_pkts,
810                 uint8_t slave_count, uint16_t *slaves)
811 {
812         uint16_t i;
813         struct ether_hdr *eth_hdr;
814         uint16_t proto;
815         size_t vlan_offset;
816         uint32_t hash, l3hash;
817
818         for (i = 0; i < nb_pkts; i++) {
819                 eth_hdr = rte_pktmbuf_mtod(buf[i], struct ether_hdr *);
820                 l3hash = 0;
821
822                 proto = eth_hdr->ether_type;
823                 hash = ether_hash(eth_hdr);
824
825                 vlan_offset = get_vlan_offset(eth_hdr, &proto);
826
827                 if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
828                         struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
829                                         ((char *)(eth_hdr + 1) + vlan_offset);
830                         l3hash = ipv4_hash(ipv4_hdr);
831
832                 } else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
833                         struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
834                                         ((char *)(eth_hdr + 1) + vlan_offset);
835                         l3hash = ipv6_hash(ipv6_hdr);
836                 }
837
838                 hash = hash ^ l3hash;
839                 hash ^= hash >> 16;
840                 hash ^= hash >> 8;
841
842                 slaves[i] = hash % slave_count;
843         }
844 }
845
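/*
 * Level 3+4 transmit hash: combine the IP address hash with the TCP/UDP port
 * hash, skipping the L4 part for fragmented IPv4 packets.
 */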
846 void
847 burst_xmit_l34_hash(struct rte_mbuf **buf, uint16_t nb_pkts,
848                 uint8_t slave_count, uint16_t *slaves)
849 {
850         struct ether_hdr *eth_hdr;
851         uint16_t proto;
852         size_t vlan_offset;
853         int i;
854
855         struct udp_hdr *udp_hdr;
856         struct tcp_hdr *tcp_hdr;
857         uint32_t hash, l3hash, l4hash;
858
859         for (i = 0; i < nb_pkts; i++) {
860                 eth_hdr = rte_pktmbuf_mtod(buf[i], struct ether_hdr *);
861                 proto = eth_hdr->ether_type;
862                 vlan_offset = get_vlan_offset(eth_hdr, &proto);
863                 l3hash = 0;
864                 l4hash = 0;
865
866                 if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
867                         struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
868                                         ((char *)(eth_hdr + 1) + vlan_offset);
869                         size_t ip_hdr_offset;
870
871                         l3hash = ipv4_hash(ipv4_hdr);
872
873                         /* there is no L4 header in fragmented packet */
874                         if (likely(rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr)
875                                                                 == 0)) {
876                                 ip_hdr_offset = (ipv4_hdr->version_ihl
877                                         & IPV4_HDR_IHL_MASK) *
878                                         IPV4_IHL_MULTIPLIER;
879
880                                 if (ipv4_hdr->next_proto_id == IPPROTO_TCP) {
881                                         tcp_hdr = (struct tcp_hdr *)
882                                                 ((char *)ipv4_hdr +
883                                                         ip_hdr_offset);
884                                         l4hash = HASH_L4_PORTS(tcp_hdr);
885                                 } else if (ipv4_hdr->next_proto_id ==
886                                                                 IPPROTO_UDP) {
887                                         udp_hdr = (struct udp_hdr *)
888                                                 ((char *)ipv4_hdr +
889                                                         ip_hdr_offset);
890                                         l4hash = HASH_L4_PORTS(udp_hdr);
891                                 }
892                         }
893                 } else if  (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
894                         struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
895                                         ((char *)(eth_hdr + 1) + vlan_offset);
896                         l3hash = ipv6_hash(ipv6_hdr);
897
898                         if (ipv6_hdr->proto == IPPROTO_TCP) {
899                                 tcp_hdr = (struct tcp_hdr *)(ipv6_hdr + 1);
900                                 l4hash = HASH_L4_PORTS(tcp_hdr);
901                         } else if (ipv6_hdr->proto == IPPROTO_UDP) {
902                                 udp_hdr = (struct udp_hdr *)(ipv6_hdr + 1);
903                                 l4hash = HASH_L4_PORTS(udp_hdr);
904                         }
905                 }
906
907                 hash = l3hash ^ l4hash;
908                 hash ^= hash >> 16;
909                 hash ^= hash >> 8;
910
911                 slaves[i] = hash % slave_count;
912         }
913 }
914
915 struct bwg_slave {
916         uint64_t bwg_left_int;
917         uint64_t bwg_left_remainder;
918         uint8_t slave;
919 };
920
921 void
922 bond_tlb_activate_slave(struct bond_dev_private *internals) {
923         int i;
924
925         for (i = 0; i < internals->active_slave_count; i++) {
926                 tlb_last_obytets[internals->active_slaves[i]] = 0;
927         }
928 }
929
930 static int
931 bandwidth_cmp(const void *a, const void *b)
932 {
933         const struct bwg_slave *bwg_a = a;
934         const struct bwg_slave *bwg_b = b;
935         int64_t diff = (int64_t)bwg_b->bwg_left_int - (int64_t)bwg_a->bwg_left_int;
936         int64_t diff2 = (int64_t)bwg_b->bwg_left_remainder -
937                         (int64_t)bwg_a->bwg_left_remainder;
938         if (diff > 0)
939                 return 1;
940         else if (diff < 0)
941                 return -1;
942         else if (diff2 > 0)
943                 return 1;
944         else if (diff2 < 0)
945                 return -1;
946         else
947                 return 0;
948 }
949
950 static void
951 bandwidth_left(uint16_t port_id, uint64_t load, uint8_t update_idx,
952                 struct bwg_slave *bwg_slave)
953 {
954         struct rte_eth_link link_status;
955
956         rte_eth_link_get_nowait(port_id, &link_status);
957         uint64_t link_bwg = link_status.link_speed * 1000000ULL / 8;
958         if (link_bwg == 0)
959                 return;
960         link_bwg = link_bwg * (update_idx+1) * REORDER_PERIOD_MS;
961         bwg_slave->bwg_left_int = (link_bwg - 1000*load) / link_bwg;
962         bwg_slave->bwg_left_remainder = (link_bwg - 1000*load) % link_bwg;
963 }
964
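/*
 * Periodic alarm callback for TLB mode: estimate each active slave's
 * remaining bandwidth from its tx byte counters and re-sort the slave order
 * so the least loaded slaves are used first on transmit.
 */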
965 static void
966 bond_ethdev_update_tlb_slave_cb(void *arg)
967 {
968         struct bond_dev_private *internals = arg;
969         struct rte_eth_stats slave_stats;
970         struct bwg_slave bwg_array[RTE_MAX_ETHPORTS];
971         uint8_t slave_count;
972         uint64_t tx_bytes;
973
974         uint8_t update_stats = 0;
975         uint8_t i, slave_id;
976
977         internals->slave_update_idx++;
978
979
980         if (internals->slave_update_idx >= REORDER_PERIOD_MS)
981                 update_stats = 1;
982
983         for (i = 0; i < internals->active_slave_count; i++) {
984                 slave_id = internals->active_slaves[i];
985                 rte_eth_stats_get(slave_id, &slave_stats);
986                 tx_bytes = slave_stats.obytes - tlb_last_obytets[slave_id];
987                 bandwidth_left(slave_id, tx_bytes,
988                                 internals->slave_update_idx, &bwg_array[i]);
989                 bwg_array[i].slave = slave_id;
990
991                 if (update_stats) {
992                         tlb_last_obytets[slave_id] = slave_stats.obytes;
993                 }
994         }
995
996         if (update_stats == 1)
997                 internals->slave_update_idx = 0;
998
999         slave_count = i;
1000         qsort(bwg_array, slave_count, sizeof(bwg_array[0]), bandwidth_cmp);
1001         for (i = 0; i < slave_count; i++)
1002                 internals->tlb_slaves_order[i] = bwg_array[i].slave;
1003
1004         rte_eal_alarm_set(REORDER_PERIOD_MS * 1000, bond_ethdev_update_tlb_slave_cb,
1005                         (struct bond_dev_private *)internals);
1006 }
1007
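/*
 * TLB transmit burst: walk the slaves in bandwidth order and rewrite the
 * source MAC of packets carrying the primary's address to the MAC of the
 * slave actually used for transmission.
 */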
1008 static uint16_t
1009 bond_ethdev_tx_burst_tlb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
1010 {
1011         struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
1012         struct bond_dev_private *internals = bd_tx_q->dev_private;
1013
1014         struct rte_eth_dev *primary_port =
1015                         &rte_eth_devices[internals->primary_port];
1016         uint16_t num_tx_total = 0;
1017         uint16_t i, j;
1018
1019         uint16_t num_of_slaves = internals->active_slave_count;
1020         uint16_t slaves[RTE_MAX_ETHPORTS];
1021
1022         struct ether_hdr *ether_hdr;
1023         struct ether_addr primary_slave_addr;
1024         struct ether_addr active_slave_addr;
1025
1026         if (num_of_slaves < 1)
1027                 return num_tx_total;
1028
1029         memcpy(slaves, internals->tlb_slaves_order,
1030                                 sizeof(internals->tlb_slaves_order[0]) * num_of_slaves);
1031
1032
1033         ether_addr_copy(primary_port->data->mac_addrs, &primary_slave_addr);
1034
1035         if (nb_pkts > 3) {
1036                 for (i = 0; i < 3; i++)
1037                         rte_prefetch0(rte_pktmbuf_mtod(bufs[i], void*));
1038         }
1039
1040         for (i = 0; i < num_of_slaves; i++) {
1041                 rte_eth_macaddr_get(slaves[i], &active_slave_addr);
1042                 for (j = num_tx_total; j < nb_pkts; j++) {
1043                         if (j + 3 < nb_pkts)
1044                                 rte_prefetch0(rte_pktmbuf_mtod(bufs[j+3], void*));
1045
1046                         ether_hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
1047                         if (is_same_ether_addr(&ether_hdr->s_addr, &primary_slave_addr))
1048                                 ether_addr_copy(&active_slave_addr, &ether_hdr->s_addr);
1049 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
1050                                         mode6_debug("TX IPv4:", ether_hdr, slaves[i], &burstnumberTX);
1051 #endif
1052                 }
1053
1054                 num_tx_total += rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
1055                                 bufs + num_tx_total, nb_pkts - num_tx_total);
1056
1057                 if (num_tx_total == nb_pkts)
1058                         break;
1059         }
1060
1061         return num_tx_total;
1062 }
1063
1064 void
1065 bond_tlb_disable(struct bond_dev_private *internals)
1066 {
1067         rte_eal_alarm_cancel(bond_ethdev_update_tlb_slave_cb, internals);
1068 }
1069
1070 void
1071 bond_tlb_enable(struct bond_dev_private *internals)
1072 {
1073         bond_ethdev_update_tlb_slave_cb(internals);
1074 }
1075
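/*
 * Mode 6 (ALB) transmit burst: ARP packets are balanced per client by the ALB
 * logic, ARP update packets are generated when needed, and all other traffic
 * is sent using the TLB policy.
 */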
1076 static uint16_t
1077 bond_ethdev_tx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
1078 {
1079         struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
1080         struct bond_dev_private *internals = bd_tx_q->dev_private;
1081
1082         struct ether_hdr *eth_h;
1083         uint16_t ether_type, offset;
1084
1085         struct client_data *client_info;
1086
1087         /*
1088          * We create transmit buffers for every slave and one additional to send
1089          * through tlb. In the worst case every packet will be sent on one port.
1090          */
1091         struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS + 1][nb_pkts];
1092         uint16_t slave_bufs_pkts[RTE_MAX_ETHPORTS + 1] = { 0 };
1093
1094         /*
1095          * We create separate transmit buffers for update packets as they won't
1096          * be counted in num_tx_total.
1097          */
1098         struct rte_mbuf *update_bufs[RTE_MAX_ETHPORTS][ALB_HASH_TABLE_SIZE];
1099         uint16_t update_bufs_pkts[RTE_MAX_ETHPORTS] = { 0 };
1100
1101         struct rte_mbuf *upd_pkt;
1102         size_t pkt_size;
1103
1104         uint16_t num_send, num_not_send = 0;
1105         uint16_t num_tx_total = 0;
1106         uint16_t slave_idx;
1107
1108         int i, j;
1109
1110         /* Search tx buffer for ARP packets and forward them to alb */
1111         for (i = 0; i < nb_pkts; i++) {
1112                 eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
1113                 ether_type = eth_h->ether_type;
1114                 offset = get_vlan_offset(eth_h, &ether_type);
1115
1116                 if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
1117                         slave_idx = bond_mode_alb_arp_xmit(eth_h, offset, internals);
1118
1119                         /* Change src mac in eth header */
1120                         rte_eth_macaddr_get(slave_idx, &eth_h->s_addr);
1121
1122                         /* Add packet to slave tx buffer */
1123                         slave_bufs[slave_idx][slave_bufs_pkts[slave_idx]] = bufs[i];
1124                         slave_bufs_pkts[slave_idx]++;
1125                 } else {
1126                         /* If packet is not ARP, send it with TLB policy */
1127                         slave_bufs[RTE_MAX_ETHPORTS][slave_bufs_pkts[RTE_MAX_ETHPORTS]] =
1128                                         bufs[i];
1129                         slave_bufs_pkts[RTE_MAX_ETHPORTS]++;
1130                 }
1131         }
1132
1133         /* Update connected client ARP tables */
1134         if (internals->mode6.ntt) {
1135                 for (i = 0; i < ALB_HASH_TABLE_SIZE; i++) {
1136                         client_info = &internals->mode6.client_table[i];
1137
1138                         if (client_info->in_use) {
1139                                 /* Allocate new packet to send ARP update on current slave */
1140                                 upd_pkt = rte_pktmbuf_alloc(internals->mode6.mempool);
1141                                 if (upd_pkt == NULL) {
1142                                         RTE_LOG(ERR, PMD, "Failed to allocate ARP packet from pool\n");
1143                                         continue;
1144                                 }
1145                                 pkt_size = sizeof(struct ether_hdr) + sizeof(struct arp_hdr)
1146                                                 + client_info->vlan_count * sizeof(struct vlan_hdr);
1147                                 upd_pkt->data_len = pkt_size;
1148                                 upd_pkt->pkt_len = pkt_size;
1149
1150                                 slave_idx = bond_mode_alb_arp_upd(client_info, upd_pkt,
1151                                                 internals);
1152
1153                                 /* Add packet to update tx buffer */
1154                                 update_bufs[slave_idx][update_bufs_pkts[slave_idx]] = upd_pkt;
1155                                 update_bufs_pkts[slave_idx]++;
1156                         }
1157                 }
1158                 internals->mode6.ntt = 0;
1159         }
1160
1161         /* Send ARP packets on proper slaves */
1162         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1163                 if (slave_bufs_pkts[i] > 0) {
1164                         num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id,
1165                                         slave_bufs[i], slave_bufs_pkts[i]);
1166                         for (j = 0; j < slave_bufs_pkts[i] - num_send; j++) {
1167                                 bufs[nb_pkts - 1 - num_not_send - j] =
1168                                                 slave_bufs[i][nb_pkts - 1 - j];
1169                         }
1170
1171                         num_tx_total += num_send;
1172                         num_not_send += slave_bufs_pkts[i] - num_send;
1173
1174 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
1175         /* Print TX stats including update packets */
1176                         for (j = 0; j < slave_bufs_pkts[i]; j++) {
1177                                 eth_h = rte_pktmbuf_mtod(slave_bufs[i][j], struct ether_hdr *);
1178                                 mode6_debug("TX ARP:", eth_h, i, &burstnumberTX);
1179                         }
1180 #endif
1181                 }
1182         }
1183
1184         /* Send update packets on proper slaves */
1185         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1186                 if (update_bufs_pkts[i] > 0) {
1187                         num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id, update_bufs[i],
1188                                         update_bufs_pkts[i]);
1189                         for (j = num_send; j < update_bufs_pkts[i]; j++) {
1190                                 rte_pktmbuf_free(update_bufs[i][j]);
1191                         }
1192 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
1193                         for (j = 0; j < update_bufs_pkts[i]; j++) {
1194                                 eth_h = rte_pktmbuf_mtod(update_bufs[i][j], struct ether_hdr *);
1195                                 mode6_debug("TX ARPupd:", eth_h, i, &burstnumberTX);
1196                         }
1197 #endif
1198                 }
1199         }
1200
1201         /* Send non-ARP packets using tlb policy */
1202         if (slave_bufs_pkts[RTE_MAX_ETHPORTS] > 0) {
1203                 num_send = bond_ethdev_tx_burst_tlb(queue,
1204                                 slave_bufs[RTE_MAX_ETHPORTS],
1205                                 slave_bufs_pkts[RTE_MAX_ETHPORTS]);
1206
1207                 for (j = 0; j < slave_bufs_pkts[RTE_MAX_ETHPORTS]; j++) {
1208                         bufs[nb_pkts - 1 - num_not_send - j] =
1209                                         slave_bufs[RTE_MAX_ETHPORTS][nb_pkts - 1 - j];
1210                 }
1211
1212                 num_tx_total += num_send;
1213         }
1214
1215         return num_tx_total;
1216 }
1217
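/*
 * Balance (XOR) transmit burst: map each packet to a slave with the
 * configured transmit hash and send one burst per slave.
 */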
1218 static uint16_t
1219 bond_ethdev_tx_burst_balance(void *queue, struct rte_mbuf **bufs,
1220                 uint16_t nb_bufs)
1221 {
1222         struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
1223         struct bond_dev_private *internals = bd_tx_q->dev_private;
1224
1225         uint16_t slave_port_ids[RTE_MAX_ETHPORTS];
1226         uint16_t slave_count;
1227
1228         /* Array to sort mbufs for transmission on each slave into */
1229         struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_bufs];
1230         /* Number of mbufs for transmission on each slave */
1231         uint16_t slave_nb_bufs[RTE_MAX_ETHPORTS] = { 0 };
1232         /* Mapping array generated by hash function to map mbufs to slaves */
1233         uint16_t bufs_slave_port_idxs[nb_bufs];
1234
1235         uint16_t slave_tx_count, slave_tx_fail_count[RTE_MAX_ETHPORTS] = { 0 };
1236         uint16_t total_tx_count = 0, total_tx_fail_count = 0;
1237
1238         uint16_t i, j;
1239
1240         if (unlikely(nb_bufs == 0))
1241                 return 0;
1242
1243         /* Copy slave list to protect against slave up/down changes during tx
1244          * bursting */
1245         slave_count = internals->active_slave_count;
1246         if (unlikely(slave_count < 1))
1247                 return 0;
1248
1249         memcpy(slave_port_ids, internals->active_slaves,
1250                         sizeof(slave_port_ids[0]) * slave_count);
1251
1252         /*
1253          * Populate the slave mbuf arrays with the packets to be sent on each
1254          * slave, selecting the output slave using a hash based on the xmit policy
1255          */
1256         internals->burst_xmit_hash(bufs, nb_bufs, slave_count,
1257                         bufs_slave_port_idxs);
1258
1259         for (i = 0; i < nb_bufs; i++) {
1260                 /* Populate slave mbuf arrays with mbufs for that slave. */
1261                 uint8_t slave_idx = bufs_slave_port_idxs[i];
1262
1263                 slave_bufs[slave_idx][slave_nb_bufs[slave_idx]++] = bufs[i];
1264         }
1265
1266         /* Send packet burst on each slave device */
1267         for (i = 0; i < slave_count; i++) {
1268                 if (slave_nb_bufs[i] == 0)
1269                         continue;
1270
1271                 slave_tx_count = rte_eth_tx_burst(slave_port_ids[i],
1272                                 bd_tx_q->queue_id, slave_bufs[i],
1273                                 slave_nb_bufs[i]);
1274
1275                 total_tx_count += slave_tx_count;
1276
1277                 /* If tx burst fails move packets to end of bufs */
1278                 if (unlikely(slave_tx_count < slave_nb_bufs[i])) {
1279                         slave_tx_fail_count[i] = slave_nb_bufs[i] -
1280                                         slave_tx_count;
1281                         total_tx_fail_count += slave_tx_fail_count[i];
1282
1283                         /*
1284                          * Shift bufs to beginning of array to allow reordering
1285                          * later
1286                          */
1287                         for (j = 0; j < slave_tx_fail_count[i]; j++) {
1288                                 slave_bufs[i][j] =
1289                                         slave_bufs[i][slave_tx_count + j];
1290                         }
1291                 }
1292         }
1293
1294         /*
1295          * If there were tx burst failures we move the failed packets to the end
1296          * of bufs to preserve the expected PMD behaviour that all packets which
1297          * failed transmission are at the end of the input mbuf array
1298          */
1299         if (unlikely(total_tx_fail_count > 0)) {
1300                 int bufs_idx = nb_bufs - total_tx_fail_count;
1301
1302                 for (i = 0; i < slave_count; i++) {
1303                         if (slave_tx_fail_count[i] > 0) {
1304                                 for (j = 0; j < slave_tx_fail_count[i]; j++)
1305                                         bufs[bufs_idx++] = slave_bufs[i][j];
1306                         }
1307                 }
1308         }
1309
1310         return total_tx_count;
1311 }
1312
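/*
 * Mode 4 transmit burst without dedicated queues: hash data packets onto the
 * slaves in DISTRIBUTING state and also drain any pending LACP control frames
 * queued by the state machines.
 */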
1313 static uint16_t
1314 bond_ethdev_tx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
1315                 uint16_t nb_bufs)
1316 {
1317         struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
1318         struct bond_dev_private *internals = bd_tx_q->dev_private;
1319
1320         uint16_t slave_port_ids[RTE_MAX_ETHPORTS];
1321         uint16_t slave_count;
1322
1323         uint16_t dist_slave_port_ids[RTE_MAX_ETHPORTS];
1324         uint16_t dist_slave_count;
1325
1326         /* 2-D array to sort mbufs for transmission on each slave into */
1327         struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_bufs];
1328         /* Number of mbufs for transmission on each slave */
1329         uint16_t slave_nb_bufs[RTE_MAX_ETHPORTS] = { 0 };
1330         /* Mapping array generated by hash function to map mbufs to slaves */
1331         uint16_t bufs_slave_port_idxs[nb_bufs];
1332
1333         uint16_t slave_tx_count, slave_tx_fail_count[RTE_MAX_ETHPORTS] = { 0 };
1334         uint16_t total_tx_count = 0, total_tx_fail_count = 0;
1335
1336         uint16_t i, j;
1337
1338         if (unlikely(nb_bufs == 0))
1339                 return 0;
1340
1341         /* Copy slave list to protect against slave up/down changes during tx
1342          * bursting */
1343         slave_count = internals->active_slave_count;
1344         if (unlikely(slave_count < 1))
1345                 return 0;
1346
1347         memcpy(slave_port_ids, internals->active_slaves,
1348                         sizeof(slave_port_ids[0]) * slave_count);
1349
1350         dist_slave_count = 0;
1351         for (i = 0; i < slave_count; i++) {
1352                 struct port *port = &mode_8023ad_ports[slave_port_ids[i]];
1353
1354                 if (ACTOR_STATE(port, DISTRIBUTING))
1355                         dist_slave_port_ids[dist_slave_count++] =
1356                                         slave_port_ids[i];
1357         }
1358
1359         if (likely(dist_slave_count > 1)) {
1360
1361                 /*
1362                  * Populate the slave mbuf arrays with the packets to be sent on
1363                  * each slave, using a hash based on the xmit policy
1364                  */
1365                 internals->burst_xmit_hash(bufs, nb_bufs, dist_slave_count,
1366                                 bufs_slave_port_idxs);
1367
1368                 for (i = 0; i < nb_bufs; i++) {
1369                         /*
1370                          * Populate slave mbuf arrays with mbufs for that
1371                          * slave
1372                          */
1373                         uint8_t slave_idx = bufs_slave_port_idxs[i];
1374
1375                         slave_bufs[slave_idx][slave_nb_bufs[slave_idx]++] =
1376                                         bufs[i];
1377                 }
1378
1379
1380                 /* Send packet burst on each slave device */
1381                 for (i = 0; i < dist_slave_count; i++) {
1382                         if (slave_nb_bufs[i] == 0)
1383                                 continue;
1384
1385                         slave_tx_count = rte_eth_tx_burst(
1386                                         dist_slave_port_ids[i],
1387                                         bd_tx_q->queue_id, slave_bufs[i],
1388                                         slave_nb_bufs[i]);
1389
1390                         total_tx_count += slave_tx_count;
1391
1392                         /* If tx burst fails move packets to end of bufs */
1393                         if (unlikely(slave_tx_count < slave_nb_bufs[i])) {
1394                                 slave_tx_fail_count[i] = slave_nb_bufs[i] -
1395                                                 slave_tx_count;
1396                                 total_tx_fail_count += slave_tx_fail_count[i];
1397
1398                                 /*
1399                                  * Shift bufs to beginning of array to allow
1400                                  * reordering later
1401                                  */
1402                                 for (j = 0; j < slave_tx_fail_count[i]; j++)
1403                                         slave_bufs[i][j] =
1404                                                 slave_bufs[i]
1405                                                         [slave_tx_count
1406                                                         + j];
1407                         }
1408                 }
1409
1410                 /*
1411                  * If there were any tx burst failures, move the failed mbufs
1412                  * to the end of bufs to preserve the expected PMD behaviour
1413                  * that all untransmitted packets are at the end of the array
1414                  */
1415                 if (unlikely(total_tx_fail_count > 0)) {
1416                         int bufs_idx = nb_bufs - total_tx_fail_count;
1417
1418                         for (i = 0; i < slave_count; i++) {
1419                                 if (slave_tx_fail_count[i] > 0) {
1420                                         for (j = 0;
1421                                                 j < slave_tx_fail_count[i];
1422                                                 j++) {
1423                                                 bufs[bufs_idx++] =
1424                                                         slave_bufs[i][j];
1425                                         }
1426                                 }
1427                         }
1428                 }
1429         }
1430
1431         /* Check for LACP control packets and send if available */
1432         for (i = 0; i < slave_count; i++) {
1433                 struct port *port = &mode_8023ad_ports[slave_port_ids[i]];
1434                 struct rte_mbuf *ctrl_pkt = NULL;
1435
1436                 if (likely(rte_ring_empty(port->tx_ring)))
1437                         continue;
1438
1439                 if (rte_ring_dequeue(port->tx_ring,
1440                                      (void **)&ctrl_pkt) != -ENOENT) {
1441                         slave_tx_count = rte_eth_tx_burst(slave_port_ids[i],
1442                                         bd_tx_q->queue_id, &ctrl_pkt, 1);
1443                         /*
1444                          * re-enqueue LAG control plane packets to buffering
1445                          * ring if transmission fails so the packet isn't lost.
1446                          */
1447                         if (slave_tx_count != 1)
1448                                 rte_ring_enqueue(port->tx_ring, ctrl_pkt);
1449                 }
1450         }
1451
1452         return total_tx_count;
1453 }
1454
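/*
 * Broadcast mode tx burst handler: bumps the mbuf reference counts and
 * transmits the full burst on every active slave, then frees the copies that
 * could not be sent on the less successful slaves.
 */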
1455 static uint16_t
1456 bond_ethdev_tx_burst_broadcast(void *queue, struct rte_mbuf **bufs,
1457                 uint16_t nb_pkts)
1458 {
1459         struct bond_dev_private *internals;
1460         struct bond_tx_queue *bd_tx_q;
1461
1462         uint8_t tx_failed_flag = 0, num_of_slaves;
1463         uint16_t slaves[RTE_MAX_ETHPORTS];
1464
1465         uint16_t max_nb_of_tx_pkts = 0;
1466
1467         int slave_tx_total[RTE_MAX_ETHPORTS];
1468         int i, most_successful_tx_slave = -1;
1469
1470         bd_tx_q = (struct bond_tx_queue *)queue;
1471         internals = bd_tx_q->dev_private;
1472
1473         /* Copy slave list to protect against slave up/down changes during tx
1474          * bursting */
1475         num_of_slaves = internals->active_slave_count;
1476         memcpy(slaves, internals->active_slaves,
1477                         sizeof(internals->active_slaves[0]) * num_of_slaves);
1478
1479         if (num_of_slaves < 1)
1480                 return 0;
1481
1482         /* Increment reference count on mbufs */
1483         for (i = 0; i < nb_pkts; i++)
1484                 rte_mbuf_refcnt_update(bufs[i], num_of_slaves - 1);
1485
1486         /* Transmit burst on each active slave */
1487         for (i = 0; i < num_of_slaves; i++) {
1488                 slave_tx_total[i] = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
1489                                         bufs, nb_pkts);
1490
1491                 if (unlikely(slave_tx_total[i] < nb_pkts))
1492                         tx_failed_flag = 1;
1493
1494                 /* Record the count and index of the slave which transmitted
1495                  * the maximum number of packets */
1496                 if (slave_tx_total[i] > max_nb_of_tx_pkts) {
1497                         max_nb_of_tx_pkts = slave_tx_total[i];
1498                         most_successful_tx_slave = i;
1499                 }
1500         }
1501
1502         /* If any slave fails to transmit the full burst, the calling application
1503          * is not expected to know about the multiple mbuf references, so we must
1504          * free the failed packets of every slave except the most successful one
1505          */
1506         if (unlikely(tx_failed_flag))
1507                 for (i = 0; i < num_of_slaves; i++)
1508                         if (i != most_successful_tx_slave)
1509                                 while (slave_tx_total[i] < nb_pkts)
1510                                         rte_pktmbuf_free(bufs[slave_tx_total[i]++]);
1511
1512         return max_nb_of_tx_pkts;
1513 }
1514
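/*
 * Record the link properties (speed/duplex/autoneg) of the first slave when
 * running in mode 4; in all other modes the bonded device advertises
 * autonegotiated full-duplex defaults.
 */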
1515 void
1516 link_properties_set(struct rte_eth_dev *ethdev, struct rte_eth_link *slave_link)
1517 {
1518         struct bond_dev_private *bond_ctx = ethdev->data->dev_private;
1519
1520         if (bond_ctx->mode == BONDING_MODE_8023AD) {
1521                 /**
1522                  * If in mode 4 then save the link properties of the first
1523                  * slave, all subsequent slaves must match these properties
1524                  */
1525                 struct rte_eth_link *bond_link = &bond_ctx->mode4.slave_link;
1526
1527                 bond_link->link_autoneg = slave_link->link_autoneg;
1528                 bond_link->link_duplex = slave_link->link_duplex;
1529                 bond_link->link_speed = slave_link->link_speed;
1530         } else {
1531                 /**
1532                  * In any other mode the link properties are set to default
1533                  * values of AUTONEG/DUPLEX
1534                  */
1535                 ethdev->data->dev_link.link_autoneg = ETH_LINK_AUTONEG;
1536                 ethdev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
1537         }
1538 }
1539
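/*
 * In mode 4, check that a slave's link properties match those recorded from
 * the first slave; returns 0 on match (or in any other mode), -1 otherwise.
 */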
1540 int
1541 link_properties_valid(struct rte_eth_dev *ethdev,
1542                 struct rte_eth_link *slave_link)
1543 {
1544         struct bond_dev_private *bond_ctx = ethdev->data->dev_private;
1545
1546         if (bond_ctx->mode == BONDING_MODE_8023AD) {
1547                 struct rte_eth_link *bond_link = &bond_ctx->mode4.slave_link;
1548
1549                 if (bond_link->link_duplex != slave_link->link_duplex ||
1550                         bond_link->link_autoneg != slave_link->link_autoneg ||
1551                         bond_link->link_speed != slave_link->link_speed)
1552                         return -1;
1553         }
1554
1555         return 0;
1556 }
1557
1558 int
1559 mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr)
1560 {
1561         struct ether_addr *mac_addr;
1562
1563         if (eth_dev == NULL) {
1564                 RTE_LOG(ERR, PMD, "%s: NULL pointer eth_dev specified\n", __func__);
1565                 return -1;
1566         }
1567
1568         if (dst_mac_addr == NULL) {
1569                 RTE_LOG(ERR, PMD, "%s: NULL pointer MAC specified\n", __func__);
1570                 return -1;
1571         }
1572
1573         mac_addr = eth_dev->data->mac_addrs;
1574
1575         ether_addr_copy(mac_addr, dst_mac_addr);
1576         return 0;
1577 }
1578
1579 int
1580 mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr)
1581 {
1582         struct ether_addr *mac_addr;
1583
1584         if (eth_dev == NULL) {
1585                 RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified");
1586                 return -1;
1587         }
1588
1589         if (new_mac_addr == NULL) {
1590                 RTE_BOND_LOG(ERR, "NULL pointer MAC specified");
1591                 return -1;
1592         }
1593
1594         mac_addr = eth_dev->data->mac_addrs;
1595
1596         /* If new MAC is different to current MAC then update */
1597         if (memcmp(mac_addr, new_mac_addr, sizeof(*mac_addr)) != 0)
1598                 memcpy(mac_addr, new_mac_addr, sizeof(*mac_addr));
1599
1600         return 0;
1601 }
1602
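/*
 * Push the appropriate MAC address to every slave: the bonded MAC in the
 * modes that share it, the mode 4 helper for 802.3ad, and otherwise the
 * bonded MAC on the primary slave with the persisted MAC restored on the rest.
 */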
1603 int
1604 mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev)
1605 {
1606         struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1607         int i;
1608
1609         /* Update slave devices MAC addresses */
1610         if (internals->slave_count < 1)
1611                 return -1;
1612
1613         switch (internals->mode) {
1614         case BONDING_MODE_ROUND_ROBIN:
1615         case BONDING_MODE_BALANCE:
1616         case BONDING_MODE_BROADCAST:
1617                 for (i = 0; i < internals->slave_count; i++) {
1618                         if (rte_eth_dev_default_mac_addr_set(
1619                                         internals->slaves[i].port_id,
1620                                         bonded_eth_dev->data->mac_addrs)) {
1621                                 RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1622                                                 internals->slaves[i].port_id);
1623                                 return -1;
1624                         }
1625                 }
1626                 break;
1627         case BONDING_MODE_8023AD:
1628                 bond_mode_8023ad_mac_address_update(bonded_eth_dev);
1629                 break;
1630         case BONDING_MODE_ACTIVE_BACKUP:
1631         case BONDING_MODE_TLB:
1632         case BONDING_MODE_ALB:
1633         default:
1634                 for (i = 0; i < internals->slave_count; i++) {
1635                         if (internals->slaves[i].port_id ==
1636                                         internals->current_primary_port) {
1637                                 if (rte_eth_dev_default_mac_addr_set(
1638                                                 internals->current_primary_port,
1639                                                 bonded_eth_dev->data->mac_addrs)) {
1640                                         RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1641                                                         internals->current_primary_port);
1642                                         return -1;
1643                                 }
1644                         } else {
1645                                 if (rte_eth_dev_default_mac_addr_set(
1646                                                 internals->slaves[i].port_id,
1647                                                 &internals->slaves[i].persisted_mac_addr)) {
1648                                         RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1649                                                         internals->slaves[i].port_id);
1650                                         return -1;
1651                                 }
1652                         }
1653                 }
1654         }
1655
1656         return 0;
1657 }
1658
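/*
 * Select the rx/tx burst handlers that match the requested bonding mode and
 * enable any mode specific state (802.3ad, ALB) before recording the mode.
 */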
1659 int
1660 bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode)
1661 {
1662         struct bond_dev_private *internals;
1663
1664         internals = eth_dev->data->dev_private;
1665
1666         switch (mode) {
1667         case BONDING_MODE_ROUND_ROBIN:
1668                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_round_robin;
1669                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1670                 break;
1671         case BONDING_MODE_ACTIVE_BACKUP:
1672                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_active_backup;
1673                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
1674                 break;
1675         case BONDING_MODE_BALANCE:
1676                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_balance;
1677                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1678                 break;
1679         case BONDING_MODE_BROADCAST:
1680                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_broadcast;
1681                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1682                 break;
1683         case BONDING_MODE_8023AD:
1684                 if (bond_mode_8023ad_enable(eth_dev) != 0)
1685                         return -1;
1686
1687                 if (internals->mode4.dedicated_queues.enabled == 0) {
1688                         eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_8023ad;
1689                         eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_8023ad;
1690                         RTE_LOG(WARNING, PMD,
1691                                 "Using mode 4, it is necessary to do TX burst "
1692                                 "and RX burst at least every 100ms.\n");
1693                 } else {
1694                         /* Use flow director's optimization */
1695                         eth_dev->rx_pkt_burst =
1696                                         bond_ethdev_rx_burst_8023ad_fast_queue;
1697                         eth_dev->tx_pkt_burst =
1698                                         bond_ethdev_tx_burst_8023ad_fast_queue;
1699                 }
1700                 break;
1701         case BONDING_MODE_TLB:
1702                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_tlb;
1703                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
1704                 break;
1705         case BONDING_MODE_ALB:
1706                 if (bond_mode_alb_enable(eth_dev) != 0)
1707                         return -1;
1708
1709                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_alb;
1710                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_alb;
1711                 break;
1712         default:
1713                 return -1;
1714         }
1715
1716         internals->mode = mode;
1717
1718         return 0;
1719 }
1720
1721
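/*
 * Create the slave's slow (LACP) mempool if needed and, when dedicated
 * queues are enabled, set up the extra rx/tx queue reserved for LACPDUs.
 */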
1722 static int
1723 slave_configure_slow_queue(struct rte_eth_dev *bonded_eth_dev,
1724                 struct rte_eth_dev *slave_eth_dev)
1725 {
1726         int errval = 0;
1727         struct bond_dev_private *internals = (struct bond_dev_private *)
1728                 bonded_eth_dev->data->dev_private;
1729         struct port *port = &mode_8023ad_ports[slave_eth_dev->data->port_id];
1730
1731         if (port->slow_pool == NULL) {
1732                 char mem_name[256];
1733                 int slave_id = slave_eth_dev->data->port_id;
1734
1735                 snprintf(mem_name, RTE_DIM(mem_name), "slave_port%u_slow_pool",
1736                                 slave_id);
1737                 port->slow_pool = rte_pktmbuf_pool_create(mem_name, 8191,
1738                         250, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
1739                         slave_eth_dev->data->numa_node);
1740
1741                 /* Any memory allocation failure in initialization is critical because
1742                  * resources can't be freed, so reinitialization is impossible. */
1743                 if (port->slow_pool == NULL) {
1744                         rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
1745                                 slave_id, mem_name, rte_strerror(rte_errno));
1746                 }
1747         }
1748
1749         if (internals->mode4.dedicated_queues.enabled == 1) {
1750                 /* Configure slow Rx queue */
1751
1752                 errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id,
1753                                 internals->mode4.dedicated_queues.rx_qid, 128,
1754                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1755                                 NULL, port->slow_pool);
1756                 if (errval != 0) {
1757                         RTE_BOND_LOG(ERR,
1758                                         "rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)",
1759                                         slave_eth_dev->data->port_id,
1760                                         internals->mode4.dedicated_queues.rx_qid,
1761                                         errval);
1762                         return errval;
1763                 }
1764
1765                 errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id,
1766                                 internals->mode4.dedicated_queues.tx_qid, 512,
1767                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1768                                 NULL);
1769                 if (errval != 0) {
1770                         RTE_BOND_LOG(ERR,
1771                                 "rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)",
1772                                 slave_eth_dev->data->port_id,
1773                                 internals->mode4.dedicated_queues.tx_qid,
1774                                 errval);
1775                         return errval;
1776                 }
1777         }
1778         return 0;
1779 }
1780
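/*
 * Stop and reconfigure a slave to mirror the bonded device's configuration
 * (RSS, VLAN filtering, MTU, queue setup, optional dedicated LACP queues and
 * flow rules), then restart it and resynchronise its RETA and link status.
 */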
1781 int
1782 slave_configure(struct rte_eth_dev *bonded_eth_dev,
1783                 struct rte_eth_dev *slave_eth_dev)
1784 {
1785         struct bond_rx_queue *bd_rx_q;
1786         struct bond_tx_queue *bd_tx_q;
1787         uint16_t nb_rx_queues;
1788         uint16_t nb_tx_queues;
1789
1790         int errval;
1791         uint16_t q_id;
1792         struct rte_flow_error flow_error;
1793
1794         struct bond_dev_private *internals = (struct bond_dev_private *)
1795                 bonded_eth_dev->data->dev_private;
1796
1797         /* Stop slave */
1798         rte_eth_dev_stop(slave_eth_dev->data->port_id);
1799
1800         /* Enable interrupts on slave device if supported */
1801         if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1802                 slave_eth_dev->data->dev_conf.intr_conf.lsc = 1;
1803
1804         /* If RSS is enabled for bonding, try to enable it for slaves  */
1805         if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) {
1806                 if (bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len
1807                                 != 0) {
1808                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len =
1809                                         bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len;
1810                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key =
1811                                         bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1812                 } else {
1813                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = NULL;
1814                 }
1815
1816                 slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf =
1817                                 bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1818                 slave_eth_dev->data->dev_conf.rxmode.mq_mode =
1819                                 bonded_eth_dev->data->dev_conf.rxmode.mq_mode;
1820         }
1821
1822         if (bonded_eth_dev->data->dev_conf.rxmode.offloads &
1823                         DEV_RX_OFFLOAD_VLAN_FILTER)
1824                 slave_eth_dev->data->dev_conf.rxmode.offloads |=
1825                                 DEV_RX_OFFLOAD_VLAN_FILTER;
1826         else
1827                 slave_eth_dev->data->dev_conf.rxmode.offloads &=
1828                                 ~DEV_RX_OFFLOAD_VLAN_FILTER;
1829
1830         nb_rx_queues = bonded_eth_dev->data->nb_rx_queues;
1831         nb_tx_queues = bonded_eth_dev->data->nb_tx_queues;
1832
1833         if (internals->mode == BONDING_MODE_8023AD) {
1834                 if (internals->mode4.dedicated_queues.enabled == 1) {
1835                         nb_rx_queues++;
1836                         nb_tx_queues++;
1837                 }
1838         }
1839
1840         errval = rte_eth_dev_set_mtu(slave_eth_dev->data->port_id,
1841                                      bonded_eth_dev->data->mtu);
1842         if (errval != 0 && errval != -ENOTSUP) {
1843                 RTE_BOND_LOG(ERR, "rte_eth_dev_set_mtu: port %u, err (%d)",
1844                                 slave_eth_dev->data->port_id, errval);
1845                 return errval;
1846         }
1847
1848         /* Configure device */
1849         errval = rte_eth_dev_configure(slave_eth_dev->data->port_id,
1850                         nb_rx_queues, nb_tx_queues,
1851                         &(slave_eth_dev->data->dev_conf));
1852         if (errval != 0) {
1853                 RTE_BOND_LOG(ERR, "Cannot configure slave device: port %u , err (%d)",
1854                                 slave_eth_dev->data->port_id, errval);
1855                 return errval;
1856         }
1857
1858         /* Setup Rx Queues */
1859         for (q_id = 0; q_id < bonded_eth_dev->data->nb_rx_queues; q_id++) {
1860                 bd_rx_q = (struct bond_rx_queue *)bonded_eth_dev->data->rx_queues[q_id];
1861
1862                 errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id, q_id,
1863                                 bd_rx_q->nb_rx_desc,
1864                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1865                                 &(bd_rx_q->rx_conf), bd_rx_q->mb_pool);
1866                 if (errval != 0) {
1867                         RTE_BOND_LOG(ERR,
1868                                         "rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)",
1869                                         slave_eth_dev->data->port_id, q_id, errval);
1870                         return errval;
1871                 }
1872         }
1873
1874         /* Setup Tx Queues */
1875         for (q_id = 0; q_id < bonded_eth_dev->data->nb_tx_queues; q_id++) {
1876                 bd_tx_q = (struct bond_tx_queue *)bonded_eth_dev->data->tx_queues[q_id];
1877
1878                 errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id, q_id,
1879                                 bd_tx_q->nb_tx_desc,
1880                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1881                                 &bd_tx_q->tx_conf);
1882                 if (errval != 0) {
1883                         RTE_BOND_LOG(ERR,
1884                                 "rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)",
1885                                 slave_eth_dev->data->port_id, q_id, errval);
1886                         return errval;
1887                 }
1888         }
1889
1890         if (internals->mode == BONDING_MODE_8023AD &&
1891                         internals->mode4.dedicated_queues.enabled == 1) {
1892                 errval = slave_configure_slow_queue(bonded_eth_dev, slave_eth_dev);
1893                 if (errval != 0)
1894                         return errval;
1895
1896                 if (bond_ethdev_8023ad_flow_verify(bonded_eth_dev,
1897                                 slave_eth_dev->data->port_id) != 0) {
1898                         RTE_BOND_LOG(ERR,
1899                                 "bond_ethdev_8023ad_flow_verify: port=%d",
1900                                 slave_eth_dev->data->port_id);
1901                         return -1;
1902                 }
1903
1904                 if (internals->mode4.dedicated_queues.flow[slave_eth_dev->data->port_id] != NULL)
1905                         rte_flow_destroy(slave_eth_dev->data->port_id,
1906                                         internals->mode4.dedicated_queues.flow[slave_eth_dev->data->port_id],
1907                                         &flow_error);
1908
1909                 bond_ethdev_8023ad_flow_set(bonded_eth_dev,
1910                                 slave_eth_dev->data->port_id);
1911         }
1912
1913         /* Start device */
1914         errval = rte_eth_dev_start(slave_eth_dev->data->port_id);
1915         if (errval != 0) {
1916                 RTE_BOND_LOG(ERR, "rte_eth_dev_start: port=%u, err (%d)",
1917                                 slave_eth_dev->data->port_id, errval);
1918                 return -1;
1919         }
1920
1921         /* If RSS is enabled for bonding, synchronize RETA */
1922         if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
1923                 int i;
1924                 struct bond_dev_private *internals;
1925
1926                 internals = bonded_eth_dev->data->dev_private;
1927
1928                 for (i = 0; i < internals->slave_count; i++) {
1929                         if (internals->slaves[i].port_id == slave_eth_dev->data->port_id) {
1930                                 errval = rte_eth_dev_rss_reta_update(
1931                                                 slave_eth_dev->data->port_id,
1932                                                 &internals->reta_conf[0],
1933                                                 internals->slaves[i].reta_size);
1934                                 if (errval != 0) {
1935                                         RTE_LOG(WARNING, PMD,
1936                                                         "rte_eth_dev_rss_reta_update on slave port %d fails (err %d)."
1937                                                         " RSS Configuration for bonding may be inconsistent.\n",
1938                                                         slave_eth_dev->data->port_id, errval);
1939                                 }
1940                                 break;
1941                         }
1942                 }
1943         }
1944
1945         /* If lsc interrupt is set, check initial slave's link status */
1946         if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) {
1947                 slave_eth_dev->dev_ops->link_update(slave_eth_dev, 0);
1948                 bond_ethdev_lsc_event_callback(slave_eth_dev->data->port_id,
1949                         RTE_ETH_EVENT_INTR_LSC, &bonded_eth_dev->data->port_id,
1950                         NULL);
1951         }
1952
1953         return 0;
1954 }
1955
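/*
 * Remove a slave from the internal slave table, dropping its per-slave flow
 * entries and compacting the arrays, then reset the slave device so it must
 * be reconfigured before reuse.
 */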
1956 void
1957 slave_remove(struct bond_dev_private *internals,
1958                 struct rte_eth_dev *slave_eth_dev)
1959 {
1960         uint8_t i;
1961
1962         for (i = 0; i < internals->slave_count; i++)
1963                 if (internals->slaves[i].port_id ==
1964                                 slave_eth_dev->data->port_id)
1965                         break;
1966
1967         if (i < (internals->slave_count - 1)) {
1968                 struct rte_flow *flow;
1969
1970                 memmove(&internals->slaves[i], &internals->slaves[i + 1],
1971                                 sizeof(internals->slaves[0]) *
1972                                 (internals->slave_count - i - 1));
1973                 TAILQ_FOREACH(flow, &internals->flow_list, next) {
1974                         memmove(&flow->flows[i], &flow->flows[i + 1],
1975                                 sizeof(flow->flows[0]) *
1976                                 (internals->slave_count - i - 1));
1977                         flow->flows[internals->slave_count - 1] = NULL;
1978                 }
1979         }
1980
1981         internals->slave_count--;
1982
1983         /* force reconfiguration of slave interfaces */
1984         _rte_eth_dev_reset(slave_eth_dev);
1985 }
1986
1987 static void
1988 bond_ethdev_slave_link_status_change_monitor(void *cb_arg);
1989
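/*
 * Append a slave to the internal slave table, noting whether its link status
 * must be polled and saving its current MAC address for later restoration.
 */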
1990 void
1991 slave_add(struct bond_dev_private *internals,
1992                 struct rte_eth_dev *slave_eth_dev)
1993 {
1994         struct bond_slave_details *slave_details =
1995                         &internals->slaves[internals->slave_count];
1996
1997         slave_details->port_id = slave_eth_dev->data->port_id;
1998         slave_details->last_link_status = 0;
1999
2000         /* Mark slave devices that don't support interrupts so we can
2001          * compensate when we start the bond
2002          */
2003         if (!(slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) {
2004                 slave_details->link_status_poll_enabled = 1;
2005         }
2006
2007         slave_details->link_status_wait_to_complete = 0;
2008         /* Save the slave's current MAC address so it can be restored later */
2009         memcpy(&(slave_details->persisted_mac_addr), slave_eth_dev->data->mac_addrs,
2010                         sizeof(struct ether_addr));
2011 }
2012
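/*
 * Set the current primary port, accepting the proposed port directly when no
 * slaves are active and otherwise only if it is in the active slave list.
 */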
2013 void
2014 bond_ethdev_primary_set(struct bond_dev_private *internals,
2015                 uint16_t slave_port_id)
2016 {
2017         int i;
2018
2019         if (internals->active_slave_count < 1)
2020                 internals->current_primary_port = slave_port_id;
2021         else
2022                 /* Search bonded device slave ports for new proposed primary port */
2023                 for (i = 0; i < internals->active_slave_count; i++) {
2024                         if (internals->active_slaves[i] == slave_port_id)
2025                                 internals->current_primary_port = slave_port_id;
2026                 }
2027 }
2028
2029 static void
2030 bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev);
2031
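/*
 * Start the bonded device: apply the MAC address, reconfigure and start every
 * slave, arm link status polling if any slave lacks LSC interrupts, and kick
 * off the mode specific machinery (802.3ad state machines, TLB/ALB).
 */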
2032 static int
2033 bond_ethdev_start(struct rte_eth_dev *eth_dev)
2034 {
2035         struct bond_dev_private *internals;
2036         int i;
2037
2038         /* slave eth dev will be started by bonded device */
2039         if (check_for_bonded_ethdev(eth_dev)) {
2040                 RTE_BOND_LOG(ERR, "User tried to explicitly start a slave eth_dev (%d)",
2041                                 eth_dev->data->port_id);
2042                 return -1;
2043         }
2044
2045         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
2046         eth_dev->data->dev_started = 1;
2047
2048         internals = eth_dev->data->dev_private;
2049
2050         if (internals->slave_count == 0) {
2051                 RTE_BOND_LOG(ERR, "Cannot start port since there are no slave devices");
2052                 goto out_err;
2053         }
2054
2055         if (internals->user_defined_mac == 0) {
2056                 struct ether_addr *new_mac_addr = NULL;
2057
2058                 for (i = 0; i < internals->slave_count; i++)
2059                         if (internals->slaves[i].port_id == internals->primary_port)
2060                                 new_mac_addr = &internals->slaves[i].persisted_mac_addr;
2061
2062                 if (new_mac_addr == NULL)
2063                         goto out_err;
2064
2065                 if (mac_address_set(eth_dev, new_mac_addr) != 0) {
2066                         RTE_BOND_LOG(ERR, "bonded port (%d) failed to update MAC address",
2067                                         eth_dev->data->port_id);
2068                         goto out_err;
2069                 }
2070         }
2071
2072         /* Update all slave devices' MAC addresses */
2073         if (mac_address_slaves_update(eth_dev) != 0)
2074                 goto out_err;
2075
2076         /* If bonded device is configured in promiscuous mode then re-apply config */
2077         if (internals->promiscuous_en)
2078                 bond_ethdev_promiscuous_enable(eth_dev);
2079
2080         if (internals->mode == BONDING_MODE_8023AD) {
2081                 if (internals->mode4.dedicated_queues.enabled == 1) {
2082                         internals->mode4.dedicated_queues.rx_qid =
2083                                         eth_dev->data->nb_rx_queues;
2084                         internals->mode4.dedicated_queues.tx_qid =
2085                                         eth_dev->data->nb_tx_queues;
2086                 }
2087         }
2088
2089
2090         /* Reconfigure each slave device if starting bonded device */
2091         for (i = 0; i < internals->slave_count; i++) {
2092                 struct rte_eth_dev *slave_ethdev =
2093                                 &(rte_eth_devices[internals->slaves[i].port_id]);
2094                 if (slave_configure(eth_dev, slave_ethdev) != 0) {
2095                         RTE_BOND_LOG(ERR,
2096                                 "bonded port (%d) failed to reconfigure slave device (%d)",
2097                                 eth_dev->data->port_id,
2098                                 internals->slaves[i].port_id);
2099                         goto out_err;
2100                 }
2101                 /* We will need to poll for link status if any slave doesn't
2102                  * support interrupts
2103                  */
2104                 if (internals->slaves[i].link_status_poll_enabled)
2105                         internals->link_status_polling_enabled = 1;
2106         }
2107
2108         /* start polling if needed */
2109         if (internals->link_status_polling_enabled) {
2110                 rte_eal_alarm_set(
2111                         internals->link_status_polling_interval_ms * 1000,
2112                         bond_ethdev_slave_link_status_change_monitor,
2113                         (void *)&rte_eth_devices[internals->port_id]);
2114         }
2115
2116         if (internals->user_defined_primary_port)
2117                 bond_ethdev_primary_set(internals, internals->primary_port);
2118
2119         if (internals->mode == BONDING_MODE_8023AD)
2120                 bond_mode_8023ad_start(eth_dev);
2121
2122         if (internals->mode == BONDING_MODE_TLB ||
2123                         internals->mode == BONDING_MODE_ALB)
2124                 bond_tlb_enable(internals);
2125
2126         return 0;
2127
2128 out_err:
2129         eth_dev->data->dev_started = 0;
2130         return -1;
2131 }
2132
2133 static void
2134 bond_ethdev_free_queues(struct rte_eth_dev *dev)
2135 {
2136         uint8_t i;
2137
2138         if (dev->data->rx_queues != NULL) {
2139                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
2140                         rte_free(dev->data->rx_queues[i]);
2141                         dev->data->rx_queues[i] = NULL;
2142                 }
2143                 dev->data->nb_rx_queues = 0;
2144         }
2145
2146         if (dev->data->tx_queues != NULL) {
2147                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
2148                         rte_free(dev->data->tx_queues[i]);
2149                         dev->data->tx_queues[i] = NULL;
2150                 }
2151                 dev->data->nb_tx_queues = 0;
2152         }
2153 }
2154
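/*
 * Stop the bonded device: halt the mode specific machinery, drain any queued
 * mode 4 control packets, clear TLB statistics and mark all slaves inactive.
 */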
2155 void
2156 bond_ethdev_stop(struct rte_eth_dev *eth_dev)
2157 {
2158         struct bond_dev_private *internals = eth_dev->data->dev_private;
2159         uint8_t i;
2160
2161         if (internals->mode == BONDING_MODE_8023AD) {
2162                 struct port *port;
2163                 void *pkt = NULL;
2164
2165                 bond_mode_8023ad_stop(eth_dev);
2166
2167                 /* Discard all messages to/from mode 4 state machines */
2168                 for (i = 0; i < internals->active_slave_count; i++) {
2169                         port = &mode_8023ad_ports[internals->active_slaves[i]];
2170
2171                         RTE_ASSERT(port->rx_ring != NULL);
2172                         while (rte_ring_dequeue(port->rx_ring, &pkt) != -ENOENT)
2173                                 rte_pktmbuf_free(pkt);
2174
2175                         RTE_ASSERT(port->tx_ring != NULL);
2176                         while (rte_ring_dequeue(port->tx_ring, &pkt) != -ENOENT)
2177                                 rte_pktmbuf_free(pkt);
2178                 }
2179         }
2180
2181         if (internals->mode == BONDING_MODE_TLB ||
2182                         internals->mode == BONDING_MODE_ALB) {
2183                 bond_tlb_disable(internals);
2184                 for (i = 0; i < internals->active_slave_count; i++)
2185                         tlb_last_obytets[internals->active_slaves[i]] = 0;
2186         }
2187
2188         internals->active_slave_count = 0;
2189         internals->link_status_polling_enabled = 0;
2190         for (i = 0; i < internals->slave_count; i++)
2191                 internals->slaves[i].last_link_status = 0;
2192
2193         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
2194         eth_dev->data->dev_started = 0;
2195 }
2196
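/*
 * Close the bonded device: stop and remove every slave, flush any flow rules
 * created through the bonded port, free the queues and reset the VLAN filter.
 */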
2197 void
2198 bond_ethdev_close(struct rte_eth_dev *dev)
2199 {
2200         struct bond_dev_private *internals = dev->data->dev_private;
2201         uint8_t bond_port_id = internals->port_id;
2202         int skipped = 0;
2203         struct rte_flow_error ferror;
2204
2205         RTE_LOG(INFO, EAL, "Closing bonded device %s\n", dev->device->name);
2206         while (internals->slave_count != skipped) {
2207                 uint16_t port_id = internals->slaves[skipped].port_id;
2208
2209                 rte_eth_dev_stop(port_id);
2210
2211                 if (rte_eth_bond_slave_remove(bond_port_id, port_id) != 0) {
2212                         RTE_LOG(ERR, EAL,
2213                                 "Failed to remove port %d from bonded device "
2214                                 "%s\n", port_id, dev->device->name);
2215                         skipped++;
2216                 }
2217         }
2218         bond_flow_ops.flush(dev, &ferror);
2219         bond_ethdev_free_queues(dev);
2220         rte_bitmap_reset(internals->vlan_filter_bmp);
2221 }
2222
2223 /* forward declaration */
2224 static int bond_ethdev_configure(struct rte_eth_dev *dev);
2225
2226 static void
2227 bond_ethdev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
2228 {
2229         struct bond_dev_private *internals = dev->data->dev_private;
2230
2231         uint16_t max_nb_rx_queues = UINT16_MAX;
2232         uint16_t max_nb_tx_queues = UINT16_MAX;
2233
2234         dev_info->max_mac_addrs = 1;
2235
2236         dev_info->max_rx_pktlen = internals->candidate_max_rx_pktlen ?
2237                         internals->candidate_max_rx_pktlen :
2238                         ETHER_MAX_JUMBO_FRAME_LEN;
2239
2240         /* The max number of tx/rx queues that the bonded device can support is
2241          * the minimum reported by the bonded slaves, as all slaves must be
2242          * capable of supporting the same number of tx/rx queues.
2243          */
2244         if (internals->slave_count > 0) {
2245                 struct rte_eth_dev_info slave_info;
2246                 uint8_t idx;
2247
2248                 for (idx = 0; idx < internals->slave_count; idx++) {
2249                         rte_eth_dev_info_get(internals->slaves[idx].port_id,
2250                                         &slave_info);
2251
2252                         if (slave_info.max_rx_queues < max_nb_rx_queues)
2253                                 max_nb_rx_queues = slave_info.max_rx_queues;
2254
2255                         if (slave_info.max_tx_queues < max_nb_tx_queues)
2256                                 max_nb_tx_queues = slave_info.max_tx_queues;
2257                 }
2258         }
2259
2260         dev_info->max_rx_queues = max_nb_rx_queues;
2261         dev_info->max_tx_queues = max_nb_tx_queues;
2262
2263         /**
2264          * If dedicated hw queues enabled for link bonding device in LACP mode
2265          * then we need to reduce the maximum number of data path queues by 1.
2266          */
2267         if (internals->mode == BONDING_MODE_8023AD &&
2268                 internals->mode4.dedicated_queues.enabled == 1) {
2269                 dev_info->max_rx_queues--;
2270                 dev_info->max_tx_queues--;
2271         }
2272
2273         dev_info->min_rx_bufsize = 0;
2274
2275         dev_info->rx_offload_capa = internals->rx_offload_capa;
2276         dev_info->tx_offload_capa = internals->tx_offload_capa;
2277         dev_info->rx_queue_offload_capa = internals->rx_queue_offload_capa;
2278         dev_info->tx_queue_offload_capa = internals->tx_queue_offload_capa;
2279         dev_info->flow_type_rss_offloads = internals->flow_type_rss_offloads;
2280
2281         dev_info->reta_size = internals->reta_size;
2282 }
2283
2284 static int
2285 bond_ethdev_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
2286 {
2287         int res;
2288         uint16_t i;
2289         struct bond_dev_private *internals = dev->data->dev_private;
2290
2291         /* don't do this while a slave is being added */
2292         rte_spinlock_lock(&internals->lock);
2293
2294         if (on)
2295                 rte_bitmap_set(internals->vlan_filter_bmp, vlan_id);
2296         else
2297                 rte_bitmap_clear(internals->vlan_filter_bmp, vlan_id);
2298
2299         for (i = 0; i < internals->slave_count; i++) {
2300                 uint16_t port_id = internals->slaves[i].port_id;
2301
2302                 res = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
2303                 if (res == -ENOTSUP)
2304                         RTE_LOG(WARNING, PMD,
2305                                 "Setting VLAN filter on slave port %u not supported.\n",
2306                                 port_id);
2307         }
2308
2309         rte_spinlock_unlock(&internals->lock);
2310         return 0;
2311 }
2312
2313 static int
2314 bond_ethdev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
2315                 uint16_t nb_rx_desc, unsigned int socket_id __rte_unused,
2316                 const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mb_pool)
2317 {
2318         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)
2319                         rte_zmalloc_socket(NULL, sizeof(struct bond_rx_queue),
2320                                         0, dev->data->numa_node);
2321         if (bd_rx_q == NULL)
2322                 return -1;
2323
2324         bd_rx_q->queue_id = rx_queue_id;
2325         bd_rx_q->dev_private = dev->data->dev_private;
2326
2327         bd_rx_q->nb_rx_desc = nb_rx_desc;
2328
2329         memcpy(&(bd_rx_q->rx_conf), rx_conf, sizeof(struct rte_eth_rxconf));
2330         bd_rx_q->mb_pool = mb_pool;
2331
2332         dev->data->rx_queues[rx_queue_id] = bd_rx_q;
2333
2334         return 0;
2335 }
2336
2337 static int
2338 bond_ethdev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
2339                 uint16_t nb_tx_desc, unsigned int socket_id __rte_unused,
2340                 const struct rte_eth_txconf *tx_conf)
2341 {
2342         struct bond_tx_queue *bd_tx_q  = (struct bond_tx_queue *)
2343                         rte_zmalloc_socket(NULL, sizeof(struct bond_tx_queue),
2344                                         0, dev->data->numa_node);
2345
2346         if (bd_tx_q == NULL)
2347                 return -1;
2348
2349         bd_tx_q->queue_id = tx_queue_id;
2350         bd_tx_q->dev_private = dev->data->dev_private;
2351
2352         bd_tx_q->nb_tx_desc = nb_tx_desc;
2353         memcpy(&(bd_tx_q->tx_conf), tx_conf, sizeof(bd_tx_q->tx_conf));
2354
2355         dev->data->tx_queues[tx_queue_id] = bd_tx_q;
2356
2357         return 0;
2358 }
2359
2360 static void
2361 bond_ethdev_rx_queue_release(void *queue)
2362 {
2363         if (queue == NULL)
2364                 return;
2365
2366         rte_free(queue);
2367 }
2368
2369 static void
2370 bond_ethdev_tx_queue_release(void *queue)
2371 {
2372         if (queue == NULL)
2373                 return;
2374
2375         rte_free(queue);
2376 }
2377
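/*
 * Periodic alarm callback that polls the link status of slaves without LSC
 * interrupt support and raises the LSC callback when a change is detected.
 */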
2378 static void
2379 bond_ethdev_slave_link_status_change_monitor(void *cb_arg)
2380 {
2381         struct rte_eth_dev *bonded_ethdev, *slave_ethdev;
2382         struct bond_dev_private *internals;
2383
2384         /* Default value for polling slave found is true as we don't want to
2385          * disable the polling thread if we cannot get the lock */
2386         int i, polling_slave_found = 1;
2387
2388         if (cb_arg == NULL)
2389                 return;
2390
2391         bonded_ethdev = (struct rte_eth_dev *)cb_arg;
2392         internals = (struct bond_dev_private *)bonded_ethdev->data->dev_private;
2393
2394         if (!bonded_ethdev->data->dev_started ||
2395                 !internals->link_status_polling_enabled)
2396                 return;
2397
2398         /* If device is currently being configured then don't check slaves link
2399          * status, wait until next period */
2400         if (rte_spinlock_trylock(&internals->lock)) {
2401                 if (internals->slave_count > 0)
2402                         polling_slave_found = 0;
2403
2404                 for (i = 0; i < internals->slave_count; i++) {
2405                         if (!internals->slaves[i].link_status_poll_enabled)
2406                                 continue;
2407
2408                         slave_ethdev = &rte_eth_devices[internals->slaves[i].port_id];
2409                         polling_slave_found = 1;
2410
2411                         /* Update slave link status */
2412                         (*slave_ethdev->dev_ops->link_update)(slave_ethdev,
2413                                         internals->slaves[i].link_status_wait_to_complete);
2414
2415                         /* if link status has changed since last checked then call lsc
2416                          * event callback */
2417                         if (slave_ethdev->data->dev_link.link_status !=
2418                                         internals->slaves[i].last_link_status) {
2419                                 internals->slaves[i].last_link_status =
2420                                                 slave_ethdev->data->dev_link.link_status;
2421
2422                                 bond_ethdev_lsc_event_callback(internals->slaves[i].port_id,
2423                                                 RTE_ETH_EVENT_INTR_LSC,
2424                                                 &bonded_ethdev->data->port_id,
2425                                                 NULL);
2426                         }
2427                 }
2428                 rte_spinlock_unlock(&internals->lock);
2429         }
2430
2431         if (polling_slave_found)
2432                 /* Set alarm to continue monitoring link status of slave ethdev's */
2433                 rte_eal_alarm_set(internals->link_status_polling_interval_ms * 1000,
2434                                 bond_ethdev_slave_link_status_change_monitor, cb_arg);
2435 }
2436
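/*
 * Report the bonded link: down when no slave is active, otherwise a speed
 * derived per mode (minimum slave speed in broadcast, the primary's speed in
 * active-backup, and the sum of the active slave speeds in the other modes).
 */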
2437 static int
2438 bond_ethdev_link_update(struct rte_eth_dev *ethdev, int wait_to_complete)
2439 {
2440         void (*link_update)(uint16_t port_id, struct rte_eth_link *eth_link);
2441
2442         struct bond_dev_private *bond_ctx;
2443         struct rte_eth_link slave_link;
2444
2445         uint32_t idx;
2446
2447         bond_ctx = ethdev->data->dev_private;
2448
2449         ethdev->data->dev_link.link_speed = ETH_SPEED_NUM_NONE;
2450
2451         if (ethdev->data->dev_started == 0 ||
2452                         bond_ctx->active_slave_count == 0) {
2453                 ethdev->data->dev_link.link_status = ETH_LINK_DOWN;
2454                 return 0;
2455         }
2456
2457         ethdev->data->dev_link.link_status = ETH_LINK_UP;
2458
2459         if (wait_to_complete)
2460                 link_update = rte_eth_link_get;
2461         else
2462                 link_update = rte_eth_link_get_nowait;
2463
2464         switch (bond_ctx->mode) {
2465         case BONDING_MODE_BROADCAST:
2466                 /**
2467                  * Setting link speed to UINT32_MAX to ensure we pick up the
2468                  * value of the first active slave
2469                  */
2470                 ethdev->data->dev_link.link_speed = UINT32_MAX;
2471
2472                 /**
2473                  * The bonded link speed is the minimum of all the slave link
2474                  * speeds, as packet loss will occur on the slowest slave if
2475                  * transmission at rates greater than this is attempted
2476                  */
2477                 for (idx = 0; idx < bond_ctx->active_slave_count; idx++) {
2478                         link_update(bond_ctx->active_slaves[idx], &slave_link);
2479
2480                         if (slave_link.link_speed <
2481                                         ethdev->data->dev_link.link_speed)
2482                                 ethdev->data->dev_link.link_speed =
2483                                                 slave_link.link_speed;
2484                 }
2485                 break;
2486         case BONDING_MODE_ACTIVE_BACKUP:
2487                 /* Current primary slave */
2488                 link_update(bond_ctx->current_primary_port, &slave_link);
2489
2490                 ethdev->data->dev_link.link_speed = slave_link.link_speed;
2491                 break;
2492         case BONDING_MODE_8023AD:
2493                 ethdev->data->dev_link.link_autoneg =
2494                                 bond_ctx->mode4.slave_link.link_autoneg;
2495                 ethdev->data->dev_link.link_duplex =
2496                                 bond_ctx->mode4.slave_link.link_duplex;
2497                 /* fall through to update link speed */
2498         case BONDING_MODE_ROUND_ROBIN:
2499         case BONDING_MODE_BALANCE:
2500         case BONDING_MODE_TLB:
2501         case BONDING_MODE_ALB:
2502         default:
2503                 /**
2504                  * In these modes the maximum theoretical link speed is the sum
2505                  * of all the slave link speeds
2506                  */
2507                 ethdev->data->dev_link.link_speed = ETH_SPEED_NUM_NONE;
2508
2509                 for (idx = 0; idx < bond_ctx->active_slave_count; idx++) {
2510                         link_update(bond_ctx->active_slaves[idx], &slave_link);
2511
2512                         ethdev->data->dev_link.link_speed +=
2513                                         slave_link.link_speed;
2514                 }
2515         }
2516
2517
2518         return 0;
2519 }
2520
2521
2522 static int
2523 bond_ethdev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
2524 {
2525         struct bond_dev_private *internals = dev->data->dev_private;
2526         struct rte_eth_stats slave_stats;
2527         int i, j;
2528
2529         for (i = 0; i < internals->slave_count; i++) {
2530                 rte_eth_stats_get(internals->slaves[i].port_id, &slave_stats);
2531
2532                 stats->ipackets += slave_stats.ipackets;
2533                 stats->opackets += slave_stats.opackets;
2534                 stats->ibytes += slave_stats.ibytes;
2535                 stats->obytes += slave_stats.obytes;
2536                 stats->imissed += slave_stats.imissed;
2537                 stats->ierrors += slave_stats.ierrors;
2538                 stats->oerrors += slave_stats.oerrors;
2539                 stats->rx_nombuf += slave_stats.rx_nombuf;
2540
2541                 for (j = 0; j < RTE_ETHDEV_QUEUE_STAT_CNTRS; j++) {
2542                         stats->q_ipackets[j] += slave_stats.q_ipackets[j];
2543                         stats->q_opackets[j] += slave_stats.q_opackets[j];
2544                         stats->q_ibytes[j] += slave_stats.q_ibytes[j];
2545                         stats->q_obytes[j] += slave_stats.q_obytes[j];
2546                         stats->q_errors[j] += slave_stats.q_errors[j];
2547                 }
2548
2549         }
2550
2551         return 0;
2552 }
2553
2554 static void
2555 bond_ethdev_stats_reset(struct rte_eth_dev *dev)
2556 {
2557         struct bond_dev_private *internals = dev->data->dev_private;
2558         int i;
2559
2560         for (i = 0; i < internals->slave_count; i++)
2561                 rte_eth_stats_reset(internals->slaves[i].port_id);
2562 }
2563
2564 static void
2565 bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev)
2566 {
2567         struct bond_dev_private *internals = eth_dev->data->dev_private;
2568         int i;
2569
2570         internals->promiscuous_en = 1;
2571
2572         switch (internals->mode) {
2573         /* Promiscuous mode is propagated to all slaves */
2574         case BONDING_MODE_ROUND_ROBIN:
2575         case BONDING_MODE_BALANCE:
2576         case BONDING_MODE_BROADCAST:
2577                 for (i = 0; i < internals->slave_count; i++)
2578                         rte_eth_promiscuous_enable(internals->slaves[i].port_id);
2579                 break;
2580         /* In mode 4 promiscuous mode is managed when a slave is added/removed */
2581         case BONDING_MODE_8023AD:
2582                 break;
2583         /* Promiscuous mode is propagated only to primary slave */
2584         case BONDING_MODE_ACTIVE_BACKUP:
2585         case BONDING_MODE_TLB:
2586         case BONDING_MODE_ALB:
2587         default:
2588                 rte_eth_promiscuous_enable(internals->current_primary_port);
2589         }
2590 }
2591
2592 static void
2593 bond_ethdev_promiscuous_disable(struct rte_eth_dev *dev)
2594 {
2595         struct bond_dev_private *internals = dev->data->dev_private;
2596         int i;
2597
2598         internals->promiscuous_en = 0;
2599
2600         switch (internals->mode) {
2601         /* Promiscuous mode is propagated to all slaves */
2602         case BONDING_MODE_ROUND_ROBIN:
2603         case BONDING_MODE_BALANCE:
2604         case BONDING_MODE_BROADCAST:
2605                 for (i = 0; i < internals->slave_count; i++)
2606                         rte_eth_promiscuous_disable(internals->slaves[i].port_id);
2607                 break;
2608         /* In mode 4 promiscuous mode is managed when a slave is added/removed */
2609         case BONDING_MODE_8023AD:
2610                 break;
2611         /* Promiscuous mode is propagated only to primary slave */
2612         case BONDING_MODE_ACTIVE_BACKUP:
2613         case BONDING_MODE_TLB:
2614         case BONDING_MODE_ALB:
2615         default:
2616                 rte_eth_promiscuous_disable(internals->current_primary_port);
2617         }
2618 }
2619
2620 static void
2621 bond_ethdev_delayed_lsc_propagation(void *arg)
2622 {
2623         if (arg == NULL)
2624                 return;
2625
2626         _rte_eth_dev_callback_process((struct rte_eth_dev *)arg,
2627                         RTE_ETH_EVENT_INTR_LSC, NULL);
2628 }
2629
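/*
 * LSC event callback invoked for slave ports: activates or deactivates the
 * slave, updates the primary port and bonded link state, and propagates the
 * LSC event (optionally after the configured up/down delay).
 */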
2630 int
2631 bond_ethdev_lsc_event_callback(uint16_t port_id, enum rte_eth_event_type type,
2632                 void *param, void *ret_param __rte_unused)
2633 {
2634         struct rte_eth_dev *bonded_eth_dev;
2635         struct bond_dev_private *internals;
2636         struct rte_eth_link link;
2637         int rc = -1;
2638
2639         int i, valid_slave = 0;
2640         uint8_t active_pos;
2641         uint8_t lsc_flag = 0;
2642
2643         if (type != RTE_ETH_EVENT_INTR_LSC || param == NULL)
2644                 return rc;
2645
2646         bonded_eth_dev = &rte_eth_devices[*(uint16_t *)param];
2647
2648         if (check_for_bonded_ethdev(bonded_eth_dev))
2649                 return rc;
2650
2651         internals = bonded_eth_dev->data->dev_private;
2652
2653         /* If the device isn't started don't handle interrupts */
2654         if (!bonded_eth_dev->data->dev_started)
2655                 return rc;
2656
2657         /* verify that port_id is a valid slave of bonded port */
2658         for (i = 0; i < internals->slave_count; i++) {
2659                 if (internals->slaves[i].port_id == port_id) {
2660                         valid_slave = 1;
2661                         break;
2662                 }
2663         }
2664
2665         if (!valid_slave)
2666                 return rc;
2667
2668         /* Search for port in active port list */
2669         active_pos = find_slave_by_id(internals->active_slaves,
2670                         internals->active_slave_count, port_id);
2671
2672         rte_eth_link_get_nowait(port_id, &link);
2673         if (link.link_status) {
2674                 if (active_pos < internals->active_slave_count)
2675                         return rc;
2676
2677                 /* if no active slave ports then set this port to be primary port */
2678                 if (internals->active_slave_count < 1) {
2679                         /* If first active slave, then change link status */
2680                         bonded_eth_dev->data->dev_link.link_status = ETH_LINK_UP;
2681                         internals->current_primary_port = port_id;
2682                         lsc_flag = 1;
2683
2684                         mac_address_slaves_update(bonded_eth_dev);
2685                 }
2686
2687                 activate_slave(bonded_eth_dev, port_id);
2688
2689                 /* If user has defined the primary port then default to using it */
2690                 if (internals->user_defined_primary_port &&
2691                                 internals->primary_port == port_id)
2692                         bond_ethdev_primary_set(internals, port_id);
2693         } else {
2694                 if (active_pos == internals->active_slave_count)
2695                         return rc;
2696
2697                 /* Remove from active slave list */
2698                 deactivate_slave(bonded_eth_dev, port_id);
2699
2700                 if (internals->active_slave_count < 1)
2701                         lsc_flag = 1;
2702
2703                 /* Update primary id: take the first active slave from the list,
2704                  * or fall back to the configured primary port if none is available */
2705                 if (port_id == internals->current_primary_port) {
2706                         if (internals->active_slave_count > 0)
2707                                 bond_ethdev_primary_set(internals,
2708                                                 internals->active_slaves[0]);
2709                         else
2710                                 internals->current_primary_port = internals->primary_port;
2711                 }
2712         }
2713
2714         /**
2715          * Update bonded device link properties after any change to active
2716          * slaves
2717          */
2718         bond_ethdev_link_update(bonded_eth_dev, 0);
2719
2720         if (lsc_flag) {
2721                 /* Cancel any possible outstanding interrupts if delays are enabled */
2722                 if (internals->link_up_delay_ms > 0 ||
2723                         internals->link_down_delay_ms > 0)
2724                         rte_eal_alarm_cancel(bond_ethdev_delayed_lsc_propagation,
2725                                         bonded_eth_dev);
2726
2727                 if (bonded_eth_dev->data->dev_link.link_status) {
2728                         if (internals->link_up_delay_ms > 0)
2729                                 rte_eal_alarm_set(internals->link_up_delay_ms * 1000,
2730                                                 bond_ethdev_delayed_lsc_propagation,
2731                                                 (void *)bonded_eth_dev);
2732                         else
2733                                 _rte_eth_dev_callback_process(bonded_eth_dev,
2734                                                 RTE_ETH_EVENT_INTR_LSC,
2735                                                 NULL);
2736
2737                 } else {
2738                         if (internals->link_down_delay_ms > 0)
2739                                 rte_eal_alarm_set(internals->link_down_delay_ms * 1000,
2740                                                 bond_ethdev_delayed_lsc_propagation,
2741                                                 (void *)bonded_eth_dev);
2742                         else
2743                                 _rte_eth_dev_callback_process(bonded_eth_dev,
2744                                                 RTE_ETH_EVENT_INTR_LSC,
2745                                                 NULL);
2746                 }
2747         }
2748         return 0;
2749 }
2750
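/*
 * Update the RSS redirection table of the bonded device: cache the supplied
 * groups, replicate them across the rest of the internal table and push the
 * result to every slave using that slave's own RETA size.
 */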
2751 static int
2752 bond_ethdev_rss_reta_update(struct rte_eth_dev *dev,
2753                 struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2754 {
2755         unsigned i, j;
2756         int result = 0;
2757         int slave_reta_size;
2758         unsigned reta_count;
2759         struct bond_dev_private *internals = dev->data->dev_private;
2760
2761         if (reta_size != internals->reta_size)
2762                 return -EINVAL;
2763
2764          /* Copy RETA table */
2765         reta_count = reta_size / RTE_RETA_GROUP_SIZE;
2766
2767         for (i = 0; i < reta_count; i++) {
2768                 internals->reta_conf[i].mask = reta_conf[i].mask;
2769                 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2770                         if ((reta_conf[i].mask >> j) & 0x01)
2771                                 internals->reta_conf[i].reta[j] = reta_conf[i].reta[j];
2772         }
2773
2774         /* Fill rest of array */
2775         for (; i < RTE_DIM(internals->reta_conf); i += reta_count)
2776                 memcpy(&internals->reta_conf[i], &internals->reta_conf[0],
2777                                 sizeof(internals->reta_conf[0]) * reta_count);
2778
2779         /* Propagate RETA over slaves */
2780         for (i = 0; i < internals->slave_count; i++) {
2781                 slave_reta_size = internals->slaves[i].reta_size;
2782                 result = rte_eth_dev_rss_reta_update(internals->slaves[i].port_id,
2783                                 &internals->reta_conf[0], slave_reta_size);
2784                 if (result < 0)
2785                         return result;
2786         }
2787
2788         return 0;
2789 }
2790
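/* Report the software copy of the RSS redirection table held by the bonded
 * device.
 */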
2791 static int
2792 bond_ethdev_rss_reta_query(struct rte_eth_dev *dev,
2793                 struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2794 {
2795         int i, j;
2796         struct bond_dev_private *internals = dev->data->dev_private;
2797
2798         if (reta_size != internals->reta_size)
2799                 return -EINVAL;
2800
2801          /* Copy RETA table */
2802         for (i = 0; i < reta_size / RTE_RETA_GROUP_SIZE; i++)
2803                 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2804                         if ((reta_conf[i].mask >> j) & 0x01)
2805                                 reta_conf[i].reta[j] = internals->reta_conf[i].reta[j];
2806
2807         return 0;
2808 }
2809
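/*
 * Apply a new RSS hash configuration: the requested hash functions are
 * masked by the RSS offload types supported by the bonded device, any
 * provided key is cached in the private data, and the resulting
 * configuration is pushed to every slave.
 */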
2810 static int
2811 bond_ethdev_rss_hash_update(struct rte_eth_dev *dev,
2812                 struct rte_eth_rss_conf *rss_conf)
2813 {
2814         int i, result = 0;
2815         struct bond_dev_private *internals = dev->data->dev_private;
2816         struct rte_eth_rss_conf bond_rss_conf;
2817
2818         memcpy(&bond_rss_conf, rss_conf, sizeof(struct rte_eth_rss_conf));
2819
2820         bond_rss_conf.rss_hf &= internals->flow_type_rss_offloads;
2821
2822         if (bond_rss_conf.rss_hf != 0)
2823                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf = bond_rss_conf.rss_hf;
2824
2825         if (bond_rss_conf.rss_key && bond_rss_conf.rss_key_len <
2826                         sizeof(internals->rss_key)) {
2827                 if (bond_rss_conf.rss_key_len == 0)
2828                         bond_rss_conf.rss_key_len = 40;
2829                 internals->rss_key_len = bond_rss_conf.rss_key_len;
2830                 memcpy(internals->rss_key, bond_rss_conf.rss_key,
2831                                 internals->rss_key_len);
2832         }
2833
2834         for (i = 0; i < internals->slave_count; i++) {
2835                 result = rte_eth_dev_rss_hash_update(internals->slaves[i].port_id,
2836                                 &bond_rss_conf);
2837                 if (result < 0)
2838                         return result;
2839         }
2840
2841         return 0;
2842 }
2843
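/* Report the RSS hash functions and key currently configured on the bonded
 * device.
 */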
2844 static int
2845 bond_ethdev_rss_hash_conf_get(struct rte_eth_dev *dev,
2846                 struct rte_eth_rss_conf *rss_conf)
2847 {
2848         struct bond_dev_private *internals = dev->data->dev_private;
2849
2850         rss_conf->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
2851         rss_conf->rss_key_len = internals->rss_key_len;
2852         if (rss_conf->rss_key)
2853                 memcpy(rss_conf->rss_key, internals->rss_key, internals->rss_key_len);
2854
2855         return 0;
2856 }
2857
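/*
 * Set the MTU on all slaves; fails with -ENOTSUP if any slave does not
 * implement the mtu_set operation.
 */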
2858 static int
2859 bond_ethdev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
2860 {
2861         struct rte_eth_dev *slave_eth_dev;
2862         struct bond_dev_private *internals = dev->data->dev_private;
2863         int ret, i;
2864
2865         rte_spinlock_lock(&internals->lock);
2866
2867         for (i = 0; i < internals->slave_count; i++) {
2868                 slave_eth_dev = &rte_eth_devices[internals->slaves[i].port_id];
2869                 if (*slave_eth_dev->dev_ops->mtu_set == NULL) {
2870                         rte_spinlock_unlock(&internals->lock);
2871                         return -ENOTSUP;
2872                 }
2873         }
2874         for (i = 0; i < internals->slave_count; i++) {
2875                 ret = rte_eth_dev_set_mtu(internals->slaves[i].port_id, mtu);
2876                 if (ret < 0) {
2877                         rte_spinlock_unlock(&internals->lock);
2878                         return ret;
2879                 }
2880         }
2881
2882         rte_spinlock_unlock(&internals->lock);
2883         return 0;
2884 }
2885
2886 static int
2887 bond_ethdev_mac_address_set(struct rte_eth_dev *dev, struct ether_addr *addr)
2888 {
2889         if (mac_address_set(dev, addr)) {
2890                 RTE_BOND_LOG(ERR, "Failed to update MAC address");
2891                 return -EINVAL;
2892         }
2893
2894         return 0;
2895 }
2896
2897 static int
2898 bond_filter_ctrl(struct rte_eth_dev *dev __rte_unused,
2899                  enum rte_filter_type type, enum rte_filter_op op, void *arg)
2900 {
2901         if (type == RTE_ETH_FILTER_GENERIC && op == RTE_ETH_FILTER_GET) {
2902                 *(const void **)arg = &bond_flow_ops;
2903                 return 0;
2904         }
2905         return -ENOTSUP;
2906 }
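
/*
 * Illustrative note (not part of the driver): exposing bond_flow_ops through
 * the generic filter control entry point lets applications drive the bonded
 * port with the rte_flow API, e.g. rte_flow_validate() and rte_flow_create()
 * called with the bonded port id; the bonding flow implementation is then
 * expected to mirror the rules on the slave ports.
 */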
2907
2908 const struct eth_dev_ops default_dev_ops = {
2909         .dev_start            = bond_ethdev_start,
2910         .dev_stop             = bond_ethdev_stop,
2911         .dev_close            = bond_ethdev_close,
2912         .dev_configure        = bond_ethdev_configure,
2913         .dev_infos_get        = bond_ethdev_info,
2914         .vlan_filter_set      = bond_ethdev_vlan_filter_set,
2915         .rx_queue_setup       = bond_ethdev_rx_queue_setup,
2916         .tx_queue_setup       = bond_ethdev_tx_queue_setup,
2917         .rx_queue_release     = bond_ethdev_rx_queue_release,
2918         .tx_queue_release     = bond_ethdev_tx_queue_release,
2919         .link_update          = bond_ethdev_link_update,
2920         .stats_get            = bond_ethdev_stats_get,
2921         .stats_reset          = bond_ethdev_stats_reset,
2922         .promiscuous_enable   = bond_ethdev_promiscuous_enable,
2923         .promiscuous_disable  = bond_ethdev_promiscuous_disable,
2924         .reta_update          = bond_ethdev_rss_reta_update,
2925         .reta_query           = bond_ethdev_rss_reta_query,
2926         .rss_hash_update      = bond_ethdev_rss_hash_update,
2927         .rss_hash_conf_get    = bond_ethdev_rss_hash_conf_get,
2928         .mtu_set              = bond_ethdev_mtu_set,
2929         .mac_addr_set         = bond_ethdev_mac_address_set,
2930         .filter_ctrl          = bond_filter_ctrl
2931 };
2932
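/*
 * Allocate and initialise a bonded ethdev: reserve the ethdev entry, set
 * default values in the private data (transmit policy, delays, offload
 * capabilities), apply the mode 4 defaults, set the requested bonding mode
 * and allocate the VLAN filter bitmap.
 */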
2933 static int
2934 bond_alloc(struct rte_vdev_device *dev, uint8_t mode)
2935 {
2936         const char *name = rte_vdev_device_name(dev);
2937         uint8_t socket_id = dev->device.numa_node;
2938         struct bond_dev_private *internals = NULL;
2939         struct rte_eth_dev *eth_dev = NULL;
2940         uint32_t vlan_filter_bmp_size;
2941
2942         /* now do all data allocation - for the eth_dev structure
2943          * and internal (private) data
2944          */
2945
2946         /* reserve an ethdev entry */
2947         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internals));
2948         if (eth_dev == NULL) {
2949                 RTE_BOND_LOG(ERR, "Unable to allocate rte_eth_dev");
2950                 goto err;
2951         }
2952
2953         internals = eth_dev->data->dev_private;
2954         eth_dev->data->nb_rx_queues = (uint16_t)1;
2955         eth_dev->data->nb_tx_queues = (uint16_t)1;
2956
2957         eth_dev->data->mac_addrs = rte_zmalloc_socket(name, ETHER_ADDR_LEN, 0,
2958                         socket_id);
2959         if (eth_dev->data->mac_addrs == NULL) {
2960                 RTE_BOND_LOG(ERR, "Unable to malloc mac_addrs");
2961                 goto err;
2962         }
2963
2964         eth_dev->dev_ops = &default_dev_ops;
2965         eth_dev->data->dev_flags = RTE_ETH_DEV_INTR_LSC;
2966
2967         rte_spinlock_init(&internals->lock);
2968
2969         internals->port_id = eth_dev->data->port_id;
2970         internals->mode = BONDING_MODE_INVALID;
2971         internals->current_primary_port = RTE_MAX_ETHPORTS + 1;
2972         internals->balance_xmit_policy = BALANCE_XMIT_POLICY_LAYER2;
2973         internals->burst_xmit_hash = burst_xmit_l2_hash;
2974         internals->user_defined_mac = 0;
2975
2976         internals->link_status_polling_enabled = 0;
2977
2978         internals->link_status_polling_interval_ms =
2979                 DEFAULT_POLLING_INTERVAL_10_MS;
2980         internals->link_down_delay_ms = 0;
2981         internals->link_up_delay_ms = 0;
2982
2983         internals->slave_count = 0;
2984         internals->active_slave_count = 0;
2985         internals->rx_offload_capa = 0;
2986         internals->tx_offload_capa = 0;
2987         internals->rx_queue_offload_capa = 0;
2988         internals->tx_queue_offload_capa = 0;
2989         internals->candidate_max_rx_pktlen = 0;
2990         internals->max_rx_pktlen = 0;
2991
2992         /* Initially allow to choose any offload type */
2993         internals->flow_type_rss_offloads = ETH_RSS_PROTO_MASK;
2994
2995         memset(internals->active_slaves, 0, sizeof(internals->active_slaves));
2996         memset(internals->slaves, 0, sizeof(internals->slaves));
2997
2998         TAILQ_INIT(&internals->flow_list);
2999         internals->flow_isolated_valid = 0;
3000
3001         /* Set mode 4 default configuration */
3002         bond_mode_8023ad_setup(eth_dev, NULL);
3003         if (bond_ethdev_mode_set(eth_dev, mode)) {
3004                 RTE_BOND_LOG(ERR, "Failed to set bonded device %d mode to %d\n",
3005                                  eth_dev->data->port_id, mode);
3006                 goto err;
3007         }
3008
3009         vlan_filter_bmp_size =
3010                 rte_bitmap_get_memory_footprint(ETHER_MAX_VLAN_ID + 1);
3011         internals->vlan_filter_bmpmem = rte_malloc(name, vlan_filter_bmp_size,
3012                                                    RTE_CACHE_LINE_SIZE);
3013         if (internals->vlan_filter_bmpmem == NULL) {
3014                 RTE_BOND_LOG(ERR,
3015                              "Failed to allocate vlan bitmap for bonded device %u\n",
3016                              eth_dev->data->port_id);
3017                 goto err;
3018         }
3019
3020         internals->vlan_filter_bmp = rte_bitmap_init(ETHER_MAX_VLAN_ID + 1,
3021                         internals->vlan_filter_bmpmem, vlan_filter_bmp_size);
3022         if (internals->vlan_filter_bmp == NULL) {
3023                 RTE_BOND_LOG(ERR,
3024                              "Failed to init vlan bitmap for bonded device %u\n",
3025                              eth_dev->data->port_id);
3026                 rte_free(internals->vlan_filter_bmpmem);
3027                 goto err;
3028         }
3029
3030         return eth_dev->data->port_id;
3031
3032 err:
3033         rte_free(internals);
3034         if (eth_dev != NULL) {
3035                 rte_free(eth_dev->data->mac_addrs);
3036                 rte_eth_dev_release_port(eth_dev);
3037         }
3038         return -1;
3039 }
3040
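/*
 * vdev probe entry point: parses the mandatory bonding mode and optional
 * socket_id arguments, creates the bonded ethdev via bond_alloc() and, for
 * mode 4, applies the requested aggregator selection policy. In secondary
 * processes the already created device is simply attached.
 */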
3041 static int
3042 bond_probe(struct rte_vdev_device *dev)
3043 {
3044         const char *name;
3045         struct bond_dev_private *internals;
3046         struct rte_kvargs *kvlist;
3047         uint8_t bonding_mode, socket_id;
3048         int  arg_count, port_id;
3049         uint8_t agg_mode;
3050         struct rte_eth_dev *eth_dev;
3051
3052         if (!dev)
3053                 return -EINVAL;
3054
3055         name = rte_vdev_device_name(dev);
3056         RTE_LOG(INFO, EAL, "Initializing pmd_bond for %s\n", name);
3057
3058         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
3059             strlen(rte_vdev_device_args(dev)) == 0) {
3060                 eth_dev = rte_eth_dev_attach_secondary(name);
3061                 if (!eth_dev) {
3062                         RTE_LOG(ERR, PMD, "Failed to probe %s\n", name);
3063                         return -1;
3064                 }
3065                 /* TODO: request info from primary to set up Rx and Tx */
3066                 eth_dev->dev_ops = &default_dev_ops;
3067                 return 0;
3068         }
3069
3070         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev),
3071                 pmd_bond_init_valid_arguments);
3072         if (kvlist == NULL)
3073                 return -1;
3074
3075         /* Parse link bonding mode */
3076         if (rte_kvargs_count(kvlist, PMD_BOND_MODE_KVARG) == 1) {
3077                 if (rte_kvargs_process(kvlist, PMD_BOND_MODE_KVARG,
3078                                 &bond_ethdev_parse_slave_mode_kvarg,
3079                                 &bonding_mode) != 0) {
3080                         RTE_LOG(ERR, EAL, "Invalid mode for bonded device %s\n",
3081                                         name);
3082                         goto parse_error;
3083                 }
3084         } else {
3085                 RTE_LOG(ERR, EAL, "Mode must be specified exactly once for bonded "
3086                                 "device %s\n", name);
3087                 goto parse_error;
3088         }
3089
3090         /* Parse socket id to create bonding device on */
3091         arg_count = rte_kvargs_count(kvlist, PMD_BOND_SOCKET_ID_KVARG);
3092         if (arg_count == 1) {
3093                 if (rte_kvargs_process(kvlist, PMD_BOND_SOCKET_ID_KVARG,
3094                                 &bond_ethdev_parse_socket_id_kvarg, &socket_id)
3095                                 != 0) {
3096                         RTE_LOG(ERR, EAL, "Invalid socket Id specified for "
3097                                         "bonded device %s\n", name);
3098                         goto parse_error;
3099                 }
3100         } else if (arg_count > 1) {
3101                 RTE_LOG(ERR, EAL, "Socket Id can be specified only once for "
3102                                 "bonded device %s\n", name);
3103                 goto parse_error;
3104         } else {
3105                 socket_id = rte_socket_id();
3106         }
3107
3108         dev->device.numa_node = socket_id;
3109
3110         /* Create link bonding eth device */
3111         port_id = bond_alloc(dev, bonding_mode);
3112         if (port_id < 0) {
3113                 RTE_LOG(ERR, EAL, "Failed to create bonded device %s in mode %u on "
3114                                 "socket %u.\n", name, bonding_mode, socket_id);
3115                 goto parse_error;
3116         }
3117         internals = rte_eth_devices[port_id].data->dev_private;
3118         internals->kvlist = kvlist;
3119
3120
3121         if (rte_kvargs_count(kvlist, PMD_BOND_AGG_MODE_KVARG) == 1) {
3122                 if (rte_kvargs_process(kvlist,
3123                                 PMD_BOND_AGG_MODE_KVARG,
3124                                 &bond_ethdev_parse_slave_agg_mode_kvarg,
3125                                 &agg_mode) != 0) {
3126                         RTE_LOG(ERR, EAL,
3127                                         "Failed to parse agg selection mode for bonded device %s\n",
3128                                         name);
3129                         goto parse_error;
3130                 }
3131
3132                 if (internals->mode == BONDING_MODE_8023AD)
3133                         rte_eth_bond_8023ad_agg_selection_set(port_id,
3134                                         agg_mode);
3135         } else {
3136                 rte_eth_bond_8023ad_agg_selection_set(port_id, AGG_STABLE);
3137         }
3138
3139         RTE_LOG(INFO, EAL, "Create bonded device %s on port %d in mode %u on "
3140                         "socket %u.\n", name, port_id, bonding_mode, socket_id);
3141         return 0;
3142
3143 parse_error:
3144         rte_kvargs_free(kvlist);
3145
3146         return -1;
3147 }
3148
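/*
 * vdev remove entry point: refuses to tear down a bonded device that still
 * has slaves attached (-EBUSY); otherwise stops and closes the port and
 * releases all private allocations.
 */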
3149 static int
3150 bond_remove(struct rte_vdev_device *dev)
3151 {
3152         struct rte_eth_dev *eth_dev;
3153         struct bond_dev_private *internals;
3154         const char *name;
3155
3156         if (!dev)
3157                 return -EINVAL;
3158
3159         name = rte_vdev_device_name(dev);
3160         RTE_LOG(INFO, EAL, "Uninitializing pmd_bond for %s\n", name);
3161
3162         /* now free all data allocations - for the eth_dev structure
3163          * and internal (private) data
3164          */
3165
3166         /* find an ethdev entry */
3167         eth_dev = rte_eth_dev_allocated(name);
3168         if (eth_dev == NULL)
3169                 return -ENODEV;
3170
3171         RTE_ASSERT(eth_dev->device == &dev->device);
3172
3173         internals = eth_dev->data->dev_private;
3174         if (internals->slave_count != 0)
3175                 return -EBUSY;
3176
3177         if (eth_dev->data->dev_started == 1) {
3178                 bond_ethdev_stop(eth_dev);
3179                 bond_ethdev_close(eth_dev);
3180         }
3181
3182         eth_dev->dev_ops = NULL;
3183         eth_dev->rx_pkt_burst = NULL;
3184         eth_dev->tx_pkt_burst = NULL;
3185
3186         internals = eth_dev->data->dev_private;
3187         /* Try to release the mempool used in mode 6. If the bonded
3188          * device is not in mode 6, freeing a NULL pointer is not a problem.
3189          */
3190         rte_mempool_free(internals->mode6.mempool);
3191         rte_bitmap_free(internals->vlan_filter_bmp);
3192         rte_free(internals->vlan_filter_bmpmem);
3193         rte_free(eth_dev->data->dev_private);
3194         rte_free(eth_dev->data->mac_addrs);
3195
3196         rte_eth_dev_release_port(eth_dev);
3197
3198         return 0;
3199 }
3200
3201 /* This part resolves the slave port ids once all the other pdevs and vdevs
3202  * have been allocated */
3203 static int
3204 bond_ethdev_configure(struct rte_eth_dev *dev)
3205 {
3206         const char *name = dev->device->name;
3207         struct bond_dev_private *internals = dev->data->dev_private;
3208         struct rte_kvargs *kvlist = internals->kvlist;
3209         int arg_count;
3210         uint16_t port_id = dev - rte_eth_devices;
3211         uint8_t agg_mode;
3212
3213         static const uint8_t default_rss_key[40] = {
3214                 0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2, 0x41, 0x67, 0x25, 0x3D,
3215                 0x43, 0xA3, 0x8F, 0xB0, 0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
3216                 0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C, 0x6A, 0x42, 0xB7, 0x3B,
3217                 0xBE, 0xAC, 0x01, 0xFA
3218         };
3219
3220         unsigned i, j;
3221
3222         /* If RSS is enabled, fill table and key with default values */
3223         if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
3224                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = internals->rss_key;
3225                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len = 0;
3226                 memcpy(internals->rss_key, default_rss_key, 40);
3227
3228                 for (i = 0; i < RTE_DIM(internals->reta_conf); i++) {
3229                         internals->reta_conf[i].mask = ~0LL;
3230                         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
3231                                 internals->reta_conf[i].reta[j] = j % dev->data->nb_rx_queues;
3232                 }
3233         }
3234
3235         /* set the max_rx_pktlen */
3236         internals->max_rx_pktlen = internals->candidate_max_rx_pktlen;
3237
3238         /*
3239          * if no kvlist, it means that this bonded device has been created
3240          * through the bonding api.
3241          */
3242         if (!kvlist)
3243                 return 0;
3244
3245         /* Parse MAC address for bonded device */
3246         arg_count = rte_kvargs_count(kvlist, PMD_BOND_MAC_ADDR_KVARG);
3247         if (arg_count == 1) {
3248                 struct ether_addr bond_mac;
3249
3250                 if (rte_kvargs_process(kvlist, PMD_BOND_MAC_ADDR_KVARG,
3251                                 &bond_ethdev_parse_bond_mac_addr_kvarg, &bond_mac) < 0) {
3252                         RTE_LOG(INFO, EAL, "Invalid mac address for bonded device %s\n",
3253                                         name);
3254                         return -1;
3255                 }
3256
3257                 /* Set MAC address */
3258                 if (rte_eth_bond_mac_address_set(port_id, &bond_mac) != 0) {
3259                         RTE_LOG(ERR, EAL,
3260                                         "Failed to set mac address on bonded device %s\n",
3261                                         name);
3262                         return -1;
3263                 }
3264         } else if (arg_count > 1) {
3265                 RTE_LOG(ERR, EAL,
3266                                 "MAC address can be specified only once for bonded device %s\n",
3267                                 name);
3268                 return -1;
3269         }
3270
3271         /* Parse/set balance mode transmit policy */
3272         arg_count = rte_kvargs_count(kvlist, PMD_BOND_XMIT_POLICY_KVARG);
3273         if (arg_count == 1) {
3274                 uint8_t xmit_policy;
3275
3276                 if (rte_kvargs_process(kvlist, PMD_BOND_XMIT_POLICY_KVARG,
3277                                 &bond_ethdev_parse_balance_xmit_policy_kvarg, &xmit_policy) !=
3278                                                 0) {
3279                         RTE_LOG(INFO, EAL,
3280                                         "Invalid xmit policy specified for bonded device %s\n",
3281                                         name);
3282                         return -1;
3283                 }
3284
3285                 /* Set balance mode transmit policy */
3286                 if (rte_eth_bond_xmit_policy_set(port_id, xmit_policy) != 0) {
3287                         RTE_LOG(ERR, EAL,
3288                                         "Failed to set balance xmit policy on bonded device %s\n",
3289                                         name);
3290                         return -1;
3291                 }
3292         } else if (arg_count > 1) {
3293                 RTE_LOG(ERR, EAL,
3294                                 "Transmit policy can be specified only once for bonded device"
3295                                 " %s\n", name);
3296                 return -1;
3297         }
3298
3299         if (rte_kvargs_count(kvlist, PMD_BOND_AGG_MODE_KVARG) == 1) {
3300                 if (rte_kvargs_process(kvlist,
3301                                 PMD_BOND_AGG_MODE_KVARG,
3302                                 &bond_ethdev_parse_slave_agg_mode_kvarg,
3303                                 &agg_mode) != 0) {
3304                         RTE_LOG(ERR, EAL,
3305                                         "Failed to parse agg selection mode for bonded device %s\n",
3306                                         name);
                             return -1;
3307                 }
3308                 if (internals->mode == BONDING_MODE_8023AD)
3309                                 rte_eth_bond_8023ad_agg_selection_set(port_id,
3310                                                 agg_mode);
3311         }
3312
3313         /* Parse/add slave ports to bonded device */
3314         if (rte_kvargs_count(kvlist, PMD_BOND_SLAVE_PORT_KVARG) > 0) {
3315                 struct bond_ethdev_slave_ports slave_ports;
3316                 unsigned i;
3317
3318                 memset(&slave_ports, 0, sizeof(slave_ports));
3319
3320                 if (rte_kvargs_process(kvlist, PMD_BOND_SLAVE_PORT_KVARG,
3321                                 &bond_ethdev_parse_slave_port_kvarg, &slave_ports) != 0) {
3322                         RTE_LOG(ERR, EAL,
3323                                         "Failed to parse slave ports for bonded device %s\n",
3324                                         name);
3325                         return -1;
3326                 }
3327
3328                 for (i = 0; i < slave_ports.slave_count; i++) {
3329                         if (rte_eth_bond_slave_add(port_id, slave_ports.slaves[i]) != 0) {
3330                                 RTE_LOG(ERR, EAL,
3331                                                 "Failed to add port %d as slave to bonded device %s\n",
3332                                                 slave_ports.slaves[i], name);
3333                         }
3334                 }
3335
3336         } else {
3337                 RTE_LOG(INFO, EAL, "No slaves specified for bonded device %s\n", name);
3338                 return -1;
3339         }
3340
3341         /* Parse/set primary slave port id*/
3342         arg_count = rte_kvargs_count(kvlist, PMD_BOND_PRIMARY_SLAVE_KVARG);
3343         if (arg_count == 1) {
3344                 uint16_t primary_slave_port_id;
3345
3346                 if (rte_kvargs_process(kvlist,
3347                                 PMD_BOND_PRIMARY_SLAVE_KVARG,
3348                                 &bond_ethdev_parse_primary_slave_port_id_kvarg,
3349                                 &primary_slave_port_id) < 0) {
3350                         RTE_LOG(INFO, EAL,
3351                                         "Invalid primary slave port id specified for bonded device"
3352                                         " %s\n", name);
3353                         return -1;
3354                 }
3355
3356                 /* Set the primary slave port id */
3357                 if (rte_eth_bond_primary_set(port_id, primary_slave_port_id)
3358                                 != 0) {
3359                         RTE_LOG(ERR, EAL,
3360                                         "Failed to set primary slave port %d on bonded device %s\n",
3361                                         primary_slave_port_id, name);
3362                         return -1;
3363                 }
3364         } else if (arg_count > 1) {
3365                 RTE_LOG(INFO, EAL,
3366                                 "Primary slave can be specified only once for bonded device"
3367                                 " %s\n", name);
3368                 return -1;
3369         }
3370
3371         /* Parse link status monitor polling interval */
3372         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LSC_POLL_PERIOD_KVARG);
3373         if (arg_count == 1) {
3374                 uint32_t lsc_poll_interval_ms;
3375
3376                 if (rte_kvargs_process(kvlist,
3377                                 PMD_BOND_LSC_POLL_PERIOD_KVARG,
3378                                 &bond_ethdev_parse_time_ms_kvarg,
3379                                 &lsc_poll_interval_ms) < 0) {
3380                         RTE_LOG(INFO, EAL,
3381                                         "Invalid lsc polling interval value specified for bonded"
3382                                         " device %s\n", name);
3383                         return -1;
3384                 }
3385
3386                 if (rte_eth_bond_link_monitoring_set(port_id, lsc_poll_interval_ms)
3387                                 != 0) {
3388                         RTE_LOG(ERR, EAL,
3389                                         "Failed to set lsc monitor polling interval (%u ms) on"
3390                                         " bonded device %s\n", lsc_poll_interval_ms, name);
3391                         return -1;
3392                 }
3393         } else if (arg_count > 1) {
3394                 RTE_LOG(INFO, EAL,
3395                                 "LSC polling interval can be specified only once for bonded"
3396                                 " device %s\n", name);
3397                 return -1;
3398         }
3399
3400         /* Parse link up interrupt propagation delay */
3401         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_UP_PROP_DELAY_KVARG);
3402         if (arg_count == 1) {
3403                 uint32_t link_up_delay_ms;
3404
3405                 if (rte_kvargs_process(kvlist,
3406                                 PMD_BOND_LINK_UP_PROP_DELAY_KVARG,
3407                                 &bond_ethdev_parse_time_ms_kvarg,
3408                                 &link_up_delay_ms) < 0) {
3409                         RTE_LOG(INFO, EAL,
3410                                         "Invalid link up propagation delay value specified for"
3411                                         " bonded device %s\n", name);
3412                         return -1;
3413                 }
3414
3415                 /* Set the link up propagation delay */
3416                 if (rte_eth_bond_link_up_prop_delay_set(port_id, link_up_delay_ms)
3417                                 != 0) {
3418                         RTE_LOG(ERR, EAL,
3419                                         "Failed to set link up propagation delay (%u ms) on bonded"
3420                                         " device %s\n", link_up_delay_ms, name);
3421                         return -1;
3422                 }
3423         } else if (arg_count > 1) {
3424                 RTE_LOG(INFO, EAL,
3425                                 "Link up propagation delay can be specified only once for"
3426                                 " bonded device %s\n", name);
3427                 return -1;
3428         }
3429
3430         /* Parse link down interrupt propagation delay */
3431         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG);
3432         if (arg_count == 1) {
3433                 uint32_t link_down_delay_ms;
3434
3435                 if (rte_kvargs_process(kvlist,
3436                                 PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG,
3437                                 &bond_ethdev_parse_time_ms_kvarg,
3438                                 &link_down_delay_ms) < 0) {
3439                         RTE_LOG(INFO, EAL,
3440                                         "Invalid link down propagation delay value specified for"
3441                                         " bonded device %s\n", name);
3442                         return -1;
3443                 }
3444
3445                 /* Set the link down propagation delay */
3446                 if (rte_eth_bond_link_down_prop_delay_set(port_id, link_down_delay_ms)
3447                                 != 0) {
3448                         RTE_LOG(ERR, EAL,
3449                                         "Failed to set link down propagation delay (%u ms) on"
3450                                         " bonded device %s\n", link_down_delay_ms, name);
3451                         return -1;
3452                 }
3453         } else if (arg_count > 1) {
3454                 RTE_LOG(INFO, EAL,
3455                                 "Link down propagation delay can be specified only once for"
3456                                 " bonded device %s\n", name);
3457                 return -1;
3458         }
3459
3460         return 0;
3461 }
3462
3463 struct rte_vdev_driver pmd_bond_drv = {
3464         .probe = bond_probe,
3465         .remove = bond_remove,
3466 };
3467
3468 RTE_PMD_REGISTER_VDEV(net_bonding, pmd_bond_drv);
3469 RTE_PMD_REGISTER_ALIAS(net_bonding, eth_bond);
3470
3471 RTE_PMD_REGISTER_PARAM_STRING(net_bonding,
3472         "slave=<ifc> "
3473         "primary=<ifc> "
3474         "mode=[0-6] "
3475         "xmit_policy=[l2 | l23 | l34] "
3476         "agg_mode=[count | stable | bandwidth] "
3477         "socket_id=<int> "
3478         "mac=<mac addr> "
3479         "lsc_poll_period_ms=<int> "
3480         "up_delay=<int> "
3481         "down_delay=<int>");
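
/*
 * Example usage (illustrative only, all values are placeholders): a bonded
 * device can be created from the EAL command line with, for instance,
 *
 *   --vdev 'net_bonding0,mode=2,slave=0000:01:00.0,slave=0000:01:00.1,xmit_policy=l34'
 *
 * The parameters listed above map to the kvargs parsed in bond_probe() and
 * bond_ethdev_configure().
 */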