net/bonding: do not drop LACPDUs on slaves Tx failure
drivers/net/bonding/rte_eth_bond_pmd.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */
#include <stdlib.h>
#include <netinet/in.h>

#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_ethdev_vdev.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_ip.h>
#include <rte_ip_frag.h>
#include <rte_devargs.h>
#include <rte_kvargs.h>
#include <rte_bus_vdev.h>
#include <rte_alarm.h>
#include <rte_cycles.h>

#include "rte_eth_bond.h"
#include "rte_eth_bond_private.h"
#include "rte_eth_bond_8023ad_private.h"

#define REORDER_PERIOD_MS 10
#define DEFAULT_POLLING_INTERVAL_10_MS (10)

#define HASH_L4_PORTS(h) ((h)->src_port ^ (h)->dst_port)

/* Table for statistics in mode 5 TLB */
static uint64_t tlb_last_obytets[RTE_MAX_ETHPORTS];

static inline size_t
get_vlan_offset(struct ether_hdr *eth_hdr, uint16_t *proto)
{
        size_t vlan_offset = 0;

        if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
                struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);

                vlan_offset = sizeof(struct vlan_hdr);
                *proto = vlan_hdr->eth_proto;

                if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
                        vlan_hdr = vlan_hdr + 1;
                        *proto = vlan_hdr->eth_proto;
                        vlan_offset += sizeof(struct vlan_hdr);
                }
        }
        return vlan_offset;
}
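
/*
 * Editor's note: a minimal usage sketch, not part of the upstream driver,
 * showing what get_vlan_offset() is for: skip up to two VLAN tags and
 * locate the IPv4 header of a frame. The helper name sketch_ipv4_hdr() is
 * hypothetical.
 */
static inline struct ipv4_hdr *
sketch_ipv4_hdr(struct ether_hdr *eth_hdr)
{
        uint16_t proto = eth_hdr->ether_type;
        /* updates proto to the encapsulated ethertype when VLAN tags exist */
        size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);

        if (proto != rte_cpu_to_be_16(ETHER_TYPE_IPv4))
                return NULL;
        return (struct ipv4_hdr *)((char *)(eth_hdr + 1) + vlan_offset);
}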

static uint16_t
bond_ethdev_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_dev_private *internals;

        uint16_t num_rx_slave = 0;
        uint16_t num_rx_total = 0;

        int i;

        /* Cast to structure containing the bonded device's port id and queue id */
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;

        internals = bd_rx_q->dev_private;

        for (i = 0; i < internals->active_slave_count && nb_pkts; i++) {
                /* Offset of pointer to *bufs increases as packets are received
                 * from other slaves */
                num_rx_slave = rte_eth_rx_burst(internals->active_slaves[i],
                                bd_rx_q->queue_id, bufs + num_rx_total, nb_pkts);
                if (num_rx_slave) {
                        num_rx_total += num_rx_slave;
                        nb_pkts -= num_rx_slave;
                }
        }

        return num_rx_total;
}
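
/*
 * Editor's note: an application-side sketch, not part of the PMD, of how the
 * burst function above is reached through the generic ethdev API once the
 * bonded device is started; bond_port_id is a hypothetical application value.
 */
static inline void
bond_rx_poll_sketch(uint16_t bond_port_id)
{
        struct rte_mbuf *pkts[32];
        uint16_t nb_rx, k;

        /* queue 0 of the bonded port fans out across the active slaves */
        nb_rx = rte_eth_rx_burst(bond_port_id, 0, pkts, 32);
        for (k = 0; k < nb_rx; k++)
                rte_pktmbuf_free(pkts[k]); /* or hand off to the application */
}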

static uint16_t
bond_ethdev_rx_burst_active_backup(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;

        /* Cast to structure containing the bonded device's port id and queue id */
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;

        internals = bd_rx_q->dev_private;

        return rte_eth_rx_burst(internals->current_primary_port,
                        bd_rx_q->queue_id, bufs, nb_pkts);
}

static inline uint8_t
is_lacp_packets(uint16_t ethertype, uint8_t subtype, struct rte_mbuf *mbuf)
{
        const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);

        return !((mbuf->ol_flags & PKT_RX_VLAN) ? mbuf->vlan_tci : 0) &&
                (ethertype == ether_type_slow_be &&
                (subtype == SLOW_SUBTYPE_MARKER || subtype == SLOW_SUBTYPE_LACP));
}
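
/*
 * Editor's note: is_lacp_packets() accepts a frame only when all of the
 * following hold: the mbuf carries no VLAN tag (slow protocol frames are
 * untagged), the ethertype is ETHER_TYPE_SLOW, and the slow protocol
 * subtype is a marker or a LACPDU. Everything else is regular traffic.
 */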

/*****************************************************************************
 * Flow director's setup for mode 4 optimization
 */

static struct rte_flow_item_eth flow_item_eth_type_8023ad = {
        .dst.addr_bytes = { 0 },
        .src.addr_bytes = { 0 },
        .type = RTE_BE16(ETHER_TYPE_SLOW),
};

static struct rte_flow_item_eth flow_item_eth_mask_type_8023ad = {
        .dst.addr_bytes = { 0 },
        .src.addr_bytes = { 0 },
        .type = 0xFFFF,
};

static struct rte_flow_item flow_item_8023ad[] = {
        {
                .type = RTE_FLOW_ITEM_TYPE_ETH,
                .spec = &flow_item_eth_type_8023ad,
                .last = NULL,
                .mask = &flow_item_eth_mask_type_8023ad,
        },
        {
                .type = RTE_FLOW_ITEM_TYPE_END,
                .spec = NULL,
                .last = NULL,
                .mask = NULL,
        }
};

const struct rte_flow_attr flow_attr_8023ad = {
        .group = 0,
        .priority = 0,
        .ingress = 1,
        .egress = 0,
        .reserved = 0,
};

int
bond_ethdev_8023ad_flow_verify(struct rte_eth_dev *bond_dev,
                uint16_t slave_port) {
        struct rte_eth_dev_info slave_info;
        struct rte_flow_error error;
        struct bond_dev_private *internals = (struct bond_dev_private *)
                        (bond_dev->data->dev_private);

        const struct rte_flow_action_queue lacp_queue_conf = {
                .index = 0,
        };

        const struct rte_flow_action actions[] = {
                {
                        .type = RTE_FLOW_ACTION_TYPE_QUEUE,
                        .conf = &lacp_queue_conf
                },
                {
                        .type = RTE_FLOW_ACTION_TYPE_END,
                }
        };

        int ret = rte_flow_validate(slave_port, &flow_attr_8023ad,
                        flow_item_8023ad, actions, &error);
        if (ret < 0) {
                RTE_BOND_LOG(ERR, "%s: %s (slave_port=%d queue_id=%d)",
                                __func__, error.message, slave_port,
                                internals->mode4.dedicated_queues.rx_qid);
                return -1;
        }

        rte_eth_dev_info_get(slave_port, &slave_info);
        if (slave_info.max_rx_queues < bond_dev->data->nb_rx_queues ||
                        slave_info.max_tx_queues < bond_dev->data->nb_tx_queues) {
                RTE_BOND_LOG(ERR,
                        "%s: Slave %d capabilities don't allow allocating additional queues",
                        __func__, slave_port);
                return -1;
        }

        return 0;
}

int
bond_8023ad_slow_pkt_hw_filter_supported(uint16_t port_id) {
        struct rte_eth_dev *bond_dev = &rte_eth_devices[port_id];
        struct bond_dev_private *internals = (struct bond_dev_private *)
                        (bond_dev->data->dev_private);
        struct rte_eth_dev_info bond_info;
        uint16_t idx;

        /* Verify that all slaves in the bond support flow director */
        if (internals->slave_count > 0) {
                rte_eth_dev_info_get(bond_dev->data->port_id, &bond_info);

                internals->mode4.dedicated_queues.rx_qid = bond_info.nb_rx_queues;
                internals->mode4.dedicated_queues.tx_qid = bond_info.nb_tx_queues;

                for (idx = 0; idx < internals->slave_count; idx++) {
                        if (bond_ethdev_8023ad_flow_verify(bond_dev,
                                        internals->slaves[idx].port_id) != 0)
                                return -1;
                }
        }

        return 0;
}

int
bond_ethdev_8023ad_flow_set(struct rte_eth_dev *bond_dev, uint16_t slave_port) {

        struct rte_flow_error error;
        struct bond_dev_private *internals = (struct bond_dev_private *)
                        (bond_dev->data->dev_private);

        struct rte_flow_action_queue lacp_queue_conf = {
                .index = internals->mode4.dedicated_queues.rx_qid,
        };

        const struct rte_flow_action actions[] = {
                {
                        .type = RTE_FLOW_ACTION_TYPE_QUEUE,
                        .conf = &lacp_queue_conf
                },
                {
                        .type = RTE_FLOW_ACTION_TYPE_END,
                }
        };

        internals->mode4.dedicated_queues.flow[slave_port] = rte_flow_create(slave_port,
                        &flow_attr_8023ad, flow_item_8023ad, actions, &error);
        if (internals->mode4.dedicated_queues.flow[slave_port] == NULL) {
                RTE_BOND_LOG(ERR, "bond_ethdev_8023ad_flow_set: %s "
                                "(slave_port=%d queue_id=%d)",
                                error.message, slave_port,
                                internals->mode4.dedicated_queues.rx_qid);
                return -1;
        }

        return 0;
}
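
/*
 * Editor's note: a minimal teardown sketch, not part of this file, for the
 * flow created above. rte_flow_destroy() is the standard counterpart of
 * rte_flow_create(); the helper name is hypothetical.
 */
static inline int
bond_ethdev_8023ad_flow_clear_sketch(struct bond_dev_private *internals,
                uint16_t slave_port)
{
        struct rte_flow_error error;
        struct rte_flow *flow = internals->mode4.dedicated_queues.flow[slave_port];

        if (flow == NULL)
                return 0;

        /* Drop the hardware rule steering LACPDUs to the dedicated queue */
        if (rte_flow_destroy(slave_port, flow, &error) != 0)
                return -1;

        internals->mode4.dedicated_queues.flow[slave_port] = NULL;
        return 0;
}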

static uint16_t
bond_ethdev_rx_burst_8023ad_fast_queue(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
        struct bond_dev_private *internals = bd_rx_q->dev_private;
        uint16_t num_rx_total = 0;      /* Total number of received packets */
        uint16_t slaves[RTE_MAX_ETHPORTS];
        uint16_t slave_count;

        uint16_t i, idx;

        /* Copy slave list to protect against slave up/down changes during rx
         * bursting */
        slave_count = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * slave_count);

        for (i = 0, idx = internals->active_slave;
                        i < slave_count && num_rx_total < nb_pkts; i++, idx++) {
                idx = idx % slave_count;

                /* Read packets from this slave */
                num_rx_total += rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id,
                                &bufs[num_rx_total], nb_pkts - num_rx_total);
        }

        internals->active_slave = idx;

        return num_rx_total;
}

static uint16_t
bond_ethdev_tx_burst_8023ad_fast_queue(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        uint16_t num_of_slaves;
        uint16_t slaves[RTE_MAX_ETHPORTS];
        /* positions in the slaves array, not port ids */
        uint8_t distributing_offsets[RTE_MAX_ETHPORTS];
        uint8_t distributing_count;

        uint16_t num_tx_slave, num_tx_total = 0, num_tx_fail_total = 0;
        uint16_t i, op_slave_idx;

        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];

        /* Total amount of packets in slave_bufs */
        uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };

        if (unlikely(nb_pkts == 0))
                return 0;

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        if (num_of_slaves < 1)
                return num_tx_total;

        memcpy(slaves, internals->active_slaves, sizeof(slaves[0]) *
                        num_of_slaves);

        distributing_count = 0;
        for (i = 0; i < num_of_slaves; i++) {
                struct port *port = &mode_8023ad_ports[slaves[i]];
                if (ACTOR_STATE(port, DISTRIBUTING))
                        distributing_offsets[distributing_count++] = i;
        }

        if (likely(distributing_count > 0)) {
                /* Populate the per-slave mbuf arrays with the packets to send */
                for (i = 0; i < nb_pkts; i++) {
                        /* Select output slave using hash based on xmit policy */
                        op_slave_idx = internals->xmit_hash(bufs[i],
                                        distributing_count);

                        /* Populate slave mbuf arrays with mbufs for that slave.
                         * Use only slaves that are currently distributing.
                         */
                        uint8_t slave_offset =
                                        distributing_offsets[op_slave_idx];
                        slave_bufs[slave_offset][slave_nb_pkts[slave_offset]] =
                                        bufs[i];
                        slave_nb_pkts[slave_offset]++;
                }
        }

        /* Send packet burst on each slave device */
        for (i = 0; i < num_of_slaves; i++) {
                if (slave_nb_pkts[i] == 0)
                        continue;

                num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                slave_bufs[i], slave_nb_pkts[i]);

                num_tx_total += num_tx_slave;
                num_tx_fail_total += slave_nb_pkts[i] - num_tx_slave;

                /* If tx burst fails move packets to end of bufs */
                if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
                        uint16_t j = nb_pkts - num_tx_fail_total;
                        for ( ; num_tx_slave < slave_nb_pkts[i]; j++,
                                        num_tx_slave++)
                                bufs[j] = slave_bufs[i][num_tx_slave];
                }
        }

        return num_tx_total;
}

static uint16_t
bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        /* Cast to structure containing the bonded device's port id and queue id */
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
        struct bond_dev_private *internals = bd_rx_q->dev_private;
        struct ether_addr bond_mac;

        struct ether_hdr *hdr;

        const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
        uint16_t num_rx_total = 0;      /* Total number of received packets */
        uint16_t slaves[RTE_MAX_ETHPORTS];
        uint16_t slave_count, idx;

        uint8_t collecting;  /* current slave collecting status */
        const uint8_t promisc = internals->promiscuous_en;
        uint16_t i, j, k;
        uint8_t subtype;

        rte_eth_macaddr_get(internals->port_id, &bond_mac);
        /* Copy slave list to protect against slave up/down changes during rx
         * bursting */
        slave_count = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * slave_count);

        idx = internals->active_slave;
        if (idx >= slave_count) {
                internals->active_slave = 0;
                idx = 0;
        }
        for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
                j = num_rx_total;
                collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
                                         COLLECTING);

                /* Read packets from this slave */
                num_rx_total += rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id,
                                &bufs[num_rx_total], nb_pkts - num_rx_total);

                for (k = j; k < 2 && k < num_rx_total; k++)
                        rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));

                /* Handle slow protocol packets. */
                while (j < num_rx_total) {

                        /* If the packet type is known and not pure L2, it
                         * cannot be a slow protocol frame; skip it */
                        if ((bufs[j]->packet_type & ~RTE_PTYPE_L2_ETHER) != 0) {
                                j++;
                                continue;
                        }

                        if (j + 3 < num_rx_total)
                                rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *));

                        hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
                        subtype = ((struct slow_protocol_frame *)hdr)->slow_protocol.subtype;

                        /* Remove the packet from the array if it is a slow
                         * packet, if the slave is not in collecting state, or
                         * if the bonded interface is not in promiscuous mode
                         * and the destination address does not match. */
                        if (unlikely(is_lacp_packets(hdr->ether_type, subtype, bufs[j]) ||
                                !collecting || (!promisc &&
                                        !is_multicast_ether_addr(&hdr->d_addr) &&
                                        !is_same_ether_addr(&bond_mac, &hdr->d_addr)))) {

                                if (hdr->ether_type == ether_type_slow_be) {
                                        bond_mode_8023ad_handle_slow_pkt(
                                            internals, slaves[idx], bufs[j]);
                                } else
                                        rte_pktmbuf_free(bufs[j]);

                                /* Packet is managed by mode 4 or dropped, shift the array */
                                num_rx_total--;
                                if (j < num_rx_total) {
                                        memmove(&bufs[j], &bufs[j + 1], sizeof(bufs[0]) *
                                                (num_rx_total - j));
                                }
                        } else
                                j++;
                }
                if (unlikely(++idx == slave_count))
                        idx = 0;
        }

        internals->active_slave = idx;
        return num_rx_total;
}

#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
uint32_t burstnumberRX;
uint32_t burstnumberTX;

#ifdef RTE_LIBRTE_BOND_DEBUG_ALB

static void
arp_op_name(uint16_t arp_op, char *buf)
{
        switch (arp_op) {
        case ARP_OP_REQUEST:
                snprintf(buf, sizeof("ARP Request"), "%s", "ARP Request");
                return;
        case ARP_OP_REPLY:
                snprintf(buf, sizeof("ARP Reply"), "%s", "ARP Reply");
                return;
        case ARP_OP_REVREQUEST:
                snprintf(buf, sizeof("Reverse ARP Request"), "%s",
                                "Reverse ARP Request");
                return;
        case ARP_OP_REVREPLY:
                snprintf(buf, sizeof("Reverse ARP Reply"), "%s",
                                "Reverse ARP Reply");
                return;
        case ARP_OP_INVREQUEST:
                snprintf(buf, sizeof("Peer Identify Request"), "%s",
                                "Peer Identify Request");
                return;
        case ARP_OP_INVREPLY:
                snprintf(buf, sizeof("Peer Identify Reply"), "%s",
                                "Peer Identify Reply");
                return;
        default:
                break;
        }
        snprintf(buf, sizeof("Unknown"), "%s", "Unknown");
        return;
}
#endif
#define MaxIPv4String   16
static void
ipv4_addr_to_dot(uint32_t be_ipv4_addr, char *buf, uint8_t buf_size)
{
        uint32_t ipv4_addr;

        ipv4_addr = rte_be_to_cpu_32(be_ipv4_addr);
        snprintf(buf, buf_size, "%d.%d.%d.%d", (ipv4_addr >> 24) & 0xFF,
                (ipv4_addr >> 16) & 0xFF, (ipv4_addr >> 8) & 0xFF,
                ipv4_addr & 0xFF);
}

#define MAX_CLIENTS_NUMBER      128
uint8_t active_clients;
struct client_stats_t {
        uint16_t port;
        uint32_t ipv4_addr;
        uint32_t ipv4_rx_packets;
        uint32_t ipv4_tx_packets;
};
struct client_stats_t client_stats[MAX_CLIENTS_NUMBER];

static void
update_client_stats(uint32_t addr, uint16_t port, uint32_t *TXorRXindicator)
{
        int i = 0;

        for (; i < MAX_CLIENTS_NUMBER; i++) {
                if ((client_stats[i].ipv4_addr == addr) && (client_stats[i].port == port)) {
                        /* Just update the packet count for this client */
                        if (TXorRXindicator == &burstnumberRX)
                                client_stats[i].ipv4_rx_packets++;
                        else
                                client_stats[i].ipv4_tx_packets++;
                        return;
                }
        }
        /* We have a new client. Insert it into the table and update the stats */
        if (TXorRXindicator == &burstnumberRX)
                client_stats[active_clients].ipv4_rx_packets++;
        else
                client_stats[active_clients].ipv4_tx_packets++;
        client_stats[active_clients].ipv4_addr = addr;
        client_stats[active_clients].port = port;
        active_clients++;
}

#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
#define MODE6_DEBUG(info, src_ip, dst_ip, eth_h, arp_op, port, burstnumber)     \
                RTE_LOG(DEBUG, PMD, \
                "%s " \
                "port:%d " \
                "SrcMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
                "SrcIP:%s " \
                "DstMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
                "DstIP:%s " \
                "%s " \
                "%d\n", \
                info, \
                port, \
                eth_h->s_addr.addr_bytes[0], \
                eth_h->s_addr.addr_bytes[1], \
                eth_h->s_addr.addr_bytes[2], \
                eth_h->s_addr.addr_bytes[3], \
                eth_h->s_addr.addr_bytes[4], \
                eth_h->s_addr.addr_bytes[5], \
                src_ip, \
                eth_h->d_addr.addr_bytes[0], \
                eth_h->d_addr.addr_bytes[1], \
                eth_h->d_addr.addr_bytes[2], \
                eth_h->d_addr.addr_bytes[3], \
                eth_h->d_addr.addr_bytes[4], \
                eth_h->d_addr.addr_bytes[5], \
                dst_ip, \
                arp_op, \
                ++burstnumber)
#endif

static void
mode6_debug(const char __attribute__((unused)) *info, struct ether_hdr *eth_h,
                uint16_t port, uint32_t __attribute__((unused)) *burstnumber)
{
        struct ipv4_hdr *ipv4_h;
#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
        struct arp_hdr *arp_h;
        char dst_ip[16];
        char ArpOp[24];
        char buf[16];
#endif
        char src_ip[16];

        uint16_t ether_type = eth_h->ether_type;
        uint16_t offset = get_vlan_offset(eth_h, &ether_type);

#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
        snprintf(buf, 16, "%s", info);
#endif

        if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
                ipv4_h = (struct ipv4_hdr *)((char *)(eth_h + 1) + offset);
                ipv4_addr_to_dot(ipv4_h->src_addr, src_ip, MaxIPv4String);
#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
                ipv4_addr_to_dot(ipv4_h->dst_addr, dst_ip, MaxIPv4String);
                MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, "", port, *burstnumber);
#endif
                update_client_stats(ipv4_h->src_addr, port, burstnumber);
        }
#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
        else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
                arp_h = (struct arp_hdr *)((char *)(eth_h + 1) + offset);
                ipv4_addr_to_dot(arp_h->arp_data.arp_sip, src_ip, MaxIPv4String);
                ipv4_addr_to_dot(arp_h->arp_data.arp_tip, dst_ip, MaxIPv4String);
                arp_op_name(rte_be_to_cpu_16(arp_h->arp_op), ArpOp);
                MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, ArpOp, port, *burstnumber);
        }
#endif
}
#endif

static uint16_t
bond_ethdev_rx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
        struct bond_dev_private *internals = bd_tx_q->dev_private;
        struct ether_hdr *eth_h;
        uint16_t ether_type, offset;
        uint16_t nb_recv_pkts;
        int i;

        nb_recv_pkts = bond_ethdev_rx_burst(queue, bufs, nb_pkts);

        for (i = 0; i < nb_recv_pkts; i++) {
                eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
                ether_type = eth_h->ether_type;
                offset = get_vlan_offset(eth_h, &ether_type);

                if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                        mode6_debug("RX ARP:", eth_h, bufs[i]->port, &burstnumberRX);
#endif
                        bond_mode_alb_arp_recv(eth_h, offset, internals);
                }
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4))
                        mode6_debug("RX IPv4:", eth_h, bufs[i]->port, &burstnumberRX);
#endif
        }

        return nb_recv_pkts;
}

static uint16_t
bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
        uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };

        uint16_t num_of_slaves;
        uint16_t slaves[RTE_MAX_ETHPORTS];

        uint16_t num_tx_total = 0, num_tx_slave;

        static int slave_idx = 0;
        int i, cslave_idx = 0, tx_fail_total = 0;

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * num_of_slaves);

        if (num_of_slaves < 1)
                return num_tx_total;

        /* Populate the per-slave mbuf arrays with the packets to be sent */
        for (i = 0; i < nb_pkts; i++) {
                cslave_idx = (slave_idx + i) % num_of_slaves;
                slave_bufs[cslave_idx][(slave_nb_pkts[cslave_idx])++] = bufs[i];
        }

        /* increment current slave index so the next call to tx burst starts on the
         * next slave */
        slave_idx = ++cslave_idx;

        /* Send packet burst on each slave device */
        for (i = 0; i < num_of_slaves; i++) {
                if (slave_nb_pkts[i] > 0) {
                        num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                        slave_bufs[i], slave_nb_pkts[i]);

                        /* if tx burst fails move packets to end of bufs */
                        if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
                                int tx_fail_slave = slave_nb_pkts[i] - num_tx_slave;

                                tx_fail_total += tx_fail_slave;

                                memcpy(&bufs[nb_pkts - tx_fail_total],
                                                &slave_bufs[i][num_tx_slave],
                                                tx_fail_slave * sizeof(bufs[0]));
                        }
                        num_tx_total += num_tx_slave;
                }
        }

        return num_tx_total;
}
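
/*
 * Editor's note: on a partial Tx the failed mbufs are copied back to the
 * tail of the caller's bufs[] array. For example, with nb_pkts = 8 and two
 * packets failing on one slave, those two pointers end up in bufs[6] and
 * bufs[7], and the function returns 6; per the rte_eth_tx_burst() contract
 * the caller then owns (retries or frees) everything past the return value.
 * The same tail-return convention is used by the balance and 802.3ad Tx
 * paths below.
 */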

static uint16_t
bond_ethdev_tx_burst_active_backup(void *queue,
                struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        if (internals->active_slave_count < 1)
                return 0;

        return rte_eth_tx_burst(internals->current_primary_port, bd_tx_q->queue_id,
                        bufs, nb_pkts);
}

static inline uint16_t
ether_hash(struct ether_hdr *eth_hdr)
{
        unaligned_uint16_t *word_src_addr =
                (unaligned_uint16_t *)eth_hdr->s_addr.addr_bytes;
        unaligned_uint16_t *word_dst_addr =
                (unaligned_uint16_t *)eth_hdr->d_addr.addr_bytes;

        return (word_src_addr[0] ^ word_dst_addr[0]) ^
                        (word_src_addr[1] ^ word_dst_addr[1]) ^
                        (word_src_addr[2] ^ word_dst_addr[2]);
}

static inline uint32_t
ipv4_hash(struct ipv4_hdr *ipv4_hdr)
{
        return ipv4_hdr->src_addr ^ ipv4_hdr->dst_addr;
}

static inline uint32_t
ipv6_hash(struct ipv6_hdr *ipv6_hdr)
{
        unaligned_uint32_t *word_src_addr =
                (unaligned_uint32_t *)&(ipv6_hdr->src_addr[0]);
        unaligned_uint32_t *word_dst_addr =
                (unaligned_uint32_t *)&(ipv6_hdr->dst_addr[0]);

        return (word_src_addr[0] ^ word_dst_addr[0]) ^
                        (word_src_addr[1] ^ word_dst_addr[1]) ^
                        (word_src_addr[2] ^ word_dst_addr[2]) ^
                        (word_src_addr[3] ^ word_dst_addr[3]);
}

uint16_t
xmit_l2_hash(const struct rte_mbuf *buf, uint8_t slave_count)
{
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);

        uint32_t hash = ether_hash(eth_hdr);

        return (hash ^= hash >> 8) % slave_count;
}

uint16_t
xmit_l23_hash(const struct rte_mbuf *buf, uint8_t slave_count)
{
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
        uint16_t proto = eth_hdr->ether_type;
        size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);
        uint32_t hash, l3hash = 0;

        hash = ether_hash(eth_hdr);

        if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
                struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                l3hash = ipv4_hash(ipv4_hdr);

        } else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
                struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                l3hash = ipv6_hash(ipv6_hdr);
        }

        hash = hash ^ l3hash;
        hash ^= hash >> 16;
        hash ^= hash >> 8;

        return hash % slave_count;
}

uint16_t
xmit_l34_hash(const struct rte_mbuf *buf, uint8_t slave_count)
{
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
        uint16_t proto = eth_hdr->ether_type;
        size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);

        struct udp_hdr *udp_hdr = NULL;
        struct tcp_hdr *tcp_hdr = NULL;
        uint32_t hash, l3hash = 0, l4hash = 0;

        if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
                struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                size_t ip_hdr_offset;

                l3hash = ipv4_hash(ipv4_hdr);

                /* there is no L4 header in a fragmented packet */
                if (likely(rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr) == 0)) {
                        ip_hdr_offset = (ipv4_hdr->version_ihl & IPV4_HDR_IHL_MASK) *
                                        IPV4_IHL_MULTIPLIER;

                        if (ipv4_hdr->next_proto_id == IPPROTO_TCP) {
                                tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr +
                                                ip_hdr_offset);
                                l4hash = HASH_L4_PORTS(tcp_hdr);
                        } else if (ipv4_hdr->next_proto_id == IPPROTO_UDP) {
                                udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr +
                                                ip_hdr_offset);
                                l4hash = HASH_L4_PORTS(udp_hdr);
                        }
                }
        } else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
                struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                l3hash = ipv6_hash(ipv6_hdr);

                if (ipv6_hdr->proto == IPPROTO_TCP) {
                        tcp_hdr = (struct tcp_hdr *)(ipv6_hdr + 1);
                        l4hash = HASH_L4_PORTS(tcp_hdr);
                } else if (ipv6_hdr->proto == IPPROTO_UDP) {
                        udp_hdr = (struct udp_hdr *)(ipv6_hdr + 1);
                        l4hash = HASH_L4_PORTS(udp_hdr);
                }
        }

        hash = l3hash ^ l4hash;
        hash ^= hash >> 16;
        hash ^= hash >> 8;

        return hash % slave_count;
}
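
/*
 * Editor's note: an isolated sketch, not upstream code, of the xor-fold used
 * by the L3+L4 policy above. Because xor is symmetric under swapping source
 * and destination, both directions of a flow hash to the same slave.
 */
static inline uint16_t
xmit_fold_sketch(uint32_t l3hash, uint32_t l4hash, uint8_t slave_count)
{
        uint32_t hash = l3hash ^ l4hash;

        /* fold the upper bits down so the low byte sees all 32 bits */
        hash ^= hash >> 16;
        hash ^= hash >> 8;
        return hash % slave_count;
}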

struct bwg_slave {
        uint64_t bwg_left_int;
        uint64_t bwg_left_remainder;
        uint8_t slave;
};

void
bond_tlb_activate_slave(struct bond_dev_private *internals) {
        int i;

        for (i = 0; i < internals->active_slave_count; i++) {
                tlb_last_obytets[internals->active_slaves[i]] = 0;
        }
}

static int
bandwidth_cmp(const void *a, const void *b)
{
        const struct bwg_slave *bwg_a = a;
        const struct bwg_slave *bwg_b = b;
        int64_t diff = (int64_t)bwg_b->bwg_left_int - (int64_t)bwg_a->bwg_left_int;
        int64_t diff2 = (int64_t)bwg_b->bwg_left_remainder -
                        (int64_t)bwg_a->bwg_left_remainder;
        if (diff > 0)
                return 1;
        else if (diff < 0)
                return -1;
        else if (diff2 > 0)
                return 1;
        else if (diff2 < 0)
                return -1;
        else
                return 0;
}

static void
bandwidth_left(uint16_t port_id, uint64_t load, uint8_t update_idx,
                struct bwg_slave *bwg_slave)
{
        struct rte_eth_link link_status;

        rte_eth_link_get_nowait(port_id, &link_status);
        uint64_t link_bwg = link_status.link_speed * 1000000ULL / 8;
        if (link_bwg == 0)
                return;
        link_bwg = link_bwg * (update_idx+1) * REORDER_PERIOD_MS;
        bwg_slave->bwg_left_int = (link_bwg - 1000*load) / link_bwg;
        bwg_slave->bwg_left_remainder = (link_bwg - 1000*load) % link_bwg;
}
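
/*
 * Editor's note on units: link_speed as reported by rte_eth_link_get_nowait()
 * is in Mbps, so for a 10G slave link_bwg starts as 10000 * 1000000 / 8 bytes
 * per second before being scaled by the elapsed reorder periods. bwg_left_int
 * and bwg_left_remainder hold the quotient and remainder of the remaining-
 * bandwidth computation, so bandwidth_cmp() can order slaves without losing
 * precision to integer division.
 */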

static void
bond_ethdev_update_tlb_slave_cb(void *arg)
{
        struct bond_dev_private *internals = arg;
        struct rte_eth_stats slave_stats;
        struct bwg_slave bwg_array[RTE_MAX_ETHPORTS];
        uint8_t slave_count;
        uint64_t tx_bytes;

        uint8_t update_stats = 0;
        uint8_t i, slave_id;

        internals->slave_update_idx++;

        if (internals->slave_update_idx >= REORDER_PERIOD_MS)
                update_stats = 1;

        for (i = 0; i < internals->active_slave_count; i++) {
                slave_id = internals->active_slaves[i];
                rte_eth_stats_get(slave_id, &slave_stats);
                tx_bytes = slave_stats.obytes - tlb_last_obytets[slave_id];
                bandwidth_left(slave_id, tx_bytes,
                                internals->slave_update_idx, &bwg_array[i]);
                bwg_array[i].slave = slave_id;

                if (update_stats) {
                        tlb_last_obytets[slave_id] = slave_stats.obytes;
                }
        }

        if (update_stats == 1)
                internals->slave_update_idx = 0;

        slave_count = i;
        qsort(bwg_array, slave_count, sizeof(bwg_array[0]), bandwidth_cmp);
        for (i = 0; i < slave_count; i++)
                internals->tlb_slaves_order[i] = bwg_array[i].slave;

        rte_eal_alarm_set(REORDER_PERIOD_MS * 1000, bond_ethdev_update_tlb_slave_cb,
                        (struct bond_dev_private *)internals);
}

static uint16_t
bond_ethdev_tx_burst_tlb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
        struct bond_dev_private *internals = bd_tx_q->dev_private;

        struct rte_eth_dev *primary_port =
                        &rte_eth_devices[internals->primary_port];
        uint16_t num_tx_total = 0;
        uint16_t i, j;

        uint16_t num_of_slaves = internals->active_slave_count;
        uint16_t slaves[RTE_MAX_ETHPORTS];

        struct ether_hdr *ether_hdr;
        struct ether_addr primary_slave_addr;
        struct ether_addr active_slave_addr;

        if (num_of_slaves < 1)
                return num_tx_total;

        memcpy(slaves, internals->tlb_slaves_order,
                                sizeof(internals->tlb_slaves_order[0]) * num_of_slaves);

        ether_addr_copy(primary_port->data->mac_addrs, &primary_slave_addr);

        if (nb_pkts > 3) {
                for (i = 0; i < 3; i++)
                        rte_prefetch0(rte_pktmbuf_mtod(bufs[i], void*));
        }

        for (i = 0; i < num_of_slaves; i++) {
                rte_eth_macaddr_get(slaves[i], &active_slave_addr);
                for (j = num_tx_total; j < nb_pkts; j++) {
                        if (j + 3 < nb_pkts)
                                rte_prefetch0(rte_pktmbuf_mtod(bufs[j+3], void*));

                        ether_hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
                        if (is_same_ether_addr(&ether_hdr->s_addr, &primary_slave_addr))
                                ether_addr_copy(&active_slave_addr, &ether_hdr->s_addr);
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                        mode6_debug("TX IPv4:", ether_hdr, slaves[i], &burstnumberTX);
#endif
                }

                num_tx_total += rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                bufs + num_tx_total, nb_pkts - num_tx_total);

                if (num_tx_total == nb_pkts)
                        break;
        }

        return num_tx_total;
}

void
bond_tlb_disable(struct bond_dev_private *internals)
{
        rte_eal_alarm_cancel(bond_ethdev_update_tlb_slave_cb, internals);
}

void
bond_tlb_enable(struct bond_dev_private *internals)
{
        bond_ethdev_update_tlb_slave_cb(internals);
}

static uint16_t
bond_ethdev_tx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
        struct bond_dev_private *internals = bd_tx_q->dev_private;

        struct ether_hdr *eth_h;
        uint16_t ether_type, offset;

        struct client_data *client_info;

        /*
         * We create transmit buffers for every slave and one additional to send
         * through tlb. In the worst case every packet will be sent on one port.
         */
        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS + 1][nb_pkts];
        uint16_t slave_bufs_pkts[RTE_MAX_ETHPORTS + 1] = { 0 };

        /*
         * We create separate transmit buffers for update packets as they won't
         * be counted in num_tx_total.
         */
        struct rte_mbuf *update_bufs[RTE_MAX_ETHPORTS][ALB_HASH_TABLE_SIZE];
        uint16_t update_bufs_pkts[RTE_MAX_ETHPORTS] = { 0 };

        struct rte_mbuf *upd_pkt;
        size_t pkt_size;

        uint16_t num_send, num_not_send = 0;
        uint16_t num_tx_total = 0;
        uint16_t slave_idx;

        int i, j;

        /* Search tx buffer for ARP packets and forward them to alb */
        for (i = 0; i < nb_pkts; i++) {
                eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
                ether_type = eth_h->ether_type;
                offset = get_vlan_offset(eth_h, &ether_type);

                if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
                        slave_idx = bond_mode_alb_arp_xmit(eth_h, offset, internals);

                        /* Change src mac in eth header */
                        rte_eth_macaddr_get(slave_idx, &eth_h->s_addr);

                        /* Add packet to slave tx buffer */
                        slave_bufs[slave_idx][slave_bufs_pkts[slave_idx]] = bufs[i];
                        slave_bufs_pkts[slave_idx]++;
                } else {
                        /* If packet is not ARP, send it with TLB policy */
                        slave_bufs[RTE_MAX_ETHPORTS][slave_bufs_pkts[RTE_MAX_ETHPORTS]] =
                                        bufs[i];
                        slave_bufs_pkts[RTE_MAX_ETHPORTS]++;
                }
        }

        /* Update connected client ARP tables */
        if (internals->mode6.ntt) {
                for (i = 0; i < ALB_HASH_TABLE_SIZE; i++) {
                        client_info = &internals->mode6.client_table[i];

                        if (client_info->in_use) {
                                /* Allocate new packet to send ARP update on current slave */
                                upd_pkt = rte_pktmbuf_alloc(internals->mode6.mempool);
                                if (upd_pkt == NULL) {
                                        RTE_LOG(ERR, PMD, "Failed to allocate ARP packet from pool\n");
                                        continue;
                                }
                                pkt_size = sizeof(struct ether_hdr) + sizeof(struct arp_hdr)
                                                + client_info->vlan_count * sizeof(struct vlan_hdr);
                                upd_pkt->data_len = pkt_size;
                                upd_pkt->pkt_len = pkt_size;

                                slave_idx = bond_mode_alb_arp_upd(client_info, upd_pkt,
                                                internals);

                                /* Add packet to update tx buffer */
                                update_bufs[slave_idx][update_bufs_pkts[slave_idx]] = upd_pkt;
                                update_bufs_pkts[slave_idx]++;
                        }
                }
                internals->mode6.ntt = 0;
        }

        /* Send ARP packets on proper slaves */
        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (slave_bufs_pkts[i] > 0) {
                        num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id,
                                        slave_bufs[i], slave_bufs_pkts[i]);
                        /* Return the unsent tail of this slave's buffer to the
                         * end of the caller's array */
                        for (j = 0; j < slave_bufs_pkts[i] - num_send; j++) {
                                bufs[nb_pkts - 1 - num_not_send - j] =
                                                slave_bufs[i][slave_bufs_pkts[i] - 1 - j];
                        }

                        num_tx_total += num_send;
                        num_not_send += slave_bufs_pkts[i] - num_send;

#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
        /* Print TX stats including update packets */
                        for (j = 0; j < slave_bufs_pkts[i]; j++) {
                                eth_h = rte_pktmbuf_mtod(slave_bufs[i][j], struct ether_hdr *);
                                mode6_debug("TX ARP:", eth_h, i, &burstnumberTX);
                        }
#endif
                }
        }

        /* Send update packets on proper slaves */
        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (update_bufs_pkts[i] > 0) {
                        num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id, update_bufs[i],
                                        update_bufs_pkts[i]);
                        for (j = num_send; j < update_bufs_pkts[i]; j++) {
                                rte_pktmbuf_free(update_bufs[i][j]);
                        }
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                        for (j = 0; j < update_bufs_pkts[i]; j++) {
                                eth_h = rte_pktmbuf_mtod(update_bufs[i][j], struct ether_hdr *);
                                mode6_debug("TX ARPupd:", eth_h, i, &burstnumberTX);
                        }
#endif
                }
        }

        /* Send non-ARP packets using tlb policy */
        if (slave_bufs_pkts[RTE_MAX_ETHPORTS] > 0) {
                num_send = bond_ethdev_tx_burst_tlb(queue,
                                slave_bufs[RTE_MAX_ETHPORTS],
                                slave_bufs_pkts[RTE_MAX_ETHPORTS]);

                /* Return only the unsent tail to the end of the caller's array */
                for (j = 0; j < slave_bufs_pkts[RTE_MAX_ETHPORTS] - num_send; j++) {
                        bufs[nb_pkts - 1 - num_not_send - j] =
                                        slave_bufs[RTE_MAX_ETHPORTS]
                                                [slave_bufs_pkts[RTE_MAX_ETHPORTS] - 1 - j];
                }

                num_tx_total += num_send;
        }

        return num_tx_total;
}

static uint16_t
bond_ethdev_tx_burst_balance(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        uint16_t num_of_slaves;
        uint16_t slaves[RTE_MAX_ETHPORTS];

        uint16_t num_tx_total = 0, num_tx_slave = 0, tx_fail_total = 0;

        int i, op_slave_id;

        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
        uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * num_of_slaves);

        if (num_of_slaves < 1)
                return num_tx_total;

        /* Populate the per-slave mbuf arrays with the packets to be sent */
        for (i = 0; i < nb_pkts; i++) {
                /* Select output slave using hash based on xmit policy */
                op_slave_id = internals->xmit_hash(bufs[i], num_of_slaves);

                /* Populate slave mbuf arrays with mbufs for that slave */
                slave_bufs[op_slave_id][slave_nb_pkts[op_slave_id]++] = bufs[i];
        }

        /* Send packet burst on each slave device */
        for (i = 0; i < num_of_slaves; i++) {
                if (slave_nb_pkts[i] > 0) {
                        num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                        slave_bufs[i], slave_nb_pkts[i]);

                        /* if tx burst fails move packets to end of bufs */
                        if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
                                int slave_tx_fail_count = slave_nb_pkts[i] - num_tx_slave;

                                tx_fail_total += slave_tx_fail_count;
                                memcpy(&bufs[nb_pkts - tx_fail_total],
                                                &slave_bufs[i][num_tx_slave],
                                                slave_tx_fail_count * sizeof(bufs[0]));
                        }

                        num_tx_total += num_tx_slave;
                }
        }

        return num_tx_total;
}

static uint16_t
bond_ethdev_tx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        uint16_t num_of_slaves;
        uint16_t slaves[RTE_MAX_ETHPORTS];
        /* positions in the slaves array, not port ids */
        uint8_t distributing_offsets[RTE_MAX_ETHPORTS];
        uint8_t distributing_count;

        uint16_t num_tx_slave, num_tx_total = 0, num_tx_fail_total = 0;
        uint16_t i, op_slave_idx;

        /* Allocate additional packet buffers for 802.3ad mode */
        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];

        /* Total amount of packets in slave_bufs */
        uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
        /* Slow packets placed in each slave */
        uint8_t slave_slow_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        if (num_of_slaves < 1)
                return num_tx_total;

        memcpy(slaves, internals->active_slaves, sizeof(slaves[0]) * num_of_slaves);

        distributing_count = 0;
        for (i = 0; i < num_of_slaves; i++) {
                struct port *port = &mode_8023ad_ports[slaves[i]];

                if (ACTOR_STATE(port, DISTRIBUTING))
                        distributing_offsets[distributing_count++] = i;
        }

        if (likely(distributing_count > 0)) {
                /* Populate the per-slave mbuf arrays with the packets to send */
                for (i = 0; i < nb_pkts; i++) {
                        /* Select output slave using hash based on xmit policy */
                        op_slave_idx = internals->xmit_hash(bufs[i], distributing_count);

                        /* Populate slave mbuf arrays with mbufs for that slave. Use only
                         * slaves that are currently distributing. */
                        uint8_t slave_offset = distributing_offsets[op_slave_idx];
                        slave_bufs[slave_offset][slave_nb_pkts[slave_offset]] = bufs[i];
                        slave_nb_pkts[slave_offset]++;
                }
        }

        /* Send packet burst on each slave device */
        for (i = 0; i < num_of_slaves; i++) {
                if (slave_nb_pkts[i] == 0)
                        continue;

                num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                slave_bufs[i], slave_nb_pkts[i]);

                /* If tx burst fails drop slow packets */
                for ( ; num_tx_slave < slave_slow_nb_pkts[i]; num_tx_slave++)
                        rte_pktmbuf_free(slave_bufs[i][num_tx_slave]);

                num_tx_total += num_tx_slave - slave_slow_nb_pkts[i];
                num_tx_fail_total += slave_nb_pkts[i] - num_tx_slave;

                /* If tx burst fails move packets to end of bufs */
                if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
                        uint16_t j = nb_pkts - num_tx_fail_total;
                        for ( ; num_tx_slave < slave_nb_pkts[i]; j++, num_tx_slave++)
                                bufs[j] = slave_bufs[i][num_tx_slave];
                }
        }

        /* Check for LACP control packets and send if available */
        for (i = 0; i < num_of_slaves; i++) {
                struct port *port = &mode_8023ad_ports[slaves[i]];
                struct rte_mbuf *ctrl_pkt = NULL;

                int pkt_avail = rte_ring_dequeue(port->tx_ring,
                                (void **)&ctrl_pkt);

                if (unlikely(pkt_avail == 0)) {
                        num_tx_slave = rte_eth_tx_burst(slaves[i],
                                        bd_tx_q->queue_id, &ctrl_pkt, 1);

                        /*
                         * re-enqueue LAG control plane packets to buffering
                         * ring if transmission fails so the packet isn't lost.
                         * Exactly one packet is bursted above, so compare
                         * against 1, not nb_pkts.
                         */
                        if (num_tx_slave != 1)
                                rte_ring_enqueue(port->tx_ring, ctrl_pkt);
                }
        }

        return num_tx_total;
}
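
/*
 * Editor's note: an isolated sketch (hypothetical helper, not upstream code)
 * of the LACPDU Tx pattern above: dequeue a control packet from the slave's
 * buffering ring, try to send it, and put it back on failure so the LACPDU
 * is retried on the next burst instead of being dropped.
 */
static inline void
lacpdu_tx_sketch(uint16_t slave_port, uint16_t queue_id,
                struct rte_ring *tx_ring)
{
        struct rte_mbuf *ctrl_pkt = NULL;

        /* rte_ring_dequeue() returns 0 when a packet was dequeued */
        if (rte_ring_dequeue(tx_ring, (void **)&ctrl_pkt) != 0)
                return;

        if (rte_eth_tx_burst(slave_port, queue_id, &ctrl_pkt, 1) != 1)
                /* Tx failed: re-enqueue so the LACPDU is not lost */
                rte_ring_enqueue(tx_ring, ctrl_pkt);
}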

static uint16_t
bond_ethdev_tx_burst_broadcast(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        uint8_t tx_failed_flag = 0, num_of_slaves;
        uint16_t slaves[RTE_MAX_ETHPORTS];

        uint16_t max_nb_of_tx_pkts = 0;

        int slave_tx_total[RTE_MAX_ETHPORTS];
        int i, most_successful_tx_slave = -1;

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * num_of_slaves);

        if (num_of_slaves < 1)
                return 0;

        /* Increment reference count on mbufs */
        for (i = 0; i < nb_pkts; i++)
                rte_mbuf_refcnt_update(bufs[i], num_of_slaves - 1);

        /* Transmit burst on each active slave */
        for (i = 0; i < num_of_slaves; i++) {
                slave_tx_total[i] = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                        bufs, nb_pkts);

                if (unlikely(slave_tx_total[i] < nb_pkts))
                        tx_failed_flag = 1;

                /* record the value and slave index for the slave which transmits the
                 * maximum number of packets */
                if (slave_tx_total[i] > max_nb_of_tx_pkts) {
                        max_nb_of_tx_pkts = slave_tx_total[i];
                        most_successful_tx_slave = i;
                }
        }

        /* if slaves fail to transmit packets from burst, the calling application
         * is not expected to know about multiple references to packets so we must
         * handle failures of all packets except those of the most successful slave
         */
        if (unlikely(tx_failed_flag))
                for (i = 0; i < num_of_slaves; i++)
                        if (i != most_successful_tx_slave)
                                while (slave_tx_total[i] < nb_pkts)
                                        rte_pktmbuf_free(bufs[slave_tx_total[i]++]);

        return max_nb_of_tx_pkts;
}
1381
1382 void
1383 link_properties_set(struct rte_eth_dev *ethdev, struct rte_eth_link *slave_link)
1384 {
1385         struct bond_dev_private *bond_ctx = ethdev->data->dev_private;
1386
1387         if (bond_ctx->mode == BONDING_MODE_8023AD) {
1388                 /**
1389                  * If in mode 4 then save the link properties of the first
1390                  * slave, all subsequent slaves must match these properties
1391                  */
1392                 struct rte_eth_link *bond_link = &bond_ctx->mode4.slave_link;
1393
1394                 bond_link->link_autoneg = slave_link->link_autoneg;
1395                 bond_link->link_duplex = slave_link->link_duplex;
1396                 bond_link->link_speed = slave_link->link_speed;
1397         } else {
1398                 /**
1399                  * In any other mode the link properties are set to default
1400                  * values of AUTONEG/DUPLEX
1401                  */
1402                 ethdev->data->dev_link.link_autoneg = ETH_LINK_AUTONEG;
1403                 ethdev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
1404         }
1405 }
1406
1407 int
1408 link_properties_valid(struct rte_eth_dev *ethdev,
1409                 struct rte_eth_link *slave_link)
1410 {
1411         struct bond_dev_private *bond_ctx = ethdev->data->dev_private;
1412
1413         if (bond_ctx->mode == BONDING_MODE_8023AD) {
1414                 struct rte_eth_link *bond_link = &bond_ctx->mode4.slave_link;
1415
1416                 if (bond_link->link_duplex != slave_link->link_duplex ||
1417                         bond_link->link_autoneg != slave_link->link_autoneg ||
1418                         bond_link->link_speed != slave_link->link_speed)
1419                         return -1;
1420         }
1421
1422         return 0;
1423 }
1424
1425 int
1426 mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr)
1427 {
1428         struct ether_addr *mac_addr;
1429
1430         if (eth_dev == NULL) {
1431                 RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified");
1432                 return -1;
1433         }
1434
1435         if (dst_mac_addr == NULL) {
1436                 RTE_BOND_LOG(ERR, "NULL pointer MAC specified");
1437                 return -1;
1438         }
1439
1440         mac_addr = eth_dev->data->mac_addrs;
1441
1442         ether_addr_copy(mac_addr, dst_mac_addr);
1443         return 0;
1444 }
1445
1446 int
1447 mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr)
1448 {
1449         struct ether_addr *mac_addr;
1450
1451         if (eth_dev == NULL) {
1452                 RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified");
1453                 return -1;
1454         }
1455
1456         if (new_mac_addr == NULL) {
1457                 RTE_BOND_LOG(ERR, "NULL pointer MAC specified");
1458                 return -1;
1459         }
1460
1461         mac_addr = eth_dev->data->mac_addrs;
1462
1463         /* If new MAC is different to current MAC then update */
1464         if (memcmp(mac_addr, new_mac_addr, sizeof(*mac_addr)) != 0)
1465                 memcpy(mac_addr, new_mac_addr, sizeof(*mac_addr));
1466
1467         return 0;
1468 }
1469
1470 int
1471 mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev)
1472 {
1473         struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1474         int i;
1475
1476         /* Update slave devices MAC addresses */
1477         if (internals->slave_count < 1)
1478                 return -1;
1479
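             /*
              * Round robin, balance and broadcast require every slave to carry
              * the bonded MAC. Active backup, TLB and ALB keep the bonded MAC
              * on the primary slave only and restore the persisted MAC on the
              * others, while mode 4 delegates the update to the 802.3ad state
              * machine.
              */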
1480         switch (internals->mode) {
1481         case BONDING_MODE_ROUND_ROBIN:
1482         case BONDING_MODE_BALANCE:
1483         case BONDING_MODE_BROADCAST:
1484                 for (i = 0; i < internals->slave_count; i++) {
1485                         if (mac_address_set(&rte_eth_devices[internals->slaves[i].port_id],
1486                                         bonded_eth_dev->data->mac_addrs)) {
1487                                 RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1488                                                 internals->slaves[i].port_id);
1489                                 return -1;
1490                         }
1491                 }
1492                 break;
1493         case BONDING_MODE_8023AD:
1494                 bond_mode_8023ad_mac_address_update(bonded_eth_dev);
1495                 break;
1496         case BONDING_MODE_ACTIVE_BACKUP:
1497         case BONDING_MODE_TLB:
1498         case BONDING_MODE_ALB:
1499         default:
1500                 for (i = 0; i < internals->slave_count; i++) {
1501                         if (internals->slaves[i].port_id ==
1502                                         internals->current_primary_port) {
1503                                 if (mac_address_set(&rte_eth_devices[internals->current_primary_port],
1504                                                 bonded_eth_dev->data->mac_addrs)) {
1505                                         RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1506                                                         internals->current_primary_port);
1507                                         return -1;
1508                                 }
1509                         } else {
1510                                 if (mac_address_set(
1511                                                 &rte_eth_devices[internals->slaves[i].port_id],
1512                                                 &internals->slaves[i].persisted_mac_addr)) {
1513                                         RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1514                                                         internals->slaves[i].port_id);
1515                                         return -1;
1516                                 }
1517                         }
1518                 }
1519         }
1520
1521         return 0;
1522 }
1523
1524 int
1525 bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode)
1526 {
1527         struct bond_dev_private *internals;
1528
1529         internals = eth_dev->data->dev_private;
1530
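             /* Each bonding mode installs its own Rx/Tx burst handler pair */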
1531         switch (mode) {
1532         case BONDING_MODE_ROUND_ROBIN:
1533                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_round_robin;
1534                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1535                 break;
1536         case BONDING_MODE_ACTIVE_BACKUP:
1537                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_active_backup;
1538                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
1539                 break;
1540         case BONDING_MODE_BALANCE:
1541                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_balance;
1542                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1543                 break;
1544         case BONDING_MODE_BROADCAST:
1545                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_broadcast;
1546                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1547                 break;
1548         case BONDING_MODE_8023AD:
1549                 if (bond_mode_8023ad_enable(eth_dev) != 0)
1550                         return -1;
1551
1552                 if (internals->mode4.dedicated_queues.enabled == 0) {
1553                         eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_8023ad;
1554                         eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_8023ad;
1555                         RTE_LOG(WARNING, PMD,
1556                                 "Using mode 4, it is necessary to do TX burst "
1557                                 "and RX burst at least every 100ms.\n");
1558                 } else {
1559                         /* Use flow director's optimization */
1560                         eth_dev->rx_pkt_burst =
1561                                         bond_ethdev_rx_burst_8023ad_fast_queue;
1562                         eth_dev->tx_pkt_burst =
1563                                         bond_ethdev_tx_burst_8023ad_fast_queue;
1564                 }
1565                 break;
1566         case BONDING_MODE_TLB:
1567                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_tlb;
1568                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
1569                 break;
1570         case BONDING_MODE_ALB:
1571                 if (bond_mode_alb_enable(eth_dev) != 0)
1572                         return -1;
1573
1574                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_alb;
1575                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_alb;
1576                 break;
1577         default:
1578                 return -1;
1579         }
1580
1581         internals->mode = mode;
1582
1583         return 0;
1584 }
1585
1586
1587 static int
1588 slave_configure_slow_queue(struct rte_eth_dev *bonded_eth_dev,
1589                 struct rte_eth_dev *slave_eth_dev)
1590 {
1591         int errval = 0;
1592         struct bond_dev_private *internals = (struct bond_dev_private *)
1593                 bonded_eth_dev->data->dev_private;
1594         struct port *port = &mode_8023ad_ports[slave_eth_dev->data->port_id];
1595
1596         if (port->slow_pool == NULL) {
1597                 char mem_name[256];
1598                 int slave_id = slave_eth_dev->data->port_id;
1599
1600                 snprintf(mem_name, RTE_DIM(mem_name), "slave_port%u_slow_pool",
1601                                 slave_id);
1602                 port->slow_pool = rte_pktmbuf_pool_create(mem_name, 8191,
1603                         250, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
1604                         slave_eth_dev->data->numa_node);
1605
1606                 /* Any memory allocation failure in initialization is critical because
1607                  * resources can't be freed, so reinitialization is impossible. */
1608                 if (port->slow_pool == NULL) {
1609                         rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
1610                                 slave_id, mem_name, rte_strerror(rte_errno));
1611                 }
1612         }
1613
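             /*
              * Dedicated queues reserve an extra Rx/Tx queue pair on the slave
              * so LACPDUs can bypass the data path; a flow rule installed in
              * slave_configure() steers slow traffic into that Rx queue.
              */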
1614         if (internals->mode4.dedicated_queues.enabled == 1) {
1615                 /* Configure slow Rx queue */
1616
1617                 errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id,
1618                                 internals->mode4.dedicated_queues.rx_qid, 128,
1619                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1620                                 NULL, port->slow_pool);
1621                 if (errval != 0) {
1622                         RTE_BOND_LOG(ERR,
1623                                         "rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)",
1624                                         slave_eth_dev->data->port_id,
1625                                         internals->mode4.dedicated_queues.rx_qid,
1626                                         errval);
1627                         return errval;
1628                 }
1629
1630                 errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id,
1631                                 internals->mode4.dedicated_queues.tx_qid, 512,
1632                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1633                                 NULL);
1634                 if (errval != 0) {
1635                         RTE_BOND_LOG(ERR,
1636                                 "rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)",
1637                                 slave_eth_dev->data->port_id,
1638                                 internals->mode4.dedicated_queues.tx_qid,
1639                                 errval);
1640                         return errval;
1641                 }
1642         }
1643         return 0;
1644 }
1645
1646 int
1647 slave_configure(struct rte_eth_dev *bonded_eth_dev,
1648                 struct rte_eth_dev *slave_eth_dev)
1649 {
1650         struct bond_rx_queue *bd_rx_q;
1651         struct bond_tx_queue *bd_tx_q;
1652         uint16_t nb_rx_queues;
1653         uint16_t nb_tx_queues;
1654
1655         int errval;
1656         uint16_t q_id;
1657         struct rte_flow_error flow_error;
1658
1659         struct bond_dev_private *internals = (struct bond_dev_private *)
1660                 bonded_eth_dev->data->dev_private;
1661
1662         /* Stop slave */
1663         rte_eth_dev_stop(slave_eth_dev->data->port_id);
1664
1665         /* Enable interrupts on slave device if supported */
1666         if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1667                 slave_eth_dev->data->dev_conf.intr_conf.lsc = 1;
1668
1669         /* If RSS is enabled for bonding, try to enable it for slaves  */
1670         if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) {
1671                 if (bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len
1672                                 != 0) {
1673                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len =
1674                                         bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len;
1675                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key =
1676                                         bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1677                 } else {
1678                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = NULL;
1679                 }
1680
1681                 slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf =
1682                                 bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1683                 slave_eth_dev->data->dev_conf.rxmode.mq_mode =
1684                                 bonded_eth_dev->data->dev_conf.rxmode.mq_mode;
1685         }
1686
1687         slave_eth_dev->data->dev_conf.rxmode.hw_vlan_filter =
1688                         bonded_eth_dev->data->dev_conf.rxmode.hw_vlan_filter;
1689
1690         nb_rx_queues = bonded_eth_dev->data->nb_rx_queues;
1691         nb_tx_queues = bonded_eth_dev->data->nb_tx_queues;
1692
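             /* Mode 4 with dedicated queues needs one extra Rx/Tx queue pair
              * beyond what the application configured, reserved for LACPDUs */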
1693         if (internals->mode == BONDING_MODE_8023AD) {
1694                 if (internals->mode4.dedicated_queues.enabled == 1) {
1695                         nb_rx_queues++;
1696                         nb_tx_queues++;
1697                 }
1698         }
1699
1700         /* Configure device */
1701         errval = rte_eth_dev_configure(slave_eth_dev->data->port_id,
1702                         nb_rx_queues, nb_tx_queues,
1703                         &(slave_eth_dev->data->dev_conf));
1704         if (errval != 0) {
1705                 RTE_BOND_LOG(ERR, "Cannot configure slave device: port %u , err (%d)",
1706                                 slave_eth_dev->data->port_id, errval);
1707                 return errval;
1708         }
1709
1710         /* Setup Rx Queues */
1711         for (q_id = 0; q_id < bonded_eth_dev->data->nb_rx_queues; q_id++) {
1712                 bd_rx_q = (struct bond_rx_queue *)bonded_eth_dev->data->rx_queues[q_id];
1713
1714                 errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id, q_id,
1715                                 bd_rx_q->nb_rx_desc,
1716                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1717                                 &(bd_rx_q->rx_conf), bd_rx_q->mb_pool);
1718                 if (errval != 0) {
1719                         RTE_BOND_LOG(ERR,
1720                                         "rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)",
1721                                         slave_eth_dev->data->port_id, q_id, errval);
1722                         return errval;
1723                 }
1724         }
1725
1726         /* Setup Tx Queues */
1727         for (q_id = 0; q_id < bonded_eth_dev->data->nb_tx_queues; q_id++) {
1728                 bd_tx_q = (struct bond_tx_queue *)bonded_eth_dev->data->tx_queues[q_id];
1729
1730                 errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id, q_id,
1731                                 bd_tx_q->nb_tx_desc,
1732                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1733                                 &bd_tx_q->tx_conf);
1734                 if (errval != 0) {
1735                         RTE_BOND_LOG(ERR,
1736                                 "rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)",
1737                                 slave_eth_dev->data->port_id, q_id, errval);
1738                         return errval;
1739                 }
1740         }
1741
1742         if (internals->mode == BONDING_MODE_8023AD &&
1743                         internals->mode4.dedicated_queues.enabled == 1) {
1744                 errval = slave_configure_slow_queue(bonded_eth_dev, slave_eth_dev);
1745                 if (errval != 0)
1746                         return errval;
1747
1748                 if (bond_ethdev_8023ad_flow_verify(bonded_eth_dev,
1749                                 slave_eth_dev->data->port_id) != 0) {
1750                         RTE_BOND_LOG(ERR,
1751                                 "Cannot verify 802.3AD flow for slave port %d",
1752                                 slave_eth_dev->data->port_id);
1753                         return -1;
1754                 }
1755
1756                 if (internals->mode4.dedicated_queues.flow[slave_eth_dev->data->port_id] != NULL)
1757                         rte_flow_destroy(slave_eth_dev->data->port_id,
1758                                         internals->mode4.dedicated_queues.flow[slave_eth_dev->data->port_id],
1759                                         &flow_error);
1760
1761                 bond_ethdev_8023ad_flow_set(bonded_eth_dev,
1762                                 slave_eth_dev->data->port_id);
1763         }
1764
1765         /* Start device */
1766         errval = rte_eth_dev_start(slave_eth_dev->data->port_id);
1767         if (errval != 0) {
1768                 RTE_BOND_LOG(ERR, "rte_eth_dev_start: port=%u, err (%d)",
1769                                 slave_eth_dev->data->port_id, errval);
1770                 return -1;
1771         }
1772
1773         /* If RSS is enabled for bonding, synchronize RETA */
1774         if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
1775                 int i;
1776                 struct bond_dev_private *internals;
1777
1778                 internals = bonded_eth_dev->data->dev_private;
1779
1780                 for (i = 0; i < internals->slave_count; i++) {
1781                         if (internals->slaves[i].port_id == slave_eth_dev->data->port_id) {
1782                                 errval = rte_eth_dev_rss_reta_update(
1783                                                 slave_eth_dev->data->port_id,
1784                                                 &internals->reta_conf[0],
1785                                                 internals->slaves[i].reta_size);
1786                                 if (errval != 0) {
1787                                         RTE_LOG(WARNING, PMD,
1788                                                         "rte_eth_dev_rss_reta_update on slave port %d fails (err %d)."
1789                                                         " RSS Configuration for bonding may be inconsistent.\n",
1790                                                         slave_eth_dev->data->port_id, errval);
1791                                 }
1792                                 break;
1793                         }
1794                 }
1795         }
1796
1797         /* If lsc interrupt is set, check initial slave's link status */
1798         if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) {
1799                 slave_eth_dev->dev_ops->link_update(slave_eth_dev, 0);
1800                 bond_ethdev_lsc_event_callback(slave_eth_dev->data->port_id,
1801                         RTE_ETH_EVENT_INTR_LSC, &bonded_eth_dev->data->port_id,
1802                         NULL);
1803         }
1804
1805         return 0;
1806 }
1807
1808 void
1809 slave_remove(struct bond_dev_private *internals,
1810                 struct rte_eth_dev *slave_eth_dev)
1811 {
1812         uint8_t i;
1813
1814         for (i = 0; i < internals->slave_count; i++)
1815                 if (internals->slaves[i].port_id ==
1816                                 slave_eth_dev->data->port_id)
1817                         break;
1818
1819         if (i < (internals->slave_count - 1))
1820                 memmove(&internals->slaves[i], &internals->slaves[i + 1],
1821                                 sizeof(internals->slaves[0]) *
1822                                 (internals->slave_count - i - 1));
1823
1824         internals->slave_count--;
1825
1826         /* force reconfiguration of slave interfaces */
1827         _rte_eth_dev_reset(slave_eth_dev);
1828 }
1829
1830 static void
1831 bond_ethdev_slave_link_status_change_monitor(void *cb_arg);
1832
1833 void
1834 slave_add(struct bond_dev_private *internals,
1835                 struct rte_eth_dev *slave_eth_dev)
1836 {
1837         struct bond_slave_details *slave_details =
1838                         &internals->slaves[internals->slave_count];
1839
1840         slave_details->port_id = slave_eth_dev->data->port_id;
1841         slave_details->last_link_status = 0;
1842
1843         /* Mark slave devices that don't support interrupts so we can
1844          * compensate when we start the bond
1845          */
1846         if (!(slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) {
1847                 slave_details->link_status_poll_enabled = 1;
1848         }
1849
1850         slave_details->link_status_wait_to_complete = 0;
1851         /* save the slave's original MAC so it can be restored when the
              * slave is removed from the bonded device */
1852         memcpy(&(slave_details->persisted_mac_addr), slave_eth_dev->data->mac_addrs,
1853                         sizeof(struct ether_addr));
1854 }
1855
1856 void
1857 bond_ethdev_primary_set(struct bond_dev_private *internals,
1858                 uint16_t slave_port_id)
1859 {
1860         int i;
1861
1862         if (internals->active_slave_count < 1)
1863                 internals->current_primary_port = slave_port_id;
1864         else
1865                 /* Search bonded device slave ports for new proposed primary port */
1866                 for (i = 0; i < internals->active_slave_count; i++) {
1867                         if (internals->active_slaves[i] == slave_port_id)
1868                                 internals->current_primary_port = slave_port_id;
1869                 }
1870 }
1871
1872 static void
1873 bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev);
1874
1875 static int
1876 bond_ethdev_start(struct rte_eth_dev *eth_dev)
1877 {
1878         struct bond_dev_private *internals;
1879         int i;
1880
1881         /* slave eth dev will be started by bonded device */
1882         if (check_for_bonded_ethdev(eth_dev)) {
1883                 RTE_BOND_LOG(ERR, "User tried to explicitly start a slave eth_dev (%d)",
1884                                 eth_dev->data->port_id);
1885                 return -1;
1886         }
1887
1888         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1889         eth_dev->data->dev_started = 1;
1890
1891         internals = eth_dev->data->dev_private;
1892
1893         if (internals->slave_count == 0) {
1894                 RTE_BOND_LOG(ERR, "Cannot start port since there are no slave devices");
1895                 return -1;
1896         }
1897
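             /* If no MAC was supplied by the user, inherit the primary slave's
              * original (persisted) MAC address */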
1898         if (internals->user_defined_mac == 0) {
1899                 struct ether_addr *new_mac_addr = NULL;
1900
1901                 for (i = 0; i < internals->slave_count; i++)
1902                         if (internals->slaves[i].port_id == internals->primary_port)
1903                                 new_mac_addr = &internals->slaves[i].persisted_mac_addr;
1904
1905                 if (new_mac_addr == NULL)
1906                         return -1;
1907
1908                 if (mac_address_set(eth_dev, new_mac_addr) != 0) {
1909                         RTE_BOND_LOG(ERR, "bonded port (%d) failed to update MAC address",
1910                                         eth_dev->data->port_id);
1911                         return -1;
1912                 }
1913         }
1914
1915         /* Update all slave devices MACs*/
1916         if (mac_address_slaves_update(eth_dev) != 0)
1917                 return -1;
1918
1919         /* If bonded device is configured in promiscuous mode then re-apply it */
1920         if (internals->promiscuous_en)
1921                 bond_ethdev_promiscuous_enable(eth_dev);
1922
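             /* The dedicated slow queues take the queue ids immediately after
              * the application's data queues */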
1923         if (internals->mode == BONDING_MODE_8023AD) {
1924                 if (internals->mode4.dedicated_queues.enabled == 1) {
1925                         internals->mode4.dedicated_queues.rx_qid =
1926                                         eth_dev->data->nb_rx_queues;
1927                         internals->mode4.dedicated_queues.tx_qid =
1928                                         eth_dev->data->nb_tx_queues;
1929                 }
1930         }
1931
1932
1933         /* Reconfigure each slave device if starting bonded device */
1934         for (i = 0; i < internals->slave_count; i++) {
1935                 struct rte_eth_dev *slave_ethdev =
1936                                 &(rte_eth_devices[internals->slaves[i].port_id]);
1937                 if (slave_configure(eth_dev, slave_ethdev) != 0) {
1938                         RTE_BOND_LOG(ERR,
1939                                 "bonded port (%d) failed to reconfigure slave device (%d)",
1940                                 eth_dev->data->port_id,
1941                                 internals->slaves[i].port_id);
1942                         return -1;
1943                 }
1944                 /* We will need to poll for link status if any slave doesn't
1945                  * support interrupts
1946                  */
1947                 if (internals->slaves[i].link_status_poll_enabled)
1948                         internals->link_status_polling_enabled = 1;
1949         }
1950         /* start polling if needed */
1951         if (internals->link_status_polling_enabled) {
1952                 rte_eal_alarm_set(
1953                         internals->link_status_polling_interval_ms * 1000,
1954                         bond_ethdev_slave_link_status_change_monitor,
1955                         (void *)&rte_eth_devices[internals->port_id]);
1956         }
1957
1958         if (internals->user_defined_primary_port)
1959                 bond_ethdev_primary_set(internals, internals->primary_port);
1960
1961         if (internals->mode == BONDING_MODE_8023AD)
1962                 bond_mode_8023ad_start(eth_dev);
1963
1964         if (internals->mode == BONDING_MODE_TLB ||
1965                         internals->mode == BONDING_MODE_ALB)
1966                 bond_tlb_enable(internals);
1967
1968         return 0;
1969 }
1970
1971 static void
1972 bond_ethdev_free_queues(struct rte_eth_dev *dev)
1973 {
1974         uint8_t i;
1975
1976         if (dev->data->rx_queues != NULL) {
1977                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1978                         rte_free(dev->data->rx_queues[i]);
1979                         dev->data->rx_queues[i] = NULL;
1980                 }
1981                 dev->data->nb_rx_queues = 0;
1982         }
1983
1984         if (dev->data->tx_queues != NULL) {
1985                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
1986                         rte_free(dev->data->tx_queues[i]);
1987                         dev->data->tx_queues[i] = NULL;
1988                 }
1989                 dev->data->nb_tx_queues = 0;
1990         }
1991 }
1992
1993 void
1994 bond_ethdev_stop(struct rte_eth_dev *eth_dev)
1995 {
1996         struct bond_dev_private *internals = eth_dev->data->dev_private;
1997         uint8_t i;
1998
1999         if (internals->mode == BONDING_MODE_8023AD) {
2000                 struct port *port;
2001                 void *pkt = NULL;
2002
2003                 bond_mode_8023ad_stop(eth_dev);
2004
2005                 /* Discard all messages to/from mode 4 state machines */
2006                 for (i = 0; i < internals->active_slave_count; i++) {
2007                         port = &mode_8023ad_ports[internals->active_slaves[i]];
2008
2009                         RTE_ASSERT(port->rx_ring != NULL);
2010                         while (rte_ring_dequeue(port->rx_ring, &pkt) != -ENOENT)
2011                                 rte_pktmbuf_free(pkt);
2012
2013                         RTE_ASSERT(port->tx_ring != NULL);
2014                         while (rte_ring_dequeue(port->tx_ring, &pkt) != -ENOENT)
2015                                 rte_pktmbuf_free(pkt);
2016                 }
2017         }
2018
2019         if (internals->mode == BONDING_MODE_TLB ||
2020                         internals->mode == BONDING_MODE_ALB) {
2021                 bond_tlb_disable(internals);
2022                 for (i = 0; i < internals->active_slave_count; i++)
2023                         tlb_last_obytets[internals->active_slaves[i]] = 0;
2024         }
2025
2026         internals->active_slave_count = 0;
2027         internals->link_status_polling_enabled = 0;
2028         for (i = 0; i < internals->slave_count; i++)
2029                 internals->slaves[i].last_link_status = 0;
2030
2031         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
2032         eth_dev->data->dev_started = 0;
2033 }
2034
2035 void
2036 bond_ethdev_close(struct rte_eth_dev *dev)
2037 {
2038         struct bond_dev_private *internals = dev->data->dev_private;
2039         uint8_t bond_port_id = internals->port_id;
2040         int skipped = 0;
2041
2042         RTE_LOG(INFO, PMD, "Closing bonded device %s\n", dev->device->name);
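             /* Remove slaves one at a time; a slave that cannot be removed is
              * counted as skipped so that the loop always terminates */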
2043         while (internals->slave_count != skipped) {
2044                 uint16_t port_id = internals->slaves[skipped].port_id;
2045
2046                 rte_eth_dev_stop(port_id);
2047
2048                 if (rte_eth_bond_slave_remove(bond_port_id, port_id) != 0) {
2049                         RTE_LOG(ERR, PMD,
2050                                 "Failed to remove port %d from bonded device "
2051                                 "%s\n", port_id, dev->device->name);
2052                         skipped++;
2053                 }
2054         }
2055         bond_ethdev_free_queues(dev);
2056         rte_bitmap_reset(internals->vlan_filter_bmp);
2057 }
2058
2059 /* forward declaration */
2060 static int bond_ethdev_configure(struct rte_eth_dev *dev);
2061
2062 static void
2063 bond_ethdev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
2064 {
2065         struct bond_dev_private *internals = dev->data->dev_private;
2066
2067         uint16_t max_nb_rx_queues = UINT16_MAX;
2068         uint16_t max_nb_tx_queues = UINT16_MAX;
2069
2070         dev_info->max_mac_addrs = 1;
2071
2072         dev_info->max_rx_pktlen = internals->candidate_max_rx_pktlen ?
2073                         internals->candidate_max_rx_pktlen :
2074                         ETHER_MAX_JUMBO_FRAME_LEN;
2075
2076         /* The maximum number of tx/rx queues that the bonded device can
2077          * support is the minimum across all bonded slaves, as every slave
2078          * must be capable of supporting the same number of tx/rx queues.
2079          */
2080         if (internals->slave_count > 0) {
2081                 struct rte_eth_dev_info slave_info;
2082                 uint8_t idx;
2083
2084                 for (idx = 0; idx < internals->slave_count; idx++) {
2085                         rte_eth_dev_info_get(internals->slaves[idx].port_id,
2086                                         &slave_info);
2087
2088                         if (slave_info.max_rx_queues < max_nb_rx_queues)
2089                                 max_nb_rx_queues = slave_info.max_rx_queues;
2090
2091                         if (slave_info.max_tx_queues < max_nb_tx_queues)
2092                                 max_nb_tx_queues = slave_info.max_tx_queues;
2093                 }
2094         }
2095
2096         dev_info->max_rx_queues = max_nb_rx_queues;
2097         dev_info->max_tx_queues = max_nb_tx_queues;
2098
2099         /**
2100          * If dedicated hw queues enabled for link bonding device in LACP mode
2101          * then we need to reduce the maximum number of data path queues by 1.
2102          */
2103         if (internals->mode == BONDING_MODE_8023AD &&
2104                 internals->mode4.dedicated_queues.enabled == 1) {
2105                 dev_info->max_rx_queues--;
2106                 dev_info->max_tx_queues--;
2107         }
2108
2109         dev_info->min_rx_bufsize = 0;
2110
2111         dev_info->rx_offload_capa = internals->rx_offload_capa;
2112         dev_info->tx_offload_capa = internals->tx_offload_capa;
2113         dev_info->flow_type_rss_offloads = internals->flow_type_rss_offloads;
2114
2115         dev_info->reta_size = internals->reta_size;
2116 }
2117
2118 static int
2119 bond_ethdev_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
2120 {
2121         int res;
2122         uint16_t i;
2123         struct bond_dev_private *internals = dev->data->dev_private;
2124
2125         /* don't do this while a slave is being added */
2126         rte_spinlock_lock(&internals->lock);
2127
2128         if (on)
2129                 rte_bitmap_set(internals->vlan_filter_bmp, vlan_id);
2130         else
2131                 rte_bitmap_clear(internals->vlan_filter_bmp, vlan_id);
2132
2133         for (i = 0; i < internals->slave_count; i++) {
2134                 uint16_t port_id = internals->slaves[i].port_id;
2135
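                     /* rte_eth_dev_vlan_filter() returns a negative errno;
                      * -ENOTSUP means the slave lacks VLAN filter support */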
2136                 res = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
2137                 if (res == -ENOTSUP)
2138                         RTE_LOG(WARNING, PMD,
2139                                 "Setting VLAN filter on slave port %u not supported.\n",
2140                                 port_id);
2141         }
2142
2143         rte_spinlock_unlock(&internals->lock);
2144         return 0;
2145 }
2146
2147 static int
2148 bond_ethdev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
2149                 uint16_t nb_rx_desc, unsigned int socket_id __rte_unused,
2150                 const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mb_pool)
2151 {
2152         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)
2153                         rte_zmalloc_socket(NULL, sizeof(struct bond_rx_queue),
2154                                         0, dev->data->numa_node);
2155         if (bd_rx_q == NULL)
2156                 return -1;
2157
2158         bd_rx_q->queue_id = rx_queue_id;
2159         bd_rx_q->dev_private = dev->data->dev_private;
2160
2161         bd_rx_q->nb_rx_desc = nb_rx_desc;
2162
2163         memcpy(&(bd_rx_q->rx_conf), rx_conf, sizeof(struct rte_eth_rxconf));
2164         bd_rx_q->mb_pool = mb_pool;
2165
2166         dev->data->rx_queues[rx_queue_id] = bd_rx_q;
2167
2168         return 0;
2169 }
2170
2171 static int
2172 bond_ethdev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
2173                 uint16_t nb_tx_desc, unsigned int socket_id __rte_unused,
2174                 const struct rte_eth_txconf *tx_conf)
2175 {
2176         struct bond_tx_queue *bd_tx_q  = (struct bond_tx_queue *)
2177                         rte_zmalloc_socket(NULL, sizeof(struct bond_tx_queue),
2178                                         0, dev->data->numa_node);
2179
2180         if (bd_tx_q == NULL)
2181                 return -1;
2182
2183         bd_tx_q->queue_id = tx_queue_id;
2184         bd_tx_q->dev_private = dev->data->dev_private;
2185
2186         bd_tx_q->nb_tx_desc = nb_tx_desc;
2187         memcpy(&(bd_tx_q->tx_conf), tx_conf, sizeof(bd_tx_q->tx_conf));
2188
2189         dev->data->tx_queues[tx_queue_id] = bd_tx_q;
2190
2191         return 0;
2192 }
2193
2194 static void
2195 bond_ethdev_rx_queue_release(void *queue)
2196 {
2197         if (queue == NULL)
2198                 return;
2199
2200         rte_free(queue);
2201 }
2202
2203 static void
2204 bond_ethdev_tx_queue_release(void *queue)
2205 {
2206         if (queue == NULL)
2207                 return;
2208
2209         rte_free(queue);
2210 }
2211
2212 static void
2213 bond_ethdev_slave_link_status_change_monitor(void *cb_arg)
2214 {
2215         struct rte_eth_dev *bonded_ethdev, *slave_ethdev;
2216         struct bond_dev_private *internals;
2217
2218         /* Default value for polling slave found is true as we don't want to
2219          * disable the polling thread if we cannot get the lock */
2220         int i, polling_slave_found = 1;
2221
2222         if (cb_arg == NULL)
2223                 return;
2224
2225         bonded_ethdev = (struct rte_eth_dev *)cb_arg;
2226         internals = (struct bond_dev_private *)bonded_ethdev->data->dev_private;
2227
2228         if (!bonded_ethdev->data->dev_started ||
2229                 !internals->link_status_polling_enabled)
2230                 return;
2231
2232         /* If device is currently being configured then don't check slaves link
2233          * status, wait until next period */
2234         if (rte_spinlock_trylock(&internals->lock)) {
2235                 if (internals->slave_count > 0)
2236                         polling_slave_found = 0;
2237
2238                 for (i = 0; i < internals->slave_count; i++) {
2239                         if (!internals->slaves[i].link_status_poll_enabled)
2240                                 continue;
2241
2242                         slave_ethdev = &rte_eth_devices[internals->slaves[i].port_id];
2243                         polling_slave_found = 1;
2244
2245                         /* Update slave link status */
2246                         (*slave_ethdev->dev_ops->link_update)(slave_ethdev,
2247                                         internals->slaves[i].link_status_wait_to_complete);
2248
2249                         /* if link status has changed since last checked then call lsc
2250                          * event callback */
2251                         if (slave_ethdev->data->dev_link.link_status !=
2252                                         internals->slaves[i].last_link_status) {
2253                                 internals->slaves[i].last_link_status =
2254                                                 slave_ethdev->data->dev_link.link_status;
2255
2256                                 bond_ethdev_lsc_event_callback(internals->slaves[i].port_id,
2257                                                 RTE_ETH_EVENT_INTR_LSC,
2258                                                 &bonded_ethdev->data->port_id,
2259                                                 NULL);
2260                         }
2261                 }
2262                 rte_spinlock_unlock(&internals->lock);
2263         }
2264
2265         if (polling_slave_found)
2266                 /* Set alarm to continue monitoring link status of slave ethdevs */
2267                 rte_eal_alarm_set(internals->link_status_polling_interval_ms * 1000,
2268                                 bond_ethdev_slave_link_status_change_monitor, cb_arg);
2269 }
2270
2271 static int
2272 bond_ethdev_link_update(struct rte_eth_dev *ethdev, int wait_to_complete)
2273 {
2274         void (*link_update)(uint16_t port_id, struct rte_eth_link *eth_link);
2275
2276         struct bond_dev_private *bond_ctx;
2277         struct rte_eth_link slave_link;
2278
2279         uint32_t idx;
2280
2281         bond_ctx = ethdev->data->dev_private;
2282
2283         ethdev->data->dev_link.link_speed = ETH_SPEED_NUM_NONE;
2284
2285         if (ethdev->data->dev_started == 0 ||
2286                         bond_ctx->active_slave_count == 0) {
2287                 ethdev->data->dev_link.link_status = ETH_LINK_DOWN;
2288                 return 0;
2289         }
2290
2291         ethdev->data->dev_link.link_status = ETH_LINK_UP;
2292
2293         if (wait_to_complete)
2294                 link_update = rte_eth_link_get;
2295         else
2296                 link_update = rte_eth_link_get_nowait;
2297
2298         switch (bond_ctx->mode) {
2299         case BONDING_MODE_BROADCAST:
2300                 /**
2301                  * Setting link speed to UINT32_MAX to ensure we pick up the
2302                  * value of the first active slave
2303                  */
2304                 ethdev->data->dev_link.link_speed = UINT32_MAX;
2305
2306                 /**
2307                  * link speed is minimum value of all the slaves link speed as
2308                  * packet loss will occur on this slave if transmission at rates
2309                  * greater than this are attempted
2310                  */
2311                 for (idx = 0; idx < bond_ctx->active_slave_count; idx++) {
2312                         link_update(bond_ctx->active_slaves[idx], &slave_link);
2313
2314                         if (slave_link.link_speed <
2315                                         ethdev->data->dev_link.link_speed)
2316                                 ethdev->data->dev_link.link_speed =
2317                                                 slave_link.link_speed;
2318                 }
2319                 break;
2320         case BONDING_MODE_ACTIVE_BACKUP:
2321                 /* Current primary slave */
2322                 link_update(bond_ctx->current_primary_port, &slave_link);
2323
2324                 ethdev->data->dev_link.link_speed = slave_link.link_speed;
2325                 break;
2326         case BONDING_MODE_8023AD:
2327                 ethdev->data->dev_link.link_autoneg =
2328                                 bond_ctx->mode4.slave_link.link_autoneg;
2329                 ethdev->data->dev_link.link_duplex =
2330                                 bond_ctx->mode4.slave_link.link_duplex;
2331                 /* fall through to update link speed */
2332         case BONDING_MODE_ROUND_ROBIN:
2333         case BONDING_MODE_BALANCE:
2334         case BONDING_MODE_TLB:
2335         case BONDING_MODE_ALB:
2336         default:
2337                 /**
2338                  * In these modes the maximum theoretical link speed is the
2339                  * sum of all the slaves' link speeds
2340                  */
2341                 ethdev->data->dev_link.link_speed = ETH_SPEED_NUM_NONE;
2342
2343                 for (idx = 0; idx < bond_ctx->active_slave_count; idx++) {
2344                         link_update(bond_ctx->active_slaves[idx], &slave_link);
2345
2346                         ethdev->data->dev_link.link_speed +=
2347                                         slave_link.link_speed;
2348                 }
2349         }
2350
2351
2352         return 0;
2353 }
2354
2355
2356 static int
2357 bond_ethdev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
2358 {
2359         struct bond_dev_private *internals = dev->data->dev_private;
2360         struct rte_eth_stats slave_stats;
2361         int i, j;
2362
2363         for (i = 0; i < internals->slave_count; i++) {
2364                 rte_eth_stats_get(internals->slaves[i].port_id, &slave_stats);
2365
2366                 stats->ipackets += slave_stats.ipackets;
2367                 stats->opackets += slave_stats.opackets;
2368                 stats->ibytes += slave_stats.ibytes;
2369                 stats->obytes += slave_stats.obytes;
2370                 stats->imissed += slave_stats.imissed;
2371                 stats->ierrors += slave_stats.ierrors;
2372                 stats->oerrors += slave_stats.oerrors;
2373                 stats->rx_nombuf += slave_stats.rx_nombuf;
2374
2375                 for (j = 0; j < RTE_ETHDEV_QUEUE_STAT_CNTRS; j++) {
2376                         stats->q_ipackets[j] += slave_stats.q_ipackets[j];
2377                         stats->q_opackets[j] += slave_stats.q_opackets[j];
2378                         stats->q_ibytes[j] += slave_stats.q_ibytes[j];
2379                         stats->q_obytes[j] += slave_stats.q_obytes[j];
2380                         stats->q_errors[j] += slave_stats.q_errors[j];
2381                 }
2382
2383         }
2384
2385         return 0;
2386 }
2387
2388 static void
2389 bond_ethdev_stats_reset(struct rte_eth_dev *dev)
2390 {
2391         struct bond_dev_private *internals = dev->data->dev_private;
2392         int i;
2393
2394         for (i = 0; i < internals->slave_count; i++)
2395                 rte_eth_stats_reset(internals->slaves[i].port_id);
2396 }
2397
2398 static void
2399 bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev)
2400 {
2401         struct bond_dev_private *internals = eth_dev->data->dev_private;
2402         int i;
2403
2404         internals->promiscuous_en = 1;
2405
2406         switch (internals->mode) {
2407         /* Promiscuous mode is propagated to all slaves */
2408         case BONDING_MODE_ROUND_ROBIN:
2409         case BONDING_MODE_BALANCE:
2410         case BONDING_MODE_BROADCAST:
2411                 for (i = 0; i < internals->slave_count; i++)
2412                         rte_eth_promiscuous_enable(internals->slaves[i].port_id);
2413                 break;
2414         /* In mode 4, promiscuous mode is managed when a slave is added/removed */
2415         case BONDING_MODE_8023AD:
2416                 break;
2417         /* Promiscuous mode is propagated only to primary slave */
2418         case BONDING_MODE_ACTIVE_BACKUP:
2419         case BONDING_MODE_TLB:
2420         case BONDING_MODE_ALB:
2421         default:
2422                 rte_eth_promiscuous_enable(internals->current_primary_port);
2423         }
2424 }
2425
2426 static void
2427 bond_ethdev_promiscuous_disable(struct rte_eth_dev *dev)
2428 {
2429         struct bond_dev_private *internals = dev->data->dev_private;
2430         int i;
2431
2432         internals->promiscuous_en = 0;
2433
2434         switch (internals->mode) {
2435         /* Promiscuous mode is propagated to all slaves */
2436         case BONDING_MODE_ROUND_ROBIN:
2437         case BONDING_MODE_BALANCE:
2438         case BONDING_MODE_BROADCAST:
2439                 for (i = 0; i < internals->slave_count; i++)
2440                         rte_eth_promiscuous_disable(internals->slaves[i].port_id);
2441                 break;
2442         /* In mode 4, promiscuous mode is managed when a slave is added/removed */
2443         case BONDING_MODE_8023AD:
2444                 break;
2445         /* Promiscuous mode is propagated only to primary slave */
2446         case BONDING_MODE_ACTIVE_BACKUP:
2447         case BONDING_MODE_TLB:
2448         case BONDING_MODE_ALB:
2449         default:
2450                 rte_eth_promiscuous_disable(internals->current_primary_port);
2451         }
2452 }
2453
2454 static void
2455 bond_ethdev_delayed_lsc_propagation(void *arg)
2456 {
2457         if (arg == NULL)
2458                 return;
2459
2460         _rte_eth_dev_callback_process((struct rte_eth_dev *)arg,
2461                         RTE_ETH_EVENT_INTR_LSC, NULL, NULL);
2462 }
2463
2464 int
2465 bond_ethdev_lsc_event_callback(uint16_t port_id, enum rte_eth_event_type type,
2466                 void *param, void *ret_param __rte_unused)
2467 {
2468         struct rte_eth_dev *bonded_eth_dev;
2469         struct bond_dev_private *internals;
2470         struct rte_eth_link link;
2471         int rc = -1;
2472
2473         int i, valid_slave = 0;
2474         uint8_t active_pos;
2475         uint8_t lsc_flag = 0;
2476
2477         if (type != RTE_ETH_EVENT_INTR_LSC || param == NULL)
2478                 return rc;
2479
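             /* param carries the bonded device's 16-bit port id, passed as the
              * callback argument from slave_configure() */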
2480         bonded_eth_dev = &rte_eth_devices[*(uint16_t *)param];
2481
2482         if (check_for_bonded_ethdev(bonded_eth_dev))
2483                 return rc;
2484
2485         internals = bonded_eth_dev->data->dev_private;
2486
2487         /* If the device isn't started don't handle interrupts */
2488         if (!bonded_eth_dev->data->dev_started)
2489                 return rc;
2490
2491         /* verify that port_id is a valid slave of bonded port */
2492         for (i = 0; i < internals->slave_count; i++) {
2493                 if (internals->slaves[i].port_id == port_id) {
2494                         valid_slave = 1;
2495                         break;
2496                 }
2497         }
2498
2499         if (!valid_slave)
2500                 return rc;
2501
2502         /* Search for port in active port list */
2503         active_pos = find_slave_by_id(internals->active_slaves,
2504                         internals->active_slave_count, port_id);
2505
2506         rte_eth_link_get_nowait(port_id, &link);
2507         if (link.link_status) {
2508                 if (active_pos < internals->active_slave_count)
2509                         return rc;
2510
2511                 /* if no active slave ports then set this port to be primary port */
2512                 if (internals->active_slave_count < 1) {
2513                         /* If first active slave, then change link status */
2514                         bonded_eth_dev->data->dev_link.link_status = ETH_LINK_UP;
2515                         internals->current_primary_port = port_id;
2516                         lsc_flag = 1;
2517
2518                         mac_address_slaves_update(bonded_eth_dev);
2519                 }
2520
2521                 activate_slave(bonded_eth_dev, port_id);
2522
2523                 /* If user has defined the primary port then default to using it */
2524                 if (internals->user_defined_primary_port &&
2525                                 internals->primary_port == port_id)
2526                         bond_ethdev_primary_set(internals, port_id);
2527         } else {
2528                 if (active_pos == internals->active_slave_count)
2529                         return rc;
2530
2531                 /* Remove from active slave list */
2532                 deactivate_slave(bonded_eth_dev, port_id);
2533
2534                 if (internals->active_slave_count < 1)
2535                         lsc_flag = 1;
2536
2537                 /* Update primary id: take the first active slave from the list,
2538                  * or fall back to the configured primary port if none is active */
2539                 if (port_id == internals->current_primary_port) {
2540                         if (internals->active_slave_count > 0)
2541                                 bond_ethdev_primary_set(internals,
2542                                                 internals->active_slaves[0]);
2543                         else
2544                                 internals->current_primary_port = internals->primary_port;
2545                 }
2546         }
2547
2548         /**
2549          * Update bonded device link properties after any change to active
2550          * slaves
2551          */
2552         bond_ethdev_link_update(bonded_eth_dev, 0);
2553
2554         if (lsc_flag) {
2555                 /* Cancel any possible outstanding interrupts if delays are enabled */
2556                 if (internals->link_up_delay_ms > 0 ||
2557                         internals->link_down_delay_ms > 0)
2558                         rte_eal_alarm_cancel(bond_ethdev_delayed_lsc_propagation,
2559                                         bonded_eth_dev);
2560
2561                 if (bonded_eth_dev->data->dev_link.link_status) {
2562                         if (internals->link_up_delay_ms > 0)
2563                                 rte_eal_alarm_set(internals->link_up_delay_ms * 1000,
2564                                                 bond_ethdev_delayed_lsc_propagation,
2565                                                 (void *)bonded_eth_dev);
2566                         else
2567                                 _rte_eth_dev_callback_process(bonded_eth_dev,
2568                                                 RTE_ETH_EVENT_INTR_LSC,
2569                                                 NULL, NULL);
2570
2571                 } else {
2572                         if (internals->link_down_delay_ms > 0)
2573                                 rte_eal_alarm_set(internals->link_down_delay_ms * 1000,
2574                                                 bond_ethdev_delayed_lsc_propagation,
2575                                                 (void *)bonded_eth_dev);
2576                         else
2577                                 _rte_eth_dev_callback_process(bonded_eth_dev,
2578                                                 RTE_ETH_EVENT_INTR_LSC,
2579                                                 NULL, NULL);
2580                 }
2581         }
2582         return 0;
2583 }
2584
2585 static int
2586 bond_ethdev_rss_reta_update(struct rte_eth_dev *dev,
2587                 struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2588 {
2589         unsigned i, j;
2590         int result = 0;
2591         int slave_reta_size;
2592         unsigned reta_count;
2593         struct bond_dev_private *internals = dev->data->dev_private;
2594
2595         if (reta_size != internals->reta_size)
2596                 return -EINVAL;
2597
2598          /* Copy RETA table */
2599         reta_count = reta_size / RTE_RETA_GROUP_SIZE;
2600
2601         for (i = 0; i < reta_count; i++) {
2602                 internals->reta_conf[i].mask = reta_conf[i].mask;
2603                 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2604                         if ((reta_conf[i].mask >> j) & 0x01)
2605                                 internals->reta_conf[i].reta[j] = reta_conf[i].reta[j];
2606         }
2607
2608         /* Replicate the pattern over the rest of the array so slaves with
              * a larger RETA receive a consistent table */
2609         for (; i < RTE_DIM(internals->reta_conf); i += reta_count)
2610                 memcpy(&internals->reta_conf[i], &internals->reta_conf[0],
2611                                 sizeof(internals->reta_conf[0]) * reta_count);
2612
2613         /* Propagate RETA over slaves */
2614         for (i = 0; i < internals->slave_count; i++) {
2615                 slave_reta_size = internals->slaves[i].reta_size;
2616                 result = rte_eth_dev_rss_reta_update(internals->slaves[i].port_id,
2617                                 &internals->reta_conf[0], slave_reta_size);
2618                 if (result < 0)
2619                         return result;
2620         }
2621
2622         return 0;
2623 }
2624
2625 static int
2626 bond_ethdev_rss_reta_query(struct rte_eth_dev *dev,
2627                 struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2628 {
2629         int i, j;
2630         struct bond_dev_private *internals = dev->data->dev_private;
2631
2632         if (reta_size != internals->reta_size)
2633                 return -EINVAL;
2634
2635          /* Copy RETA table */
2636         for (i = 0; i < reta_size / RTE_RETA_GROUP_SIZE; i++)
2637                 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2638                         if ((reta_conf[i].mask >> j) & 0x01)
2639                                 reta_conf[i].reta[j] = internals->reta_conf[i].reta[j];
2640
2641         return 0;
2642 }
2643
2644 static int
2645 bond_ethdev_rss_hash_update(struct rte_eth_dev *dev,
2646                 struct rte_eth_rss_conf *rss_conf)
2647 {
2648         int i, result = 0;
2649         struct bond_dev_private *internals = dev->data->dev_private;
2650         struct rte_eth_rss_conf bond_rss_conf;
2651
2652         memcpy(&bond_rss_conf, rss_conf, sizeof(struct rte_eth_rss_conf));
2653
2654         bond_rss_conf.rss_hf &= internals->flow_type_rss_offloads;
2655
2656         if (bond_rss_conf.rss_hf != 0)
2657                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf = bond_rss_conf.rss_hf;
2658
2659         if (bond_rss_conf.rss_key && bond_rss_conf.rss_key_len <
2660                         sizeof(internals->rss_key)) {
2661                 if (bond_rss_conf.rss_key_len == 0)
2662                         bond_rss_conf.rss_key_len = 40;
2663                 internals->rss_key_len = bond_rss_conf.rss_key_len;
2664                 memcpy(internals->rss_key, bond_rss_conf.rss_key,
2665                                 internals->rss_key_len);
2666         }
2667
2668         for (i = 0; i < internals->slave_count; i++) {
2669                 result = rte_eth_dev_rss_hash_update(internals->slaves[i].port_id,
2670                                 &bond_rss_conf);
2671                 if (result < 0)
2672                         return result;
2673         }
2674
2675         return 0;
2676 }
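
/*
 * Editor's sketch (hypothetical BOND_PMD_USAGE_EXAMPLES guard): updating the
 * RSS hash functions and key on the bonded port. bond_ethdev_rss_hash_update()
 * masks rss_hf against the offloads every slave supports and then propagates
 * the configuration to each slave; the key below is a placeholder.
 */
#ifdef BOND_PMD_USAGE_EXAMPLES
static int
example_rss_hash_update(uint16_t bonded_port_id)
{
        static uint8_t rss_key[40] = { 0 }; /* all-zero placeholder key */
        struct rte_eth_rss_conf rss_conf = {
                .rss_key = rss_key,
                .rss_key_len = sizeof(rss_key),
                .rss_hf = ETH_RSS_IP | ETH_RSS_TCP | ETH_RSS_UDP,
        };

        return rte_eth_dev_rss_hash_update(bonded_port_id, &rss_conf);
}
#endif /* BOND_PMD_USAGE_EXAMPLES */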
2677
2678 static int
2679 bond_ethdev_rss_hash_conf_get(struct rte_eth_dev *dev,
2680                 struct rte_eth_rss_conf *rss_conf)
2681 {
2682         struct bond_dev_private *internals = dev->data->dev_private;
2683
2684         rss_conf->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
2685         rss_conf->rss_key_len = internals->rss_key_len;
2686         if (rss_conf->rss_key)
2687                 memcpy(rss_conf->rss_key, internals->rss_key, internals->rss_key_len);
2688
2689         return 0;
2690 }
2691
2692 const struct eth_dev_ops default_dev_ops = {
2693         .dev_start            = bond_ethdev_start,
2694         .dev_stop             = bond_ethdev_stop,
2695         .dev_close            = bond_ethdev_close,
2696         .dev_configure        = bond_ethdev_configure,
2697         .dev_infos_get        = bond_ethdev_info,
2698         .vlan_filter_set      = bond_ethdev_vlan_filter_set,
2699         .rx_queue_setup       = bond_ethdev_rx_queue_setup,
2700         .tx_queue_setup       = bond_ethdev_tx_queue_setup,
2701         .rx_queue_release     = bond_ethdev_rx_queue_release,
2702         .tx_queue_release     = bond_ethdev_tx_queue_release,
2703         .link_update          = bond_ethdev_link_update,
2704         .stats_get            = bond_ethdev_stats_get,
2705         .stats_reset          = bond_ethdev_stats_reset,
2706         .promiscuous_enable   = bond_ethdev_promiscuous_enable,
2707         .promiscuous_disable  = bond_ethdev_promiscuous_disable,
2708         .reta_update          = bond_ethdev_rss_reta_update,
2709         .reta_query           = bond_ethdev_rss_reta_query,
2710         .rss_hash_update      = bond_ethdev_rss_hash_update,
2711         .rss_hash_conf_get    = bond_ethdev_rss_hash_conf_get
2712 };
2713
2714 static int
2715 bond_alloc(struct rte_vdev_device *dev, uint8_t mode)
2716 {
2717         const char *name = rte_vdev_device_name(dev);
2718         uint8_t socket_id = dev->device.numa_node;
2719         struct bond_dev_private *internals = NULL;
2720         struct rte_eth_dev *eth_dev = NULL;
2721         uint32_t vlan_filter_bmp_size;
2722
2723         /* Now do all data allocation - for the eth_dev structure, dummy
2724          * pci driver and internal (private) data
2725          */
2726
2727         /* reserve an ethdev entry */
2728         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internals));
2729         if (eth_dev == NULL) {
2730                 RTE_BOND_LOG(ERR, "Unable to allocate rte_eth_dev");
2731                 goto err;
2732         }
2733
2734         internals = eth_dev->data->dev_private;
2735         eth_dev->data->nb_rx_queues = (uint16_t)1;
2736         eth_dev->data->nb_tx_queues = (uint16_t)1;
2737
2738         eth_dev->data->mac_addrs = rte_zmalloc_socket(name, ETHER_ADDR_LEN, 0,
2739                         socket_id);
2740         if (eth_dev->data->mac_addrs == NULL) {
2741                 RTE_BOND_LOG(ERR, "Unable to malloc mac_addrs");
2742                 goto err;
2743         }
2744
2745         eth_dev->dev_ops = &default_dev_ops;
2746         eth_dev->data->dev_flags = RTE_ETH_DEV_INTR_LSC;
2747
2748         rte_spinlock_init(&internals->lock);
2749
2750         internals->port_id = eth_dev->data->port_id;
2751         internals->mode = BONDING_MODE_INVALID;
2752         internals->current_primary_port = RTE_MAX_ETHPORTS + 1;
2753         internals->balance_xmit_policy = BALANCE_XMIT_POLICY_LAYER2;
2754         internals->xmit_hash = xmit_l2_hash;
2755         internals->user_defined_mac = 0;
2756
2757         internals->link_status_polling_enabled = 0;
2758
2759         internals->link_status_polling_interval_ms =
2760                 DEFAULT_POLLING_INTERVAL_10_MS;
2761         internals->link_down_delay_ms = 0;
2762         internals->link_up_delay_ms = 0;
2763
2764         internals->slave_count = 0;
2765         internals->active_slave_count = 0;
2766         internals->rx_offload_capa = 0;
2767         internals->tx_offload_capa = 0;
2768         internals->candidate_max_rx_pktlen = 0;
2769         internals->max_rx_pktlen = 0;
2770
2771         /* Initially allow to choose any offload type */
2772         internals->flow_type_rss_offloads = ETH_RSS_PROTO_MASK;
2773
2774         memset(internals->active_slaves, 0, sizeof(internals->active_slaves));
2775         memset(internals->slaves, 0, sizeof(internals->slaves));
2776
2777         /* Set mode 4 default configuration */
2778         bond_mode_8023ad_setup(eth_dev, NULL);
2779         if (bond_ethdev_mode_set(eth_dev, mode)) {
2780                 RTE_BOND_LOG(ERR, "Failed to set bonded device %d mode to %d",
2781                                  eth_dev->data->port_id, mode);
2782                 goto err;
2783         }
2784
2785         vlan_filter_bmp_size =
2786                 rte_bitmap_get_memory_footprint(ETHER_MAX_VLAN_ID + 1);
2787         internals->vlan_filter_bmpmem = rte_malloc(name, vlan_filter_bmp_size,
2788                                                    RTE_CACHE_LINE_SIZE);
2789         if (internals->vlan_filter_bmpmem == NULL) {
2790                 RTE_BOND_LOG(ERR,
2791                              "Failed to allocate vlan bitmap for bonded device %u",
2792                              eth_dev->data->port_id);
2793                 goto err;
2794         }
2795
2796         internals->vlan_filter_bmp = rte_bitmap_init(ETHER_MAX_VLAN_ID + 1,
2797                         internals->vlan_filter_bmpmem, vlan_filter_bmp_size);
2798         if (internals->vlan_filter_bmp == NULL) {
2799                 RTE_BOND_LOG(ERR,
2800                              "Failed to init vlan bitmap for bonded device %u",
2801                              eth_dev->data->port_id);
2802                 rte_free(internals->vlan_filter_bmpmem);
2803                 goto err;
2804         }
2805
2806         return eth_dev->data->port_id;
2807
2808 err:
2809         rte_free(internals);
2810         if (eth_dev != NULL) {
2811                 rte_free(eth_dev->data->mac_addrs);
2812                 rte_eth_dev_release_port(eth_dev);
2813         }
2814         return -1;
2815 }
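
/*
 * Editor's sketch (hypothetical BOND_PMD_USAGE_EXAMPLES guard): bond_alloc()
 * is normally reached through rte_eth_bond_create(), which hot-plugs a
 * net_bonding vdev and returns the new port id (or a negative errno). The
 * device name below is illustrative; the vdev bus matches the driver by
 * name prefix.
 */
#ifdef BOND_PMD_USAGE_EXAMPLES
static int
example_create_bonded_device(void)
{
        /* Mode 4 is BONDING_MODE_8023AD (LACP). */
        return rte_eth_bond_create("net_bonding_ex0", BONDING_MODE_8023AD,
                        (uint8_t)rte_socket_id());
}
#endif /* BOND_PMD_USAGE_EXAMPLES */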
2816
2817 static int
2818 bond_probe(struct rte_vdev_device *dev)
2819 {
2820         const char *name;
2821         struct bond_dev_private *internals;
2822         struct rte_kvargs *kvlist;
2823         uint8_t bonding_mode, socket_id;
2824         int arg_count, port_id;
2825         uint8_t agg_mode;
2826
2827         if (!dev)
2828                 return -EINVAL;
2829
2830         name = rte_vdev_device_name(dev);
2831         RTE_LOG(INFO, EAL, "Initializing pmd_bond for %s\n", name);
2832
2833         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev),
2834                 pmd_bond_init_valid_arguments);
2835         if (kvlist == NULL)
2836                 return -1;
2837
2838         /* Parse link bonding mode */
2839         if (rte_kvargs_count(kvlist, PMD_BOND_MODE_KVARG) == 1) {
2840                 if (rte_kvargs_process(kvlist, PMD_BOND_MODE_KVARG,
2841                                 &bond_ethdev_parse_slave_mode_kvarg,
2842                                 &bonding_mode) != 0) {
2843                         RTE_LOG(ERR, EAL, "Invalid mode for bonded device %s\n",
2844                                         name);
2845                         goto parse_error;
2846                 }
2847         } else {
2848                 RTE_LOG(ERR, EAL, "Mode must be specified exactly once for bonded "
2849                                 "device %s\n", name);
2850                 goto parse_error;
2851         }
2852
2853         /* Parse socket id to create bonding device on */
2854         arg_count = rte_kvargs_count(kvlist, PMD_BOND_SOCKET_ID_KVARG);
2855         if (arg_count == 1) {
2856                 if (rte_kvargs_process(kvlist, PMD_BOND_SOCKET_ID_KVARG,
2857                                 &bond_ethdev_parse_socket_id_kvarg, &socket_id)
2858                                 != 0) {
2859                         RTE_LOG(ERR, EAL, "Invalid socket id specified for "
2860                                         "bonded device %s\n", name);
2861                         goto parse_error;
2862                 }
2863         } else if (arg_count > 1) {
2864                 RTE_LOG(ERR, EAL, "Socket id can be specified only once for "
2865                                 "bonded device %s\n", name);
2866                 goto parse_error;
2867         } else {
2868                 socket_id = rte_socket_id();
2869         }
2870
2871         dev->device.numa_node = socket_id;
2872
2873         /* Create link bonding eth device */
2874         port_id = bond_alloc(dev, bonding_mode);
2875         if (port_id < 0) {
2876                 RTE_LOG(ERR, EAL, "Failed to create bonded device %s in mode %u on "
2877                                 "socket %u.\n", name, bonding_mode, socket_id);
2878                 goto parse_error;
2879         }
2880         internals = rte_eth_devices[port_id].data->dev_private;
2881         internals->kvlist = kvlist;
2882
2884         if (rte_kvargs_count(kvlist, PMD_BOND_AGG_MODE_KVARG) == 1) {
2885                 if (rte_kvargs_process(kvlist,
2886                                 PMD_BOND_AGG_MODE_KVARG,
2887                                 &bond_ethdev_parse_slave_agg_mode_kvarg,
2888                                 &agg_mode) != 0) {
2889                         RTE_LOG(ERR, EAL,
2890                                         "Failed to parse agg selection mode for bonded device %s\n",
2891                                         name);
2892                         goto parse_error;
2893                 }
2894
2895                 if (internals->mode == BONDING_MODE_8023AD)
2896                         rte_eth_bond_8023ad_agg_selection_set(port_id,
2897                                         agg_mode);
2898         } else {
2899                 rte_eth_bond_8023ad_agg_selection_set(port_id, AGG_STABLE);
2900         }
2901
2902         RTE_LOG(INFO, EAL, "Created bonded device %s on port %d in mode %u on "
2903                         "socket %u.\n", name, port_id, bonding_mode, socket_id);
2904         return 0;
2905
2906 parse_error:
2907         rte_kvargs_free(kvlist);
2908
2909         return -1;
2910 }
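
/*
 * Editor's note: bond_probe() is driven by an EAL --vdev argument. A
 * hypothetical invocation (illustrative PCI addresses) that exercises the
 * mode, slave, agg_mode and socket_id kvargs parsed above:
 *
 *   testpmd -l 0-3 -n 4 --vdev \
 *       'net_bonding0,mode=4,slave=0000:02:00.0,slave=0000:02:00.1,agg_mode=stable,socket_id=0' \
 *       -- -i
 */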
2911
2912 static int
2913 bond_remove(struct rte_vdev_device *dev)
2914 {
2915         struct rte_eth_dev *eth_dev;
2916         struct bond_dev_private *internals;
2917         const char *name;
2918
2919         if (!dev)
2920                 return -EINVAL;
2921
2922         name = rte_vdev_device_name(dev);
2923         RTE_LOG(INFO, EAL, "Uninitializing pmd_bond for %s\n", name);
2924
2925         /* Now free all allocated data - the eth_dev structure, dummy
2926          * pci driver and internal (private) data
2927          */
2928
2929         /* find an ethdev entry */
2930         eth_dev = rte_eth_dev_allocated(name);
2931         if (eth_dev == NULL)
2932                 return -ENODEV;
2933
2934         RTE_ASSERT(eth_dev->device == &dev->device);
2935
2936         internals = eth_dev->data->dev_private;
2937         if (internals->slave_count != 0)
2938                 return -EBUSY;
2939
2940         if (eth_dev->data->dev_started == 1) {
2941                 bond_ethdev_stop(eth_dev);
2942                 bond_ethdev_close(eth_dev);
2943         }
2944
2945         eth_dev->dev_ops = NULL;
2946         eth_dev->rx_pkt_burst = NULL;
2947         eth_dev->tx_pkt_burst = NULL;
2948
2950         rte_bitmap_free(internals->vlan_filter_bmp);
2951         rte_free(internals->vlan_filter_bmpmem);
2952         rte_free(eth_dev->data->dev_private);
2953         rte_free(eth_dev->data->mac_addrs);
2954
2955         rte_eth_dev_release_port(eth_dev);
2956
2957         return 0;
2958 }
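
/*
 * Editor's sketch (hypothetical BOND_PMD_USAGE_EXAMPLES guard): bond_remove()
 * refuses to tear down a device that still has slaves (-EBUSY), so an
 * application is expected to detach them first and then free the vdev, e.g.
 * through rte_eth_bond_free():
 */
#ifdef BOND_PMD_USAGE_EXAMPLES
static int
example_destroy_bonded_device(uint16_t bonded_port_id, const char *name)
{
        uint16_t slaves[RTE_MAX_ETHPORTS];
        int n, i;

        n = rte_eth_bond_slaves_get(bonded_port_id, slaves, RTE_MAX_ETHPORTS);
        if (n < 0)
                return n;

        for (i = 0; i < n; i++)
                rte_eth_bond_slave_remove(bonded_port_id, slaves[i]);

        /* Unplugs the vdev, which ends up in bond_remove() above. */
        return rte_eth_bond_free(name);
}
#endif /* BOND_PMD_USAGE_EXAMPLES */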
2959
2960 /* This function resolves the slave port ids after all the other pdevs and
2961  * vdevs have been allocated */
2962 static int
2963 bond_ethdev_configure(struct rte_eth_dev *dev)
2964 {
2965         const char *name = dev->device->name;
2966         struct bond_dev_private *internals = dev->data->dev_private;
2967         struct rte_kvargs *kvlist = internals->kvlist;
2968         int arg_count;
2969         uint16_t port_id = dev - rte_eth_devices;
2970         uint8_t agg_mode;
2971
2972         static const uint8_t default_rss_key[40] = {
2973                 0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2, 0x41, 0x67, 0x25, 0x3D,
2974                 0x43, 0xA3, 0x8F, 0xB0, 0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
2975                 0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C, 0x6A, 0x42, 0xB7, 0x3B,
2976                 0xBE, 0xAC, 0x01, 0xFA
2977         };
2978
2979         unsigned i, j;
2980
2981         /* If RSS is enabled, fill table and key with default values */
2982         if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
2983                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = internals->rss_key;
2984                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len = 0;
2985                 memcpy(internals->rss_key, default_rss_key, sizeof(default_rss_key));
2986
2987                 for (i = 0; i < RTE_DIM(internals->reta_conf); i++) {
2988                         internals->reta_conf[i].mask = ~0LL;
2989                         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2990                                 internals->reta_conf[i].reta[j] = j % dev->data->nb_rx_queues;
2991                 }
2992         }
2993
2994         /* set the max_rx_pktlen */
2995         internals->max_rx_pktlen = internals->candidate_max_rx_pktlen;
2996
2997         /*
2998          * if no kvlist, it means that this bonded device has been created
2999          * through the bonding api.
3000          */
3001         if (!kvlist)
3002                 return 0;
3003
3004         /* Parse MAC address for bonded device */
3005         arg_count = rte_kvargs_count(kvlist, PMD_BOND_MAC_ADDR_KVARG);
3006         if (arg_count == 1) {
3007                 struct ether_addr bond_mac;
3008
3009                 if (rte_kvargs_process(kvlist, PMD_BOND_MAC_ADDR_KVARG,
3010                                 &bond_ethdev_parse_bond_mac_addr_kvarg, &bond_mac) < 0) {
3011                         RTE_LOG(ERR, EAL, "Invalid mac address for bonded device %s\n",
3012                                         name);
3013                         return -1;
3014                 }
3015
3016                 /* Set MAC address */
3017                 if (rte_eth_bond_mac_address_set(port_id, &bond_mac) != 0) {
3018                         RTE_LOG(ERR, EAL,
3019                                         "Failed to set mac address on bonded device %s\n",
3020                                         name);
3021                         return -1;
3022                 }
3023         } else if (arg_count > 1) {
3024                 RTE_LOG(ERR, EAL,
3025                                 "MAC address can be specified only once for bonded device %s\n",
3026                                 name);
3027                 return -1;
3028         }
3029
3030         /* Parse/set balance mode transmit policy */
3031         arg_count = rte_kvargs_count(kvlist, PMD_BOND_XMIT_POLICY_KVARG);
3032         if (arg_count == 1) {
3033                 uint8_t xmit_policy;
3034
3035                 if (rte_kvargs_process(kvlist, PMD_BOND_XMIT_POLICY_KVARG,
3036                                 &bond_ethdev_parse_balance_xmit_policy_kvarg, &xmit_policy) !=
3037                                                 0) {
3038                         RTE_LOG(ERR, EAL,
3039                                         "Invalid xmit policy specified for bonded device %s\n",
3040                                         name);
3041                         return -1;
3042                 }
3043
3044                 /* Set balance mode transmit policy */
3045                 if (rte_eth_bond_xmit_policy_set(port_id, xmit_policy) != 0) {
3046                         RTE_LOG(ERR, EAL,
3047                                         "Failed to set balance xmit policy on bonded device %s\n",
3048                                         name);
3049                         return -1;
3050                 }
3051         } else if (arg_count > 1) {
3052                 RTE_LOG(ERR, EAL,
3053                                 "Transmit policy can be specified only once for bonded device"
3054                                 " %s\n", name);
3055                 return -1;
3056         }
3057
3058         if (rte_kvargs_count(kvlist, PMD_BOND_AGG_MODE_KVARG) == 1) {
3059                 if (rte_kvargs_process(kvlist,
3060                                 PMD_BOND_AGG_MODE_KVARG,
3061                                 &bond_ethdev_parse_slave_agg_mode_kvarg,
3062                                 &agg_mode) != 0) {
3063                         RTE_LOG(ERR, EAL,
3064                                         "Failed to parse agg selection mode for bonded device %s\n",
3065                                         name);
3066                         return -1;
3067                 }
3068                 if (internals->mode == BONDING_MODE_8023AD)
3069                         rte_eth_bond_8023ad_agg_selection_set(port_id, agg_mode);
3070         }
3071
3072         /* Parse/add slave ports to bonded device */
3073         if (rte_kvargs_count(kvlist, PMD_BOND_SLAVE_PORT_KVARG) > 0) {
3074                 struct bond_ethdev_slave_ports slave_ports;
3075                 unsigned i;
3076
3077                 memset(&slave_ports, 0, sizeof(slave_ports));
3078
3079                 if (rte_kvargs_process(kvlist, PMD_BOND_SLAVE_PORT_KVARG,
3080                                 &bond_ethdev_parse_slave_port_kvarg, &slave_ports) != 0) {
3081                         RTE_LOG(ERR, EAL,
3082                                         "Failed to parse slave ports for bonded device %s\n",
3083                                         name);
3084                         return -1;
3085                 }
3086
3087                 for (i = 0; i < slave_ports.slave_count; i++) {
3088                         if (rte_eth_bond_slave_add(port_id, slave_ports.slaves[i]) != 0) {
3089                                 RTE_LOG(ERR, EAL,
3090                                                 "Failed to add port %d as slave to bonded device %s\n",
3091                                                 slave_ports.slaves[i], name);
3092                         }
3093                 }
3094
3095         } else {
3096                 RTE_LOG(ERR, EAL, "No slaves specified for bonded device %s\n", name);
3097                 return -1;
3098         }
3099
3100         /* Parse/set primary slave port id*/
3101         arg_count = rte_kvargs_count(kvlist, PMD_BOND_PRIMARY_SLAVE_KVARG);
3102         if (arg_count == 1) {
3103                 uint16_t primary_slave_port_id;
3104
3105                 if (rte_kvargs_process(kvlist,
3106                                 PMD_BOND_PRIMARY_SLAVE_KVARG,
3107                                 &bond_ethdev_parse_primary_slave_port_id_kvarg,
3108                                 &primary_slave_port_id) < 0) {
3109                         RTE_LOG(ERR, EAL,
3110                                         "Invalid primary slave port id specified for bonded device"
3111                                         " %s\n", name);
3112                         return -1;
3113                 }
3114
3115                 /* Set primary slave port id */
3116                 if (rte_eth_bond_primary_set(port_id, primary_slave_port_id)
3117                                 != 0) {
3118                         RTE_LOG(ERR, EAL,
3119                                         "Failed to set primary slave port %d on bonded device %s\n",
3120                                         primary_slave_port_id, name);
3121                         return -1;
3122                 }
3123         } else if (arg_count > 1) {
3124                 RTE_LOG(ERR, EAL,
3125                                 "Primary slave can be specified only once for bonded device"
3126                                 " %s\n", name);
3127                 return -1;
3128         }
3129
3130         /* Parse link status monitor polling interval */
3131         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LSC_POLL_PERIOD_KVARG);
3132         if (arg_count == 1) {
3133                 uint32_t lsc_poll_interval_ms;
3134
3135                 if (rte_kvargs_process(kvlist,
3136                                 PMD_BOND_LSC_POLL_PERIOD_KVARG,
3137                                 &bond_ethdev_parse_time_ms_kvarg,
3138                                 &lsc_poll_interval_ms) < 0) {
3139                         RTE_LOG(ERR, EAL,
3140                                         "Invalid lsc polling interval value specified for bonded"
3141                                         " device %s\n", name);
3142                         return -1;
3143                 }
3144
3145                 if (rte_eth_bond_link_monitoring_set(port_id, lsc_poll_interval_ms)
3146                                 != 0) {
3147                         RTE_LOG(ERR, EAL,
3148                                         "Failed to set lsc monitor polling interval (%u ms) on"
3149                                         " bonded device %s\n", lsc_poll_interval_ms, name);
3150                         return -1;
3151                 }
3152         } else if (arg_count > 1) {
3153                 RTE_LOG(ERR, EAL,
3154                                 "LSC polling interval can be specified only once for bonded"
3155                                 " device %s\n", name);
3156                 return -1;
3157         }
3158
3159         /* Parse link up interrupt propagation delay */
3160         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_UP_PROP_DELAY_KVARG);
3161         if (arg_count == 1) {
3162                 uint32_t link_up_delay_ms;
3163
3164                 if (rte_kvargs_process(kvlist,
3165                                 PMD_BOND_LINK_UP_PROP_DELAY_KVARG,
3166                                 &bond_ethdev_parse_time_ms_kvarg,
3167                                 &link_up_delay_ms) < 0) {
3168                         RTE_LOG(ERR, EAL,
3169                                         "Invalid link up propagation delay value specified for"
3170                                         " bonded device %s\n", name);
3171                         return -1;
3172                 }
3173
3174                 /* Set link up propagation delay */
3175                 if (rte_eth_bond_link_up_prop_delay_set(port_id, link_up_delay_ms)
3176                                 != 0) {
3177                         RTE_LOG(ERR, EAL,
3178                                         "Failed to set link up propagation delay (%u ms) on bonded"
3179                                         " device %s\n", link_up_delay_ms, name);
3180                         return -1;
3181                 }
3182         } else if (arg_count > 1) {
3183                 RTE_LOG(ERR, EAL,
3184                                 "Link up propagation delay can be specified only once for"
3185                                 " bonded device %s\n", name);
3186                 return -1;
3187         }
3188
3189         /* Parse link down interrupt propagation delay */
3190         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG);
3191         if (arg_count == 1) {
3192                 uint32_t link_down_delay_ms;
3193
3194                 if (rte_kvargs_process(kvlist,
3195                                 PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG,
3196                                 &bond_ethdev_parse_time_ms_kvarg,
3197                                 &link_down_delay_ms) < 0) {
3198                         RTE_LOG(ERR, EAL,
3199                                         "Invalid link down propagation delay value specified for"
3200                                         " bonded device %s\n", name);
3201                         return -1;
3202                 }
3203
3204                 /* Set link down propagation delay */
3205                 if (rte_eth_bond_link_down_prop_delay_set(port_id, link_down_delay_ms)
3206                                 != 0) {
3207                         RTE_LOG(ERR, EAL,
3208                                         "Failed to set link down propagation delay (%u ms) on"
3209                                         " bonded device %s\n", link_down_delay_ms, name);
3210                         return -1;
3211                 }
3212         } else if (arg_count > 1) {
3213                 RTE_LOG(ERR, EAL,
3214                                 "Link down propagation delay can be specified only once for"
3215                                 " bonded device %s\n", name);
3216                 return -1;
3217         }
3218
3219         return 0;
3220 }
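
/*
 * Editor's sketch (hypothetical BOND_PMD_USAGE_EXAMPLES guard): the same
 * knobs that bond_ethdev_configure() sets from devargs can be driven through
 * the public bonding API once the slaves are attached. Port ids and the
 * 100 ms polling interval are illustrative.
 */
#ifdef BOND_PMD_USAGE_EXAMPLES
static int
example_configure_bonded_device(uint16_t bonded_port_id,
                uint16_t slave_port_id)
{
        int ret;

        ret = rte_eth_bond_slave_add(bonded_port_id, slave_port_id);
        if (ret != 0)
                return ret;

        ret = rte_eth_bond_primary_set(bonded_port_id, slave_port_id);
        if (ret != 0)
                return ret;

        ret = rte_eth_bond_xmit_policy_set(bonded_port_id,
                        BALANCE_XMIT_POLICY_LAYER34);
        if (ret != 0)
                return ret;

        /* Poll slave link status every 100 ms. */
        return rte_eth_bond_link_monitoring_set(bonded_port_id, 100);
}
#endif /* BOND_PMD_USAGE_EXAMPLES */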
3221
3222 static struct rte_vdev_driver pmd_bond_drv = {
3223         .probe = bond_probe,
3224         .remove = bond_remove,
3225 };
3226
3227 RTE_PMD_REGISTER_VDEV(net_bonding, pmd_bond_drv);
3228 RTE_PMD_REGISTER_ALIAS(net_bonding, eth_bond);
3229
3230 RTE_PMD_REGISTER_PARAM_STRING(net_bonding,
3231         "slave=<ifc> "
3232         "primary=<ifc> "
3233         "mode=[0-6] "
3234         "xmit_policy=[l2 | l23 | l34] "
3235         "agg_mode=[count | stable | bandwidth] "
3236         "socket_id=<int> "
3237         "mac=<mac addr> "
3238         "lsc_poll_period_ms=<int> "
3239         "up_delay=<int> "
3240         "down_delay=<int>");
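
/*
 * Editor's note: a hypothetical EAL command line exercising most of the
 * parameters above (PCI addresses, MAC and timings are illustrative only;
 * wrapped here for readability, pass it as a single argument):
 *
 *   --vdev 'net_bonding0,mode=1,slave=0000:02:00.0,slave=0000:02:00.1,
 *           primary=0000:02:00.0,mac=00:11:22:33:44:55,
 *           lsc_poll_period_ms=100,up_delay=10,down_delay=50'
 */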