net/bonding: fix link properties management
[dpdk.git] drivers/net/bonding/rte_eth_bond_pmd.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 #include <stdlib.h>
34 #include <netinet/in.h>
35
36 #include <rte_mbuf.h>
37 #include <rte_malloc.h>
38 #include <rte_ethdev.h>
39 #include <rte_ethdev_vdev.h>
40 #include <rte_tcp.h>
41 #include <rte_udp.h>
42 #include <rte_ip.h>
43 #include <rte_ip_frag.h>
44 #include <rte_devargs.h>
45 #include <rte_kvargs.h>
46 #include <rte_vdev.h>
47 #include <rte_alarm.h>
48 #include <rte_cycles.h>
49
50 #include "rte_eth_bond.h"
51 #include "rte_eth_bond_private.h"
52 #include "rte_eth_bond_8023ad_private.h"
53
54 #define REORDER_PERIOD_MS 10
55 #define DEFAULT_POLLING_INTERVAL_10_MS (10)
56
57 #define HASH_L4_PORTS(h) ((h)->src_port ^ (h)->dst_port)
58
59 /* Table for statistics in mode 5 TLB */
60 static uint64_t tlb_last_obytets[RTE_MAX_ETHPORTS];
61
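/*
 * Return the number of bytes occupied by VLAN headers (at most two
 * stacked tags) that follow the Ethernet header, and update *proto to
 * the EtherType of the encapsulated payload.
 */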
62 static inline size_t
63 get_vlan_offset(struct ether_hdr *eth_hdr, uint16_t *proto)
64 {
65         size_t vlan_offset = 0;
66
67         if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
68                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
69
70                 vlan_offset = sizeof(struct vlan_hdr);
71                 *proto = vlan_hdr->eth_proto;
72
73                 if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
74                         vlan_hdr = vlan_hdr + 1;
75                         *proto = vlan_hdr->eth_proto;
76                         vlan_offset += sizeof(struct vlan_hdr);
77                 }
78         }
79         return vlan_offset;
80 }
81
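/*
 * Basic receive burst: poll each active slave in order, appending the
 * received packets to bufs, until nb_pkts packets have been gathered or
 * every active slave has been polled once.
 */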
82 static uint16_t
83 bond_ethdev_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
84 {
85         struct bond_dev_private *internals;
86
87         uint16_t num_rx_slave = 0;
88         uint16_t num_rx_total = 0;
89
90         int i;
91
92         /* Cast to structure containing the bonded device's port id and queue id */
93         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
94
95         internals = bd_rx_q->dev_private;
96
97
98         for (i = 0; i < internals->active_slave_count && nb_pkts; i++) {
99                 /* Offset of pointer to *bufs increases as packets are received
100                  * from other slaves */
101                 num_rx_slave = rte_eth_rx_burst(internals->active_slaves[i],
102                                 bd_rx_q->queue_id, bufs + num_rx_total, nb_pkts);
103                 if (num_rx_slave) {
104                         num_rx_total += num_rx_slave;
105                         nb_pkts -= num_rx_slave;
106                 }
107         }
108
109         return num_rx_total;
110 }
111
112 static uint16_t
113 bond_ethdev_rx_burst_active_backup(void *queue, struct rte_mbuf **bufs,
114                 uint16_t nb_pkts)
115 {
116         struct bond_dev_private *internals;
117
118         /* Cast to structure containing the bonded device's port id and queue id */
119         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
120
121         internals = bd_rx_q->dev_private;
122
123         return rte_eth_rx_burst(internals->current_primary_port,
124                         bd_rx_q->queue_id, bufs, nb_pkts);
125 }
126
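/*
 * Return non-zero if the frame is an untagged slow-protocol frame
 * (EtherType ETHER_TYPE_SLOW) carrying a LACPDU or a marker PDU.
 */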
127 static inline uint8_t
128 is_lacp_packets(uint16_t ethertype, uint8_t subtype, uint16_t vlan_tci)
129 {
130         const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
131
132         return !vlan_tci && (ethertype == ether_type_slow_be &&
133                 (subtype == SLOW_SUBTYPE_MARKER || subtype == SLOW_SUBTYPE_LACP));
134 }
135
136 /*****************************************************************************
137  * Flow director's setup for mode 4 optimization
138  */
139
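/*
 * The flow pattern and attribute below match ingress frames whose
 * EtherType is ETHER_TYPE_SLOW; combined with a queue action they steer
 * LACP control traffic to the dedicated mode 4 RX queue of a slave,
 * keeping it off the regular data-path queues.
 */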
140 static struct rte_flow_item_eth flow_item_eth_type_8023ad = {
141         .dst.addr_bytes = { 0 },
142         .src.addr_bytes = { 0 },
143         .type = RTE_BE16(ETHER_TYPE_SLOW),
144 };
145
146 static struct rte_flow_item_eth flow_item_eth_mask_type_8023ad = {
147         .dst.addr_bytes = { 0 },
148         .src.addr_bytes = { 0 },
149         .type = 0xFFFF,
150 };
151
152 static struct rte_flow_item flow_item_8023ad[] = {
153         {
154                 .type = RTE_FLOW_ITEM_TYPE_ETH,
155                 .spec = &flow_item_eth_type_8023ad,
156                 .last = NULL,
157                 .mask = &flow_item_eth_mask_type_8023ad,
158         },
159         {
160                 .type = RTE_FLOW_ITEM_TYPE_END,
161                 .spec = NULL,
162                 .last = NULL,
163                 .mask = NULL,
164         }
165 };
166
167 const struct rte_flow_attr flow_attr_8023ad = {
168         .group = 0,
169         .priority = 0,
170         .ingress = 1,
171         .egress = 0,
172         .reserved = 0,
173 };
174
175 int
176 bond_ethdev_8023ad_flow_verify(struct rte_eth_dev *bond_dev,
177                 uint8_t slave_port) {
178         struct rte_flow_error error;
179         struct bond_dev_private *internals = (struct bond_dev_private *)
180                         (bond_dev->data->dev_private);
181
182         struct rte_flow_action_queue lacp_queue_conf = {
183                 .index = internals->mode4.dedicated_queues.rx_qid,
184         };
185
186         const struct rte_flow_action actions[] = {
187                 {
188                         .type = RTE_FLOW_ACTION_TYPE_QUEUE,
189                         .conf = &lacp_queue_conf
190                 },
191                 {
192                         .type = RTE_FLOW_ACTION_TYPE_END,
193                 }
194         };
195
196         int ret = rte_flow_validate(slave_port, &flow_attr_8023ad,
197                         flow_item_8023ad, actions, &error);
198         if (ret < 0)
199                 return -1;
200
201         return 0;
202 }
203
204 int
205 bond_8023ad_slow_pkt_hw_filter_supported(uint8_t port_id) {
206         struct rte_eth_dev *bond_dev = &rte_eth_devices[port_id];
207         struct bond_dev_private *internals = (struct bond_dev_private *)
208                         (bond_dev->data->dev_private);
209         struct rte_eth_dev_info bond_info, slave_info;
210         uint8_t idx;
211
212         /* Verify that all slaves in the bonding device support flow director */
213         if (internals->slave_count > 0) {
214                 rte_eth_dev_info_get(bond_dev->data->port_id, &bond_info);
215
216                 internals->mode4.dedicated_queues.rx_qid = bond_info.nb_rx_queues;
217                 internals->mode4.dedicated_queues.tx_qid = bond_info.nb_tx_queues;
218
219                 for (idx = 0; idx < internals->slave_count; idx++) {
220                         rte_eth_dev_info_get(internals->slaves[idx].port_id,
221                                         &slave_info);
222
223                         if (bond_ethdev_8023ad_flow_verify(bond_dev,
224                                         internals->slaves[idx].port_id) != 0)
225                                 return -1;
226                 }
227         }
228
229         return 0;
230 }
231
232 int
233 bond_ethdev_8023ad_flow_set(struct rte_eth_dev *bond_dev, uint8_t slave_port) {
234
235         struct rte_flow_error error;
236         struct bond_dev_private *internals = (struct bond_dev_private *)
237                         (bond_dev->data->dev_private);
238
239         struct rte_flow_action_queue lacp_queue_conf = {
240                 .index = internals->mode4.dedicated_queues.rx_qid,
241         };
242
243         const struct rte_flow_action actions[] = {
244                 {
245                         .type = RTE_FLOW_ACTION_TYPE_QUEUE,
246                         .conf = &lacp_queue_conf
247                 },
248                 {
249                         .type = RTE_FLOW_ACTION_TYPE_END,
250                 }
251         };
252
253         internals->mode4.dedicated_queues.flow[slave_port] = rte_flow_create(slave_port,
254                         &flow_attr_8023ad, flow_item_8023ad, actions, &error);
255         if (internals->mode4.dedicated_queues.flow[slave_port] == NULL) {
256                 RTE_BOND_LOG(ERR, "bond_ethdev_8023ad_flow_set: %s "
257                                 "(slave_port=%d queue_id=%d)",
258                                 error.message, slave_port,
259                                 internals->mode4.dedicated_queues.rx_qid);
260                 return -1;
261         }
262
263         return 0;
264 }
265
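/*
 * Mode 4 receive path used when dedicated control queues are enabled:
 * slow frames are steered away by the rte_flow rule defined above, so
 * this path simply reads packets from the active slaves in round-robin
 * order starting from internals->active_slave.
 */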
266 static uint16_t
267 bond_ethdev_rx_burst_8023ad_fast_queue(void *queue, struct rte_mbuf **bufs,
268                 uint16_t nb_pkts)
269 {
270         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
271         struct bond_dev_private *internals = bd_rx_q->dev_private;
272         uint16_t num_rx_total = 0;      /* Total number of received packets */
273         uint8_t slaves[RTE_MAX_ETHPORTS];
274         uint8_t slave_count;
275
276         uint8_t i, idx;
277
278         /* Copy slave list to protect against slave up/down changes during rx
279          * bursting */
280         slave_count = internals->active_slave_count;
281         memcpy(slaves, internals->active_slaves,
282                         sizeof(internals->active_slaves[0]) * slave_count);
283
284         for (i = 0, idx = internals->active_slave;
285                         i < slave_count && num_rx_total < nb_pkts; i++, idx++) {
286                 idx = idx % slave_count;
287
288                 /* Read packets from this slave */
289                 num_rx_total += rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id,
290                                 &bufs[num_rx_total], nb_pkts - num_rx_total);
291         }
292
293         internals->active_slave = idx;
294
295         return num_rx_total;
296 }
297
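/*
 * Mode 4 transmit path used when dedicated control queues are enabled:
 * data packets are hashed across the slaves that are currently
 * distributing.  Unlike the regular mode 4 TX path further below, it
 * does not dequeue slow frames from each slave's tx_ring.
 */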
298 static uint16_t
299 bond_ethdev_tx_burst_8023ad_fast_queue(void *queue, struct rte_mbuf **bufs,
300                 uint16_t nb_pkts)
301 {
302         struct bond_dev_private *internals;
303         struct bond_tx_queue *bd_tx_q;
304
305         uint8_t num_of_slaves;
306         uint8_t slaves[RTE_MAX_ETHPORTS];
307          /* positions in slaves array, not port IDs */
308         uint8_t distributing_offsets[RTE_MAX_ETHPORTS];
309         uint8_t distributing_count;
310
311         uint16_t num_tx_slave, num_tx_total = 0, num_tx_fail_total = 0;
312         uint16_t i, op_slave_idx;
313
314         struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
315
316         /* Total amount of packets in slave_bufs */
317         uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
318         /* Slow packets placed in each slave */
319
320         if (unlikely(nb_pkts == 0))
321                 return 0;
322
323         bd_tx_q = (struct bond_tx_queue *)queue;
324         internals = bd_tx_q->dev_private;
325
326         /* Copy slave list to protect against slave up/down changes during tx
327          * bursting */
328         num_of_slaves = internals->active_slave_count;
329         if (num_of_slaves < 1)
330                 return num_tx_total;
331
332         memcpy(slaves, internals->active_slaves, sizeof(slaves[0]) *
333                         num_of_slaves);
334
335         distributing_count = 0;
336         for (i = 0; i < num_of_slaves; i++) {
337                 struct port *port = &mode_8023ad_ports[slaves[i]];
338                 if (ACTOR_STATE(port, DISTRIBUTING))
339                         distributing_offsets[distributing_count++] = i;
340         }
341
342         if (likely(distributing_count > 0)) {
343                 /* Populate slave mbuf arrays with the packets to be sent */
344                 for (i = 0; i < nb_pkts; i++) {
345                         /* Select output slave using hash based on xmit policy */
346                         op_slave_idx = internals->xmit_hash(bufs[i],
347                                         distributing_count);
348
349                         /* Populate slave mbuf arrays with mbufs for that slave.
350                          * Use only slaves that are currently distributing.
351                          */
352                         uint8_t slave_offset =
353                                         distributing_offsets[op_slave_idx];
354                         slave_bufs[slave_offset][slave_nb_pkts[slave_offset]] =
355                                         bufs[i];
356                         slave_nb_pkts[slave_offset]++;
357                 }
358         }
359
360         /* Send packet burst on each slave device */
361         for (i = 0; i < num_of_slaves; i++) {
362                 if (slave_nb_pkts[i] == 0)
363                         continue;
364
365                 num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
366                                 slave_bufs[i], slave_nb_pkts[i]);
367
368                 num_tx_total += num_tx_slave;
369                 num_tx_fail_total += slave_nb_pkts[i] - num_tx_slave;
370
371                 /* If tx burst fails move packets to end of bufs */
372                 if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
373                         uint16_t j = nb_pkts - num_tx_fail_total;
374                         for ( ; num_tx_slave < slave_nb_pkts[i]; j++,
375                                         num_tx_slave++)
376                                 bufs[j] = slave_bufs[i][num_tx_slave];
377                 }
378         }
379
380         return num_tx_total;
381 }
382
383
384 static uint16_t
385 bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
386                 uint16_t nb_pkts)
387 {
388         /* Cast to structure containing the bonded device's port id and queue id */
389         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
390         struct bond_dev_private *internals = bd_rx_q->dev_private;
391         struct ether_addr bond_mac;
392
393         struct ether_hdr *hdr;
394
395         const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
396         uint16_t num_rx_total = 0;      /* Total number of received packets */
397         uint8_t slaves[RTE_MAX_ETHPORTS];
398         uint8_t slave_count, idx;
399
400         uint8_t collecting;  /* current slave collecting status */
401         const uint8_t promisc = internals->promiscuous_en;
402         uint8_t i, j, k;
403         uint8_t subtype;
404
405         rte_eth_macaddr_get(internals->port_id, &bond_mac);
406         /* Copy slave list to protect against slave up/down changes during rx
407          * bursting */
408         slave_count = internals->active_slave_count;
409         memcpy(slaves, internals->active_slaves,
410                         sizeof(internals->active_slaves[0]) * slave_count);
411
412         idx = internals->active_slave;
413         if (idx >= slave_count) {
414                 internals->active_slave = 0;
415                 idx = 0;
416         }
417         for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
418                 j = num_rx_total;
419                 collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
420                                          COLLECTING);
421
422                 /* Read packets from this slave */
423                 num_rx_total += rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id,
424                                 &bufs[num_rx_total], nb_pkts - num_rx_total);
425
426                 for (k = j; k < 2 && k < num_rx_total; k++)
427                         rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));
428
429                 /* Handle slow protocol packets. */
430                 while (j < num_rx_total) {
431
432                         /* If the packet is known not to be pure L2, skip it */
433                         if ((bufs[j]->packet_type & ~RTE_PTYPE_L2_ETHER) != 0) {
434                                 j++;
435                                 continue;
436                         }
437
438                         if (j + 3 < num_rx_total)
439                                 rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *));
440
441                         hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
442                         subtype = ((struct slow_protocol_frame *)hdr)->slow_protocol.subtype;
443
444                         /* Remove the packet from the array if it is a slow packet, if the
445                          * slave is not in collecting state, or if the bonding interface is not
446                          * in promiscuous mode and the destination address does not match. */
447                         if (unlikely(is_lacp_packets(hdr->ether_type, subtype, bufs[j]->vlan_tci) ||
448                                 !collecting || (!promisc &&
449                                         !is_multicast_ether_addr(&hdr->d_addr) &&
450                                         !is_same_ether_addr(&bond_mac, &hdr->d_addr)))) {
451
452                                 if (hdr->ether_type == ether_type_slow_be) {
453                                         bond_mode_8023ad_handle_slow_pkt(
454                                             internals, slaves[idx], bufs[j]);
455                                 } else
456                                         rte_pktmbuf_free(bufs[j]);
457
458                                 /* Packet is managed by mode 4 or dropped, shift the array */
459                                 num_rx_total--;
460                                 if (j < num_rx_total) {
461                                         memmove(&bufs[j], &bufs[j + 1], sizeof(bufs[0]) *
462                                                 (num_rx_total - j));
463                                 }
464                         } else
465                                 j++;
466                 }
467                 if (unlikely(++idx == slave_count))
468                         idx = 0;
469         }
470
471         internals->active_slave = idx;
472         return num_rx_total;
473 }
474
475 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
476 uint32_t burstnumberRX;
477 uint32_t burstnumberTX;
478
479 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
480
481 static void
482 arp_op_name(uint16_t arp_op, char *buf)
483 {
484         switch (arp_op) {
485         case ARP_OP_REQUEST:
486                 snprintf(buf, sizeof("ARP Request"), "%s", "ARP Request");
487                 return;
488         case ARP_OP_REPLY:
489                 snprintf(buf, sizeof("ARP Reply"), "%s", "ARP Reply");
490                 return;
491         case ARP_OP_REVREQUEST:
492                 snprintf(buf, sizeof("Reverse ARP Request"), "%s",
493                                 "Reverse ARP Request");
494                 return;
495         case ARP_OP_REVREPLY:
496                 snprintf(buf, sizeof("Reverse ARP Reply"), "%s",
497                                 "Reverse ARP Reply");
498                 return;
499         case ARP_OP_INVREQUEST:
500                 snprintf(buf, sizeof("Peer Identify Request"), "%s",
501                                 "Peer Identify Request");
502                 return;
503         case ARP_OP_INVREPLY:
504                 snprintf(buf, sizeof("Peer Identify Reply"), "%s",
505                                 "Peer Identify Reply");
506                 return;
507         default:
508                 break;
509         }
510         snprintf(buf, sizeof("Unknown"), "%s", "Unknown");
511         return;
512 }
513 #endif
514 #define MaxIPv4String   16
515 static void
516 ipv4_addr_to_dot(uint32_t be_ipv4_addr, char *buf, uint8_t buf_size)
517 {
518         uint32_t ipv4_addr;
519
520         ipv4_addr = rte_be_to_cpu_32(be_ipv4_addr);
521         snprintf(buf, buf_size, "%d.%d.%d.%d", (ipv4_addr >> 24) & 0xFF,
522                 (ipv4_addr >> 16) & 0xFF, (ipv4_addr >> 8) & 0xFF,
523                 ipv4_addr & 0xFF);
524 }
525
526 #define MAX_CLIENTS_NUMBER      128
527 uint8_t active_clients;
528 struct client_stats_t {
529         uint8_t port;
530         uint32_t ipv4_addr;
531         uint32_t ipv4_rx_packets;
532         uint32_t ipv4_tx_packets;
533 };
534 struct client_stats_t client_stats[MAX_CLIENTS_NUMBER];
535
536 static void
537 update_client_stats(uint32_t addr, uint8_t port, uint32_t *TXorRXindicator)
538 {
539         int i = 0;
540
541         for (; i < MAX_CLIENTS_NUMBER; i++)     {
542                 if ((client_stats[i].ipv4_addr == addr) && (client_stats[i].port == port))      {
543                         /* Existing client: update its RX or TX packet count */
544                         if (TXorRXindicator == &burstnumberRX)
545                                 client_stats[i].ipv4_rx_packets++;
546                         else
547                                 client_stats[i].ipv4_tx_packets++;
548                         return;
549                 }
550         }
551         /* We have a new client: insert it into the table and update its stats */
552         if (TXorRXindicator == &burstnumberRX)
553                 client_stats[active_clients].ipv4_rx_packets++;
554         else
555                 client_stats[active_clients].ipv4_tx_packets++;
556         client_stats[active_clients].ipv4_addr = addr;
557         client_stats[active_clients].port = port;
558         active_clients++;
559
560 }
561
562 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
563 #define MODE6_DEBUG(info, src_ip, dst_ip, eth_h, arp_op, port, burstnumber)     \
564                 RTE_LOG(DEBUG, PMD, \
565                 "%s " \
566                 "port:%d " \
567                 "SrcMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
568                 "SrcIP:%s " \
569                 "DstMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
570                 "DstIP:%s " \
571                 "%s " \
572                 "%d\n", \
573                 info, \
574                 port, \
575                 eth_h->s_addr.addr_bytes[0], \
576                 eth_h->s_addr.addr_bytes[1], \
577                 eth_h->s_addr.addr_bytes[2], \
578                 eth_h->s_addr.addr_bytes[3], \
579                 eth_h->s_addr.addr_bytes[4], \
580                 eth_h->s_addr.addr_bytes[5], \
581                 src_ip, \
582                 eth_h->d_addr.addr_bytes[0], \
583                 eth_h->d_addr.addr_bytes[1], \
584                 eth_h->d_addr.addr_bytes[2], \
585                 eth_h->d_addr.addr_bytes[3], \
586                 eth_h->d_addr.addr_bytes[4], \
587                 eth_h->d_addr.addr_bytes[5], \
588                 dst_ip, \
589                 arp_op, \
590                 ++burstnumber)
591 #endif
592
593 static void
594 mode6_debug(const char __attribute__((unused)) *info, struct ether_hdr *eth_h,
595                 uint8_t port, uint32_t __attribute__((unused)) *burstnumber)
596 {
597         struct ipv4_hdr *ipv4_h;
598 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
599         struct arp_hdr *arp_h;
600         char dst_ip[16];
601         char ArpOp[24];
602         char buf[16];
603 #endif
604         char src_ip[16];
605
606         uint16_t ether_type = eth_h->ether_type;
607         uint16_t offset = get_vlan_offset(eth_h, &ether_type);
608
609 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
610         snprintf(buf, 16, "%s", info);
611 #endif
612
613         if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
614                 ipv4_h = (struct ipv4_hdr *)((char *)(eth_h + 1) + offset);
615                 ipv4_addr_to_dot(ipv4_h->src_addr, src_ip, MaxIPv4String);
616 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
617                 ipv4_addr_to_dot(ipv4_h->dst_addr, dst_ip, MaxIPv4String);
618                 MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, "", port, *burstnumber);
619 #endif
620                 update_client_stats(ipv4_h->src_addr, port, burstnumber);
621         }
622 #ifdef RTE_LIBRTE_BOND_DEBUG_ALB
623         else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
624                 arp_h = (struct arp_hdr *)((char *)(eth_h + 1) + offset);
625                 ipv4_addr_to_dot(arp_h->arp_data.arp_sip, src_ip, MaxIPv4String);
626                 ipv4_addr_to_dot(arp_h->arp_data.arp_tip, dst_ip, MaxIPv4String);
627                 arp_op_name(rte_be_to_cpu_16(arp_h->arp_op), ArpOp);
628                 MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, ArpOp, port, *burstnumber);
629         }
630 #endif
631 }
632 #endif
633
634 static uint16_t
635 bond_ethdev_rx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
636 {
637         struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
638         struct bond_dev_private *internals = bd_tx_q->dev_private;
639         struct ether_hdr *eth_h;
640         uint16_t ether_type, offset;
641         uint16_t nb_recv_pkts;
642         int i;
643
644         nb_recv_pkts = bond_ethdev_rx_burst(queue, bufs, nb_pkts);
645
646         for (i = 0; i < nb_recv_pkts; i++) {
647                 eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
648                 ether_type = eth_h->ether_type;
649                 offset = get_vlan_offset(eth_h, &ether_type);
650
651                 if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
652 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
653                         mode6_debug("RX ARP:", eth_h, bufs[i]->port, &burstnumberRX);
654 #endif
655                         bond_mode_alb_arp_recv(eth_h, offset, internals);
656                 }
657 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
658                 else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4))
659                         mode6_debug("RX IPv4:", eth_h, bufs[i]->port, &burstnumberRX);
660 #endif
661         }
662
663         return nb_recv_pkts;
664 }
665
666 static uint16_t
667 bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs,
668                 uint16_t nb_pkts)
669 {
670         struct bond_dev_private *internals;
671         struct bond_tx_queue *bd_tx_q;
672
673         struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
674         uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
675
676         uint8_t num_of_slaves;
677         uint8_t slaves[RTE_MAX_ETHPORTS];
678
679         uint16_t num_tx_total = 0, num_tx_slave;
680
681         static int slave_idx = 0;
682         int i, cslave_idx = 0, tx_fail_total = 0;
683
684         bd_tx_q = (struct bond_tx_queue *)queue;
685         internals = bd_tx_q->dev_private;
686
687         /* Copy slave list to protect against slave up/down changes during tx
688          * bursting */
689         num_of_slaves = internals->active_slave_count;
690         memcpy(slaves, internals->active_slaves,
691                         sizeof(internals->active_slaves[0]) * num_of_slaves);
692
693         if (num_of_slaves < 1)
694                 return num_tx_total;
695
696         /* Populate slave mbuf arrays with the packets to be sent on each slave */
697         for (i = 0; i < nb_pkts; i++) {
698                 cslave_idx = (slave_idx + i) % num_of_slaves;
699                 slave_bufs[cslave_idx][(slave_nb_pkts[cslave_idx])++] = bufs[i];
700         }
701
702         /* increment current slave index so the next call to tx burst starts on the
703          * next slave */
704         slave_idx = ++cslave_idx;
705
706         /* Send packet burst on each slave device */
707         for (i = 0; i < num_of_slaves; i++) {
708                 if (slave_nb_pkts[i] > 0) {
709                         num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
710                                         slave_bufs[i], slave_nb_pkts[i]);
711
712                         /* if tx burst fails move packets to end of bufs */
713                         if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
714                                 int tx_fail_slave = slave_nb_pkts[i] - num_tx_slave;
715
716                                 tx_fail_total += tx_fail_slave;
717
718                                 memcpy(&bufs[nb_pkts - tx_fail_total],
719                                                 &slave_bufs[i][num_tx_slave],
720                                                 tx_fail_slave * sizeof(bufs[0]));
721                         }
722                         num_tx_total += num_tx_slave;
723                 }
724         }
725
726         return num_tx_total;
727 }
728
729 static uint16_t
730 bond_ethdev_tx_burst_active_backup(void *queue,
731                 struct rte_mbuf **bufs, uint16_t nb_pkts)
732 {
733         struct bond_dev_private *internals;
734         struct bond_tx_queue *bd_tx_q;
735
736         bd_tx_q = (struct bond_tx_queue *)queue;
737         internals = bd_tx_q->dev_private;
738
739         if (internals->active_slave_count < 1)
740                 return 0;
741
742         return rte_eth_tx_burst(internals->current_primary_port, bd_tx_q->queue_id,
743                         bufs, nb_pkts);
744 }
745
746 static inline uint16_t
747 ether_hash(struct ether_hdr *eth_hdr)
748 {
749         unaligned_uint16_t *word_src_addr =
750                 (unaligned_uint16_t *)eth_hdr->s_addr.addr_bytes;
751         unaligned_uint16_t *word_dst_addr =
752                 (unaligned_uint16_t *)eth_hdr->d_addr.addr_bytes;
753
754         return (word_src_addr[0] ^ word_dst_addr[0]) ^
755                         (word_src_addr[1] ^ word_dst_addr[1]) ^
756                         (word_src_addr[2] ^ word_dst_addr[2]);
757 }
758
759 static inline uint32_t
760 ipv4_hash(struct ipv4_hdr *ipv4_hdr)
761 {
762         return ipv4_hdr->src_addr ^ ipv4_hdr->dst_addr;
763 }
764
765 static inline uint32_t
766 ipv6_hash(struct ipv6_hdr *ipv6_hdr)
767 {
768         unaligned_uint32_t *word_src_addr =
769                 (unaligned_uint32_t *)&(ipv6_hdr->src_addr[0]);
770         unaligned_uint32_t *word_dst_addr =
771                 (unaligned_uint32_t *)&(ipv6_hdr->dst_addr[0]);
772
773         return (word_src_addr[0] ^ word_dst_addr[0]) ^
774                         (word_src_addr[1] ^ word_dst_addr[1]) ^
775                         (word_src_addr[2] ^ word_dst_addr[2]) ^
776                         (word_src_addr[3] ^ word_dst_addr[3]);
777 }
778
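/*
 * Transmit hash policies used by the BALANCE and 802.3AD modes.  Each
 * policy hashes selected header fields of the mbuf and returns
 * hash % slave_count, i.e. an index into the caller's slave array.
 * A minimal selection sketch (variable names are illustrative only):
 *
 *     uint16_t idx = internals->xmit_hash(pkt, num_of_slaves);
 *     rte_eth_tx_burst(slaves[idx], bd_tx_q->queue_id, &pkt, 1);
 */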
779 uint16_t
780 xmit_l2_hash(const struct rte_mbuf *buf, uint8_t slave_count)
781 {
782         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
783
784         uint32_t hash = ether_hash(eth_hdr);
785
786         return (hash ^= hash >> 8) % slave_count;
787 }
788
789 uint16_t
790 xmit_l23_hash(const struct rte_mbuf *buf, uint8_t slave_count)
791 {
792         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
793         uint16_t proto = eth_hdr->ether_type;
794         size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);
795         uint32_t hash, l3hash = 0;
796
797         hash = ether_hash(eth_hdr);
798
799         if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
800                 struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
801                                 ((char *)(eth_hdr + 1) + vlan_offset);
802                 l3hash = ipv4_hash(ipv4_hdr);
803
804         } else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
805                 struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
806                                 ((char *)(eth_hdr + 1) + vlan_offset);
807                 l3hash = ipv6_hash(ipv6_hdr);
808         }
809
810         hash = hash ^ l3hash;
811         hash ^= hash >> 16;
812         hash ^= hash >> 8;
813
814         return hash % slave_count;
815 }
816
817 uint16_t
818 xmit_l34_hash(const struct rte_mbuf *buf, uint8_t slave_count)
819 {
820         struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
821         uint16_t proto = eth_hdr->ether_type;
822         size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);
823
824         struct udp_hdr *udp_hdr = NULL;
825         struct tcp_hdr *tcp_hdr = NULL;
826         uint32_t hash, l3hash = 0, l4hash = 0;
827
828         if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
829                 struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
830                                 ((char *)(eth_hdr + 1) + vlan_offset);
831                 size_t ip_hdr_offset;
832
833                 l3hash = ipv4_hash(ipv4_hdr);
834
835                 /* there is no L4 header in fragmented packet */
836                 if (likely(rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr) == 0)) {
837                         ip_hdr_offset = (ipv4_hdr->version_ihl & IPV4_HDR_IHL_MASK) *
838                                         IPV4_IHL_MULTIPLIER;
839
840                         if (ipv4_hdr->next_proto_id == IPPROTO_TCP) {
841                                 tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr +
842                                                 ip_hdr_offset);
843                                 l4hash = HASH_L4_PORTS(tcp_hdr);
844                         } else if (ipv4_hdr->next_proto_id == IPPROTO_UDP) {
845                                 udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr +
846                                                 ip_hdr_offset);
847                                 l4hash = HASH_L4_PORTS(udp_hdr);
848                         }
849                 }
850         } else if  (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
851                 struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
852                                 ((char *)(eth_hdr + 1) + vlan_offset);
853                 l3hash = ipv6_hash(ipv6_hdr);
854
855                 if (ipv6_hdr->proto == IPPROTO_TCP) {
856                         tcp_hdr = (struct tcp_hdr *)(ipv6_hdr + 1);
857                         l4hash = HASH_L4_PORTS(tcp_hdr);
858                 } else if (ipv6_hdr->proto == IPPROTO_UDP) {
859                         udp_hdr = (struct udp_hdr *)(ipv6_hdr + 1);
860                         l4hash = HASH_L4_PORTS(udp_hdr);
861                 }
862         }
863
864         hash = l3hash ^ l4hash;
865         hash ^= hash >> 16;
866         hash ^= hash >> 8;
867
868         return hash % slave_count;
869 }
870
871 struct bwg_slave {
872         uint64_t bwg_left_int;
873         uint64_t bwg_left_remainder;
874         uint8_t slave;
875 };
876
877 void
878 bond_tlb_activate_slave(struct bond_dev_private *internals) {
879         int i;
880
881         for (i = 0; i < internals->active_slave_count; i++) {
882                 tlb_last_obytets[internals->active_slaves[i]] = 0;
883         }
884 }
885
886 static int
887 bandwidth_cmp(const void *a, const void *b)
888 {
889         const struct bwg_slave *bwg_a = a;
890         const struct bwg_slave *bwg_b = b;
891         int64_t diff = (int64_t)bwg_b->bwg_left_int - (int64_t)bwg_a->bwg_left_int;
892         int64_t diff2 = (int64_t)bwg_b->bwg_left_remainder -
893                         (int64_t)bwg_a->bwg_left_remainder;
894         if (diff > 0)
895                 return 1;
896         else if (diff < 0)
897                 return -1;
898         else if (diff2 > 0)
899                 return 1;
900         else if (diff2 < 0)
901                 return -1;
902         else
903                 return 0;
904 }
905
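/*
 * Estimate the bandwidth left unused on a slave during the current
 * update window, expressed as an integer part and a remainder.  The TLB
 * update callback below sorts the slaves by this value to build
 * internals->tlb_slaves_order.
 */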
906 static void
907 bandwidth_left(uint8_t port_id, uint64_t load, uint8_t update_idx,
908                 struct bwg_slave *bwg_slave)
909 {
910         struct rte_eth_link link_status;
911
912         rte_eth_link_get_nowait(port_id, &link_status);
913         uint64_t link_bwg = link_status.link_speed * 1000000ULL / 8;
914         if (link_bwg == 0)
915                 return;
916         link_bwg = link_bwg * (update_idx+1) * REORDER_PERIOD_MS;
917         bwg_slave->bwg_left_int = (link_bwg - 1000*load) / link_bwg;
918         bwg_slave->bwg_left_remainder = (link_bwg - 1000*load) % link_bwg;
919 }
920
921 static void
922 bond_ethdev_update_tlb_slave_cb(void *arg)
923 {
924         struct bond_dev_private *internals = arg;
925         struct rte_eth_stats slave_stats;
926         struct bwg_slave bwg_array[RTE_MAX_ETHPORTS];
927         uint8_t slave_count;
928         uint64_t tx_bytes;
929
930         uint8_t update_stats = 0;
931         uint8_t i, slave_id;
932
933         internals->slave_update_idx++;
934
935
936         if (internals->slave_update_idx >= REORDER_PERIOD_MS)
937                 update_stats = 1;
938
939         for (i = 0; i < internals->active_slave_count; i++) {
940                 slave_id = internals->active_slaves[i];
941                 rte_eth_stats_get(slave_id, &slave_stats);
942                 tx_bytes = slave_stats.obytes - tlb_last_obytets[slave_id];
943                 bandwidth_left(slave_id, tx_bytes,
944                                 internals->slave_update_idx, &bwg_array[i]);
945                 bwg_array[i].slave = slave_id;
946
947                 if (update_stats) {
948                         tlb_last_obytets[slave_id] = slave_stats.obytes;
949                 }
950         }
951
952         if (update_stats == 1)
953                 internals->slave_update_idx = 0;
954
955         slave_count = i;
956         qsort(bwg_array, slave_count, sizeof(bwg_array[0]), bandwidth_cmp);
957         for (i = 0; i < slave_count; i++)
958                 internals->tlb_slaves_order[i] = bwg_array[i].slave;
959
960         rte_eal_alarm_set(REORDER_PERIOD_MS * 1000, bond_ethdev_update_tlb_slave_cb,
961                         (struct bond_dev_private *)internals);
962 }
963
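/*
 * TLB transmit: try the slaves in the precomputed tlb_slaves_order and,
 * before sending, rewrite the source MAC of packets that carry the
 * primary slave's address so they leave with the transmitting slave's
 * own MAC address.
 */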
964 static uint16_t
965 bond_ethdev_tx_burst_tlb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
966 {
967         struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
968         struct bond_dev_private *internals = bd_tx_q->dev_private;
969
970         struct rte_eth_dev *primary_port =
971                         &rte_eth_devices[internals->primary_port];
972         uint16_t num_tx_total = 0;
973         uint8_t i, j;
974
975         uint8_t num_of_slaves = internals->active_slave_count;
976         uint8_t slaves[RTE_MAX_ETHPORTS];
977
978         struct ether_hdr *ether_hdr;
979         struct ether_addr primary_slave_addr;
980         struct ether_addr active_slave_addr;
981
982         if (num_of_slaves < 1)
983                 return num_tx_total;
984
985         memcpy(slaves, internals->tlb_slaves_order,
986                                 sizeof(internals->tlb_slaves_order[0]) * num_of_slaves);
987
988
989         ether_addr_copy(primary_port->data->mac_addrs, &primary_slave_addr);
990
991         if (nb_pkts > 3) {
992                 for (i = 0; i < 3; i++)
993                         rte_prefetch0(rte_pktmbuf_mtod(bufs[i], void*));
994         }
995
996         for (i = 0; i < num_of_slaves; i++) {
997                 rte_eth_macaddr_get(slaves[i], &active_slave_addr);
998                 for (j = num_tx_total; j < nb_pkts; j++) {
999                         if (j + 3 < nb_pkts)
1000                                 rte_prefetch0(rte_pktmbuf_mtod(bufs[j+3], void*));
1001
1002                         ether_hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
1003                         if (is_same_ether_addr(&ether_hdr->s_addr, &primary_slave_addr))
1004                                 ether_addr_copy(&active_slave_addr, &ether_hdr->s_addr);
1005 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
1006                                         mode6_debug("TX IPv4:", ether_hdr, slaves[i], &burstnumberTX);
1007 #endif
1008                 }
1009
1010                 num_tx_total += rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
1011                                 bufs + num_tx_total, nb_pkts - num_tx_total);
1012
1013                 if (num_tx_total == nb_pkts)
1014                         break;
1015         }
1016
1017         return num_tx_total;
1018 }
1019
1020 void
1021 bond_tlb_disable(struct bond_dev_private *internals)
1022 {
1023         rte_eal_alarm_cancel(bond_ethdev_update_tlb_slave_cb, internals);
1024 }
1025
1026 void
1027 bond_tlb_enable(struct bond_dev_private *internals)
1028 {
1029         bond_ethdev_update_tlb_slave_cb(internals);
1030 }
1031
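/*
 * Mode 6 (ALB) transmit: ARP packets are assigned to slaves by the ALB
 * client table and sent with the chosen slave's source MAC; when the
 * table is marked for update (mode6.ntt), ARP update packets are
 * generated for the known clients; all other traffic is sent with the
 * TLB policy above.
 */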
1032 static uint16_t
1033 bond_ethdev_tx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
1034 {
1035         struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
1036         struct bond_dev_private *internals = bd_tx_q->dev_private;
1037
1038         struct ether_hdr *eth_h;
1039         uint16_t ether_type, offset;
1040
1041         struct client_data *client_info;
1042
1043         /*
1044          * We create transmit buffers for every slave and one additional for TLB
1045          * traffic. In the worst case every packet will be sent on one port.
1046          */
1047         struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS + 1][nb_pkts];
1048         uint16_t slave_bufs_pkts[RTE_MAX_ETHPORTS + 1] = { 0 };
1049
1050         /*
1051          * We create separate transmit buffers for update packets as they won't
1052          * be counted in num_tx_total.
1053          */
1054         struct rte_mbuf *update_bufs[RTE_MAX_ETHPORTS][ALB_HASH_TABLE_SIZE];
1055         uint16_t update_bufs_pkts[RTE_MAX_ETHPORTS] = { 0 };
1056
1057         struct rte_mbuf *upd_pkt;
1058         size_t pkt_size;
1059
1060         uint16_t num_send, num_not_send = 0;
1061         uint16_t num_tx_total = 0;
1062         uint8_t slave_idx;
1063
1064         int i, j;
1065
1066         /* Search tx buffer for ARP packets and forward them to alb */
1067         for (i = 0; i < nb_pkts; i++) {
1068                 eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
1069                 ether_type = eth_h->ether_type;
1070                 offset = get_vlan_offset(eth_h, &ether_type);
1071
1072                 if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
1073                         slave_idx = bond_mode_alb_arp_xmit(eth_h, offset, internals);
1074
1075                         /* Change src mac in eth header */
1076                         rte_eth_macaddr_get(slave_idx, &eth_h->s_addr);
1077
1078                         /* Add packet to slave tx buffer */
1079                         slave_bufs[slave_idx][slave_bufs_pkts[slave_idx]] = bufs[i];
1080                         slave_bufs_pkts[slave_idx]++;
1081                 } else {
1082                         /* If packet is not ARP, send it with TLB policy */
1083                         slave_bufs[RTE_MAX_ETHPORTS][slave_bufs_pkts[RTE_MAX_ETHPORTS]] =
1084                                         bufs[i];
1085                         slave_bufs_pkts[RTE_MAX_ETHPORTS]++;
1086                 }
1087         }
1088
1089         /* Update connected client ARP tables */
1090         if (internals->mode6.ntt) {
1091                 for (i = 0; i < ALB_HASH_TABLE_SIZE; i++) {
1092                         client_info = &internals->mode6.client_table[i];
1093
1094                         if (client_info->in_use) {
1095                                 /* Allocate new packet to send ARP update on current slave */
1096                                 upd_pkt = rte_pktmbuf_alloc(internals->mode6.mempool);
1097                                 if (upd_pkt == NULL) {
1098                                         RTE_LOG(ERR, PMD, "Failed to allocate ARP packet from pool\n");
1099                                         continue;
1100                                 }
1101                                 pkt_size = sizeof(struct ether_hdr) + sizeof(struct arp_hdr)
1102                                                 + client_info->vlan_count * sizeof(struct vlan_hdr);
1103                                 upd_pkt->data_len = pkt_size;
1104                                 upd_pkt->pkt_len = pkt_size;
1105
1106                                 slave_idx = bond_mode_alb_arp_upd(client_info, upd_pkt,
1107                                                 internals);
1108
1109                                 /* Add packet to update tx buffer */
1110                                 update_bufs[slave_idx][update_bufs_pkts[slave_idx]] = upd_pkt;
1111                                 update_bufs_pkts[slave_idx]++;
1112                         }
1113                 }
1114                 internals->mode6.ntt = 0;
1115         }
1116
1117         /* Send ARP packets on proper slaves */
1118         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1119                 if (slave_bufs_pkts[i] > 0) {
1120                         num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id,
1121                                         slave_bufs[i], slave_bufs_pkts[i]);
1122                         for (j = 0; j < slave_bufs_pkts[i] - num_send; j++) {
1123                                 bufs[nb_pkts - 1 - num_not_send - j] =
1124                                                 slave_bufs[i][nb_pkts - 1 - j];
1125                         }
1126
1127                         num_tx_total += num_send;
1128                         num_not_send += slave_bufs_pkts[i] - num_send;
1129
1130 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
1131                         /* Print TX stats including update packets */
1132                         for (j = 0; j < slave_bufs_pkts[i]; j++) {
1133                                 eth_h = rte_pktmbuf_mtod(slave_bufs[i][j], struct ether_hdr *);
1134                                 mode6_debug("TX ARP:", eth_h, i, &burstnumberTX);
1135                         }
1136 #endif
1137                 }
1138         }
1139
1140         /* Send update packets on proper slaves */
1141         for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
1142                 if (update_bufs_pkts[i] > 0) {
1143                         num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id, update_bufs[i],
1144                                         update_bufs_pkts[i]);
1145                         for (j = num_send; j < update_bufs_pkts[i]; j++) {
1146                                 rte_pktmbuf_free(update_bufs[i][j]);
1147                         }
1148 #if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
1149                         for (j = 0; j < update_bufs_pkts[i]; j++) {
1150                                 eth_h = rte_pktmbuf_mtod(update_bufs[i][j], struct ether_hdr *);
1151                                 mode6_debug("TX ARPupd:", eth_h, i, &burstnumberTX);
1152                         }
1153 #endif
1154                 }
1155         }
1156
1157         /* Send non-ARP packets using tlb policy */
1158         if (slave_bufs_pkts[RTE_MAX_ETHPORTS] > 0) {
1159                 num_send = bond_ethdev_tx_burst_tlb(queue,
1160                                 slave_bufs[RTE_MAX_ETHPORTS],
1161                                 slave_bufs_pkts[RTE_MAX_ETHPORTS]);
1162
1163                 for (j = 0; j < slave_bufs_pkts[RTE_MAX_ETHPORTS]; j++) {
1164                         bufs[nb_pkts - 1 - num_not_send - j] =
1165                                         slave_bufs[RTE_MAX_ETHPORTS][nb_pkts - 1 - j];
1166                 }
1167
1168                 num_tx_total += num_send;
1169         }
1170
1171         return num_tx_total;
1172 }
1173
1174 static uint16_t
1175 bond_ethdev_tx_burst_balance(void *queue, struct rte_mbuf **bufs,
1176                 uint16_t nb_pkts)
1177 {
1178         struct bond_dev_private *internals;
1179         struct bond_tx_queue *bd_tx_q;
1180
1181         uint8_t num_of_slaves;
1182         uint8_t slaves[RTE_MAX_ETHPORTS];
1183
1184         uint16_t num_tx_total = 0, num_tx_slave = 0, tx_fail_total = 0;
1185
1186         int i, op_slave_id;
1187
1188         struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
1189         uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
1190
1191         bd_tx_q = (struct bond_tx_queue *)queue;
1192         internals = bd_tx_q->dev_private;
1193
1194         /* Copy slave list to protect against slave up/down changes during tx
1195          * bursting */
1196         num_of_slaves = internals->active_slave_count;
1197         memcpy(slaves, internals->active_slaves,
1198                         sizeof(internals->active_slaves[0]) * num_of_slaves);
1199
1200         if (num_of_slaves < 1)
1201                 return num_tx_total;
1202
1203         /* Populate slave mbuf arrays with the packets to be sent on each slave */
1204         for (i = 0; i < nb_pkts; i++) {
1205                 /* Select output slave using hash based on xmit policy */
1206                 op_slave_id = internals->xmit_hash(bufs[i], num_of_slaves);
1207
1208                 /* Populate slave mbuf arrays with mbufs for that slave */
1209                 slave_bufs[op_slave_id][slave_nb_pkts[op_slave_id]++] = bufs[i];
1210         }
1211
1212         /* Send packet burst on each slave device */
1213         for (i = 0; i < num_of_slaves; i++) {
1214                 if (slave_nb_pkts[i] > 0) {
1215                         num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
1216                                         slave_bufs[i], slave_nb_pkts[i]);
1217
1218                         /* if tx burst fails move packets to end of bufs */
1219                         if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
1220                                 int slave_tx_fail_count = slave_nb_pkts[i] - num_tx_slave;
1221
1222                                 tx_fail_total += slave_tx_fail_count;
1223                                 memcpy(&bufs[nb_pkts - tx_fail_total],
1224                                                 &slave_bufs[i][num_tx_slave],
1225                                                 slave_tx_fail_count * sizeof(bufs[0]));
1226                         }
1227
1228                         num_tx_total += num_tx_slave;
1229                 }
1230         }
1231
1232         return num_tx_total;
1233 }
1234
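/*
 * Mode 4 transmit: slow frames queued by the LACP state machine on each
 * slave's tx_ring are placed at the front of that slave's burst, then
 * data packets are hashed across the slaves that are currently
 * distributing.  Slow frames that fail to send are freed, while failed
 * data packets are moved back to the end of bufs.
 */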
1235 static uint16_t
1236 bond_ethdev_tx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
1237                 uint16_t nb_pkts)
1238 {
1239         struct bond_dev_private *internals;
1240         struct bond_tx_queue *bd_tx_q;
1241
1242         uint8_t num_of_slaves;
1243         uint8_t slaves[RTE_MAX_ETHPORTS];
1244          /* positions in slaves array, not port IDs */
1245         uint8_t distributing_offsets[RTE_MAX_ETHPORTS];
1246         uint8_t distributing_count;
1247
1248         uint16_t num_tx_slave, num_tx_total = 0, num_tx_fail_total = 0;
1249         uint16_t i, j, op_slave_idx;
1250         const uint16_t buffs_size = nb_pkts + BOND_MODE_8023AX_SLAVE_TX_PKTS + 1;
1251
1252         /* Allocate extra space for slow-protocol packets in 802.3ad mode. */
1253         struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][buffs_size];
1254         void *slow_pkts[BOND_MODE_8023AX_SLAVE_TX_PKTS] = { NULL };
1255
1256         /* Total amount of packets in slave_bufs */
1257         uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
1258         /* Slow packets placed in each slave */
1259         uint8_t slave_slow_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
1260
1261         bd_tx_q = (struct bond_tx_queue *)queue;
1262         internals = bd_tx_q->dev_private;
1263
1264         /* Copy slave list to protect against slave up/down changes during tx
1265          * bursting */
1266         num_of_slaves = internals->active_slave_count;
1267         if (num_of_slaves < 1)
1268                 return num_tx_total;
1269
1270         memcpy(slaves, internals->active_slaves, sizeof(slaves[0]) * num_of_slaves);
1271
1272         distributing_count = 0;
1273         for (i = 0; i < num_of_slaves; i++) {
1274                 struct port *port = &mode_8023ad_ports[slaves[i]];
1275
1276                 slave_slow_nb_pkts[i] = rte_ring_dequeue_burst(port->tx_ring,
1277                                 slow_pkts, BOND_MODE_8023AX_SLAVE_TX_PKTS,
1278                                 NULL);
1279                 slave_nb_pkts[i] = slave_slow_nb_pkts[i];
1280
1281                 for (j = 0; j < slave_slow_nb_pkts[i]; j++)
1282                         slave_bufs[i][j] = slow_pkts[j];
1283
1284                 if (ACTOR_STATE(port, DISTRIBUTING))
1285                         distributing_offsets[distributing_count++] = i;
1286         }
1287
1288         if (likely(distributing_count > 0)) {
1289                 /* Populate slave mbuf arrays with the packets to be sent on each slave */
1290                 for (i = 0; i < nb_pkts; i++) {
1291                         /* Select output slave using hash based on xmit policy */
1292                         op_slave_idx = internals->xmit_hash(bufs[i], distributing_count);
1293
1294                         /* Populate slave mbuf arrays with mbufs for that slave. Use only
1295                          * slaves that are currently distributing. */
1296                         uint8_t slave_offset = distributing_offsets[op_slave_idx];
1297                         slave_bufs[slave_offset][slave_nb_pkts[slave_offset]] = bufs[i];
1298                         slave_nb_pkts[slave_offset]++;
1299                 }
1300         }
1301
1302         /* Send packet burst on each slave device */
1303         for (i = 0; i < num_of_slaves; i++) {
1304                 if (slave_nb_pkts[i] == 0)
1305                         continue;
1306
1307                 num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
1308                                 slave_bufs[i], slave_nb_pkts[i]);
1309
1310                 /* If tx burst fails drop slow packets */
1311                 for ( ; num_tx_slave < slave_slow_nb_pkts[i]; num_tx_slave++)
1312                         rte_pktmbuf_free(slave_bufs[i][num_tx_slave]);
1313
1314                 num_tx_total += num_tx_slave - slave_slow_nb_pkts[i];
1315                 num_tx_fail_total += slave_nb_pkts[i] - num_tx_slave;
1316
1317                 /* If tx burst fails move packets to end of bufs */
1318                 if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
1319                         uint16_t j = nb_pkts - num_tx_fail_total;
1320                         for ( ; num_tx_slave < slave_nb_pkts[i]; j++, num_tx_slave++)
1321                                 bufs[j] = slave_bufs[i][num_tx_slave];
1322                 }
1323         }
1324
1325         return num_tx_total;
1326 }
1327
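/*
 * Broadcast transmit: bump each mbuf's reference count so the same burst
 * can be transmitted on every active slave; for every slave except the
 * most successful one, the packets that slave failed to send are freed
 * to release the extra references.
 */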
1328 static uint16_t
1329 bond_ethdev_tx_burst_broadcast(void *queue, struct rte_mbuf **bufs,
1330                 uint16_t nb_pkts)
1331 {
1332         struct bond_dev_private *internals;
1333         struct bond_tx_queue *bd_tx_q;
1334
1335         uint8_t tx_failed_flag = 0, num_of_slaves;
1336         uint8_t slaves[RTE_MAX_ETHPORTS];
1337
1338         uint16_t max_nb_of_tx_pkts = 0;
1339
1340         int slave_tx_total[RTE_MAX_ETHPORTS];
1341         int i, most_successful_tx_slave = -1;
1342
1343         bd_tx_q = (struct bond_tx_queue *)queue;
1344         internals = bd_tx_q->dev_private;
1345
1346         /* Copy slave list to protect against slave up/down changes during tx
1347          * bursting */
1348         num_of_slaves = internals->active_slave_count;
1349         memcpy(slaves, internals->active_slaves,
1350                         sizeof(internals->active_slaves[0]) * num_of_slaves);
1351
1352         if (num_of_slaves < 1)
1353                 return 0;
1354
1355         /* Increment reference count on mbufs */
1356         for (i = 0; i < nb_pkts; i++)
1357                 rte_mbuf_refcnt_update(bufs[i], num_of_slaves - 1);
1358
1359         /* Transmit burst on each active slave */
1360         for (i = 0; i < num_of_slaves; i++) {
1361                 slave_tx_total[i] = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
1362                                         bufs, nb_pkts);
1363
1364                 if (unlikely(slave_tx_total[i] < nb_pkts))
1365                         tx_failed_flag = 1;
1366
1367                 /* record the value and slave index for the slave which transmits the
1368                  * maximum number of packets */
1369                 if (slave_tx_total[i] > max_nb_of_tx_pkts) {
1370                         max_nb_of_tx_pkts = slave_tx_total[i];
1371                         most_successful_tx_slave = i;
1372                 }
1373         }
1374
1375         /* if slaves fail to transmit packets from burst, the calling application
1376          * is not expected to know about multiple references to packets so we must
1377          * handle failures of all packets except those of the most successful slave
1378          */
1379         if (unlikely(tx_failed_flag))
1380                 for (i = 0; i < num_of_slaves; i++)
1381                         if (i != most_successful_tx_slave)
1382                                 while (slave_tx_total[i] < nb_pkts)
1383                                         rte_pktmbuf_free(bufs[slave_tx_total[i]++]);
1384
1385         return max_nb_of_tx_pkts;
1386 }
1387
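/*
 * Record the link properties reported by a slave. In 802.3ad (mode 4) the
 * first slave's speed/duplex/autoneg are stored and every later slave must
 * match them (see link_properties_valid() below); in all other modes the
 * bonded device simply advertises the autoneg/full-duplex defaults.
 */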
1388 void
1389 link_properties_set(struct rte_eth_dev *ethdev, struct rte_eth_link *slave_link)
1390 {
1391         struct bond_dev_private *bond_ctx = ethdev->data->dev_private;
1392
1393         if (bond_ctx->mode == BONDING_MODE_8023AD) {
1394                 /**
1395                  * If in mode 4 then save the link properties of the first
1396                  * slave, all subsequent slaves must match these properties
1397                  */
1398                 struct rte_eth_link *bond_link = &bond_ctx->mode4.slave_link;
1399
1400                 bond_link->link_autoneg = slave_link->link_autoneg;
1401                 bond_link->link_duplex = slave_link->link_duplex;
1402                 bond_link->link_speed = slave_link->link_speed;
1403         } else {
1404                 /**
1405                  * In any other mode the link properties are set to the
1406                  * default values of autonegotiation and full duplex
1407                  */
1408                 ethdev->data->dev_link.link_autoneg = ETH_LINK_AUTONEG;
1409                 ethdev->data->dev_link.link_duplex = ETH_LINK_FULL_DUPLEX;
1410         }
1411 }
1412
1413 int
1414 link_properties_valid(struct rte_eth_dev *ethdev,
1415                 struct rte_eth_link *slave_link)
1416 {
1417         struct bond_dev_private *bond_ctx = ethdev->data->dev_private;
1418
1419         if (bond_ctx->mode == BONDING_MODE_8023AD) {
1420                 struct rte_eth_link *bond_link = &bond_ctx->mode4.slave_link;
1421
1422                 if (bond_link->link_duplex != slave_link->link_duplex ||
1423                         bond_link->link_autoneg != slave_link->link_autoneg ||
1424                         bond_link->link_speed != slave_link->link_speed)
1425                         return -1;
1426         }
1427
1428         return 0;
1429 }
1430
1431 int
1432 mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr)
1433 {
1434         struct ether_addr *mac_addr;
1435
1436         if (eth_dev == NULL) {
1437                 RTE_LOG(ERR, PMD, "%s: NULL pointer eth_dev specified\n", __func__);
1438                 return -1;
1439         }
1440
1441         if (dst_mac_addr == NULL) {
1442                 RTE_LOG(ERR, PMD, "%s: NULL pointer MAC specified\n", __func__);
1443                 return -1;
1444         }
1445
1446         mac_addr = eth_dev->data->mac_addrs;
1447
1448         ether_addr_copy(mac_addr, dst_mac_addr);
1449         return 0;
1450 }
1451
1452 int
1453 mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr)
1454 {
1455         struct ether_addr *mac_addr;
1456
1457         if (eth_dev == NULL) {
1458                 RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified");
1459                 return -1;
1460         }
1461
1462         if (new_mac_addr == NULL) {
1463                 RTE_BOND_LOG(ERR, "NULL pointer MAC specified");
1464                 return -1;
1465         }
1466
1467         mac_addr = eth_dev->data->mac_addrs;
1468
1469         /* If new MAC is different to current MAC then update */
1470         if (memcmp(mac_addr, new_mac_addr, sizeof(*mac_addr)) != 0)
1471                 memcpy(mac_addr, new_mac_addr, sizeof(*mac_addr));
1472
1473         return 0;
1474 }
1475
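/*
 * Push MAC addresses down to the slaves according to the bonding mode:
 * round-robin/balance/broadcast give every slave the bonded MAC, mode 4
 * delegates to the 802.3ad state machine, and active-backup/TLB/ALB set the
 * bonded MAC only on the current primary while the other slaves keep the
 * MAC they had when they were added (persisted_mac_addr).
 */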
1476 int
1477 mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev)
1478 {
1479         struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1480         int i;
1481
1482         /* Update slave devices MAC addresses */
1483         if (internals->slave_count < 1)
1484                 return -1;
1485
1486         switch (internals->mode) {
1487         case BONDING_MODE_ROUND_ROBIN:
1488         case BONDING_MODE_BALANCE:
1489         case BONDING_MODE_BROADCAST:
1490                 for (i = 0; i < internals->slave_count; i++) {
1491                         if (mac_address_set(&rte_eth_devices[internals->slaves[i].port_id],
1492                                         bonded_eth_dev->data->mac_addrs)) {
1493                                 RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1494                                                 internals->slaves[i].port_id);
1495                                 return -1;
1496                         }
1497                 }
1498                 break;
1499         case BONDING_MODE_8023AD:
1500                 bond_mode_8023ad_mac_address_update(bonded_eth_dev);
1501                 break;
1502         case BONDING_MODE_ACTIVE_BACKUP:
1503         case BONDING_MODE_TLB:
1504         case BONDING_MODE_ALB:
1505         default:
1506                 for (i = 0; i < internals->slave_count; i++) {
1507                         if (internals->slaves[i].port_id ==
1508                                         internals->current_primary_port) {
1509                                 if (mac_address_set(&rte_eth_devices[internals->primary_port],
1510                                                 bonded_eth_dev->data->mac_addrs)) {
1511                                         RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1512                                                         internals->current_primary_port);
1513                                         return -1;
1514                                 }
1515                         } else {
1516                                 if (mac_address_set(
1517                                                 &rte_eth_devices[internals->slaves[i].port_id],
1518                                                 &internals->slaves[i].persisted_mac_addr)) {
1519                                         RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
1520                                                         internals->slaves[i].port_id);
1521                                         return -1;
1522                                 }
1523                         }
1524                 }
1525         }
1526
1527         return 0;
1528 }
1529
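/*
 * Select the RX/TX burst handlers that implement the requested bonding mode.
 * Applications normally reach this through the public API rather than
 * directly, e.g. (illustrative sketch):
 *
 *     rte_eth_bond_mode_set(bond_port_id, BONDING_MODE_8023AD);
 *
 * Modes 4 (802.3ad) and 6 (ALB) also need their per-mode state enabled
 * before their burst functions can be used.
 */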
1530 int
1531 bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode)
1532 {
1533         struct bond_dev_private *internals;
1534
1535         internals = eth_dev->data->dev_private;
1536
1537         switch (mode) {
1538         case BONDING_MODE_ROUND_ROBIN:
1539                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_round_robin;
1540                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1541                 break;
1542         case BONDING_MODE_ACTIVE_BACKUP:
1543                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_active_backup;
1544                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
1545                 break;
1546         case BONDING_MODE_BALANCE:
1547                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_balance;
1548                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1549                 break;
1550         case BONDING_MODE_BROADCAST:
1551                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_broadcast;
1552                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
1553                 break;
1554         case BONDING_MODE_8023AD:
1555                 if (bond_mode_8023ad_enable(eth_dev) != 0)
1556                         return -1;
1557
1558                 if (internals->mode4.dedicated_queues.enabled == 0) {
1559                         eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_8023ad;
1560                         eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_8023ad;
1561                         RTE_LOG(WARNING, PMD,
1562                                 "Using mode 4, it is necessary to do TX burst "
1563                                 "and RX burst at least every 100ms.\n");
1564                 } else {
1565                         /* Use flow director's optimization */
1566                         eth_dev->rx_pkt_burst =
1567                                         bond_ethdev_rx_burst_8023ad_fast_queue;
1568                         eth_dev->tx_pkt_burst =
1569                                         bond_ethdev_tx_burst_8023ad_fast_queue;
1570                 }
1571                 break;
1572         case BONDING_MODE_TLB:
1573                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_tlb;
1574                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
1575                 break;
1576         case BONDING_MODE_ALB:
1577                 if (bond_mode_alb_enable(eth_dev) != 0)
1578                         return -1;
1579
1580                 eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_alb;
1581                 eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_alb;
1582                 break;
1583         default:
1584                 return -1;
1585         }
1586
1587         internals->mode = mode;
1588
1589         return 0;
1590 }
1591
1592
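/*
 * Prepare a slave for LACP "slow" (control) traffic: lazily create a
 * per-slave mempool for LACPDUs and, when dedicated control queues are
 * enabled, set up the extra RX/TX queue pair reserved for them on the slave.
 */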
1593 static int
1594 slave_configure_slow_queue(struct rte_eth_dev *bonded_eth_dev,
1595                 struct rte_eth_dev *slave_eth_dev)
1596 {
1597         int errval = 0;
1598         struct bond_dev_private *internals = (struct bond_dev_private *)
1599                 bonded_eth_dev->data->dev_private;
1600         struct port *port = &mode_8023ad_ports[slave_eth_dev->data->port_id];
1601
1602         if (port->slow_pool == NULL) {
1603                 char mem_name[256];
1604                 int slave_id = slave_eth_dev->data->port_id;
1605
1606                 snprintf(mem_name, RTE_DIM(mem_name), "slave_port%u_slow_pool",
1607                                 slave_id);
1608                 port->slow_pool = rte_pktmbuf_pool_create(mem_name, 8191,
1609                         250, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
1610                         slave_eth_dev->data->numa_node);
1611
1612                 /* Any memory allocation failure in initialization is critical because
1613                  * resources can't be freed, so reinitialization is impossible. */
1614                 if (port->slow_pool == NULL) {
1615                         rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
1616                                 slave_id, mem_name, rte_strerror(rte_errno));
1617                 }
1618         }
1619
1620         if (internals->mode4.dedicated_queues.enabled == 1) {
1621                 /* Configure slow Rx queue */
1622
1623                 errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id,
1624                                 internals->mode4.dedicated_queues.rx_qid, 128,
1625                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1626                                 NULL, port->slow_pool);
1627                 if (errval != 0) {
1628                         RTE_BOND_LOG(ERR,
1629                                         "rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)",
1630                                         slave_eth_dev->data->port_id,
1631                                         internals->mode4.dedicated_queues.rx_qid,
1632                                         errval);
1633                         return errval;
1634                 }
1635
1636                 errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id,
1637                                 internals->mode4.dedicated_queues.tx_qid, 512,
1638                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1639                                 NULL);
1640                 if (errval != 0) {
1641                         RTE_BOND_LOG(ERR,
1642                                 "rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)",
1643                                 slave_eth_dev->data->port_id,
1644                                 internals->mode4.dedicated_queues.tx_qid,
1645                                 errval);
1646                         return errval;
1647                 }
1648         }
1649         return 0;
1650 }
1651
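/*
 * (Re)configure a slave so it mirrors the bonded device: stop it, copy the
 * RSS and VLAN filtering configuration, create the same number of RX/TX
 * queues (plus the dedicated 802.3ad control queues when enabled), restart
 * it, resynchronize the RETA table and finally refresh its link status.
 */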
1652 int
1653 slave_configure(struct rte_eth_dev *bonded_eth_dev,
1654                 struct rte_eth_dev *slave_eth_dev)
1655 {
1656         struct bond_rx_queue *bd_rx_q;
1657         struct bond_tx_queue *bd_tx_q;
1658         uint16_t nb_rx_queues;
1659         uint16_t nb_tx_queues;
1660
1661         int errval;
1662         uint16_t q_id;
1663         struct rte_flow_error flow_error;
1664
1665         struct bond_dev_private *internals = (struct bond_dev_private *)
1666                 bonded_eth_dev->data->dev_private;
1667
1668         /* Stop slave */
1669         rte_eth_dev_stop(slave_eth_dev->data->port_id);
1670
1671         /* Enable interrupts on slave device if supported */
1672         if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1673                 slave_eth_dev->data->dev_conf.intr_conf.lsc = 1;
1674
1675         /* If RSS is enabled for bonding, try to enable it for slaves  */
1676         if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) {
1677                 if (bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len
1678                                 != 0) {
1679                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len =
1680                                         bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len;
1681                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key =
1682                                         bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1683                 } else {
1684                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = NULL;
1685                 }
1686
1687                 slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf =
1688                                 bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1689                 slave_eth_dev->data->dev_conf.rxmode.mq_mode =
1690                                 bonded_eth_dev->data->dev_conf.rxmode.mq_mode;
1691         }
1692
1693         slave_eth_dev->data->dev_conf.rxmode.hw_vlan_filter =
1694                         bonded_eth_dev->data->dev_conf.rxmode.hw_vlan_filter;
1695
1696         nb_rx_queues = bonded_eth_dev->data->nb_rx_queues;
1697         nb_tx_queues = bonded_eth_dev->data->nb_tx_queues;
1698
1699         if (internals->mode == BONDING_MODE_8023AD) {
1700                 if (internals->mode4.dedicated_queues.enabled == 1) {
1701                         nb_rx_queues++;
1702                         nb_tx_queues++;
1703                 }
1704         }
1705
1706         /* Configure device */
1707         errval = rte_eth_dev_configure(slave_eth_dev->data->port_id,
1708                         nb_rx_queues, nb_tx_queues,
1709                         &(slave_eth_dev->data->dev_conf));
1710         if (errval != 0) {
1711                 RTE_BOND_LOG(ERR, "Cannot configure slave device: port %u , err (%d)",
1712                                 slave_eth_dev->data->port_id, errval);
1713                 return errval;
1714         }
1715
1716         /* Setup Rx Queues */
1717         for (q_id = 0; q_id < bonded_eth_dev->data->nb_rx_queues; q_id++) {
1718                 bd_rx_q = (struct bond_rx_queue *)bonded_eth_dev->data->rx_queues[q_id];
1719
1720                 errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id, q_id,
1721                                 bd_rx_q->nb_rx_desc,
1722                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1723                                 &(bd_rx_q->rx_conf), bd_rx_q->mb_pool);
1724                 if (errval != 0) {
1725                         RTE_BOND_LOG(ERR,
1726                                         "rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)",
1727                                         slave_eth_dev->data->port_id, q_id, errval);
1728                         return errval;
1729                 }
1730         }
1731
1732         /* Setup Tx Queues */
1733         for (q_id = 0; q_id < bonded_eth_dev->data->nb_tx_queues; q_id++) {
1734                 bd_tx_q = (struct bond_tx_queue *)bonded_eth_dev->data->tx_queues[q_id];
1735
1736                 errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id, q_id,
1737                                 bd_tx_q->nb_tx_desc,
1738                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1739                                 &bd_tx_q->tx_conf);
1740                 if (errval != 0) {
1741                         RTE_BOND_LOG(ERR,
1742                                 "rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)",
1743                                 slave_eth_dev->data->port_id, q_id, errval);
1744                         return errval;
1745                 }
1746         }
1747
1748         if (internals->mode == BONDING_MODE_8023AD &&
1749                         internals->mode4.dedicated_queues.enabled == 1) {
1750                 errval = slave_configure_slow_queue(bonded_eth_dev, slave_eth_dev);
1751                 if (errval != 0)
1752                         return errval;
1753
1754                 errval = bond_ethdev_8023ad_flow_verify(bonded_eth_dev,
1755                                 slave_eth_dev->data->port_id);
1756                 if (errval != 0) {
1757                         RTE_BOND_LOG(ERR, "8023ad flow verify: port=%d, err (%d)",
1758                                 slave_eth_dev->data->port_id, errval);
1759                         return -1;
1760                 }
1761
1762                 if (internals->mode4.dedicated_queues.flow[slave_eth_dev->data->port_id] != NULL)
1763                         rte_flow_destroy(slave_eth_dev->data->port_id,
1764                                         internals->mode4.dedicated_queues.flow[slave_eth_dev->data->port_id],
1765                                         &flow_error);
1766
1767                 bond_ethdev_8023ad_flow_set(bonded_eth_dev,
1768                                 slave_eth_dev->data->port_id);
1769         }
1770
1771         /* Start device */
1772         errval = rte_eth_dev_start(slave_eth_dev->data->port_id);
1773         if (errval != 0) {
1774                 RTE_BOND_LOG(ERR, "rte_eth_dev_start: port=%u, err (%d)",
1775                                 slave_eth_dev->data->port_id, errval);
1776                 return -1;
1777         }
1778
1779         /* If RSS is enabled for bonding, synchronize RETA */
1780         if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
1781                 int i;
1782                 struct bond_dev_private *internals;
1783
1784                 internals = bonded_eth_dev->data->dev_private;
1785
1786                 for (i = 0; i < internals->slave_count; i++) {
1787                         if (internals->slaves[i].port_id == slave_eth_dev->data->port_id) {
1788                                 errval = rte_eth_dev_rss_reta_update(
1789                                                 slave_eth_dev->data->port_id,
1790                                                 &internals->reta_conf[0],
1791                                                 internals->slaves[i].reta_size);
1792                                 if (errval != 0) {
1793                                         RTE_LOG(WARNING, PMD,
1794                                                         "rte_eth_dev_rss_reta_update on slave port %d fails (err %d)."
1795                                                         " RSS Configuration for bonding may be inconsistent.\n",
1796                                                         slave_eth_dev->data->port_id, errval);
1797                                 }
1798                                 break;
1799                         }
1800                 }
1801         }
1802
1803         /* If lsc interrupt is set, check initial slave's link status */
1804         if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) {
1805                 slave_eth_dev->dev_ops->link_update(slave_eth_dev, 0);
1806                 bond_ethdev_lsc_event_callback(slave_eth_dev->data->port_id,
1807                         RTE_ETH_EVENT_INTR_LSC, &bonded_eth_dev->data->port_id,
1808                         NULL);
1809         }
1810
1811         return 0;
1812 }
1813
1814 void
1815 slave_remove(struct bond_dev_private *internals,
1816                 struct rte_eth_dev *slave_eth_dev)
1817 {
1818         uint8_t i;
1819
1820         for (i = 0; i < internals->slave_count; i++)
1821                 if (internals->slaves[i].port_id ==
1822                                 slave_eth_dev->data->port_id)
1823                         break;
1824
1825         if (i < (internals->slave_count - 1))
1826                 memmove(&internals->slaves[i], &internals->slaves[i + 1],
1827                                 sizeof(internals->slaves[0]) *
1828                                 (internals->slave_count - i - 1));
1829
1830         internals->slave_count--;
1831
1832         /* force reconfiguration of slave interfaces */
1833         _rte_eth_dev_reset(slave_eth_dev);
1834 }
1835
1836 static void
1837 bond_ethdev_slave_link_status_change_monitor(void *cb_arg);
1838
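/*
 * Record a newly added slave: remember its original MAC address so it can be
 * restored later, and flag slaves without LSC interrupt support so the bond
 * falls back to polling their link status (see the monitor defined below).
 */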
1839 void
1840 slave_add(struct bond_dev_private *internals,
1841                 struct rte_eth_dev *slave_eth_dev)
1842 {
1843         struct bond_slave_details *slave_details =
1844                         &internals->slaves[internals->slave_count];
1845
1846         slave_details->port_id = slave_eth_dev->data->port_id;
1847         slave_details->last_link_status = 0;
1848
1849         /* Mark slave devices that don't support interrupts so we can
1850          * compensate when we start the bond
1851          */
1852         if (!(slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) {
1853                 slave_details->link_status_poll_enabled = 1;
1854         }
1855
1856         slave_details->link_status_wait_to_complete = 0;
1857         /* clean tlb_last_obytes when adding port for bonding device */
1858         /* Remember the slave's original MAC so it can be restored later */
1859                         sizeof(struct ether_addr));
1860 }
1861
1862 void
1863 bond_ethdev_primary_set(struct bond_dev_private *internals,
1864                 uint8_t slave_port_id)
1865 {
1866         int i;
1867
1868         if (internals->active_slave_count < 1)
1869                 internals->current_primary_port = slave_port_id;
1870         else
1871                 /* Search bonded device slave ports for new proposed primary port */
1872                 for (i = 0; i < internals->active_slave_count; i++) {
1873                         if (internals->active_slaves[i] == slave_port_id)
1874                                 internals->current_primary_port = slave_port_id;
1875                 }
1876 }
1877
1878 static void
1879 bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev);
1880
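/*
 * Start the bonded device: pick up the primary slave's MAC if the user did
 * not set one, propagate MAC and promiscuous settings, reconfigure and start
 * every slave, and arm the link-status polling alarm when at least one slave
 * cannot deliver LSC interrupts. Mode-specific machinery (802.3ad state
 * machines, TLB/ALB callbacks) is started last.
 */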
1881 static int
1882 bond_ethdev_start(struct rte_eth_dev *eth_dev)
1883 {
1884         struct bond_dev_private *internals;
1885         int i;
1886
1887         /* slave eth dev will be started by bonded device */
1888         if (check_for_bonded_ethdev(eth_dev)) {
1889                 RTE_BOND_LOG(ERR, "User tried to explicitly start a slave eth_dev (%d)",
1890                                 eth_dev->data->port_id);
1891                 return -1;
1892         }
1893
1894         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1895         eth_dev->data->dev_started = 1;
1896
1897         internals = eth_dev->data->dev_private;
1898
1899         if (internals->slave_count == 0) {
1900                 RTE_BOND_LOG(ERR, "Cannot start port since there are no slave devices");
1901                 return -1;
1902         }
1903
1904         if (internals->user_defined_mac == 0) {
1905                 struct ether_addr *new_mac_addr = NULL;
1906
1907                 for (i = 0; i < internals->slave_count; i++)
1908                         if (internals->slaves[i].port_id == internals->primary_port)
1909                                 new_mac_addr = &internals->slaves[i].persisted_mac_addr;
1910
1911                 if (new_mac_addr == NULL)
1912                         return -1;
1913
1914                 if (mac_address_set(eth_dev, new_mac_addr) != 0) {
1915                         RTE_BOND_LOG(ERR, "bonded port (%d) failed to update MAC address",
1916                                         eth_dev->data->port_id);
1917                         return -1;
1918                 }
1919         }
1920
1921         /* Update all slave devices MACs*/
1922         if (mac_address_slaves_update(eth_dev) != 0)
1923                 return -1;
1924
1925         /* If bonded device is configured in promiscuous mode then re-apply config */
1926         if (internals->promiscuous_en)
1927                 bond_ethdev_promiscuous_enable(eth_dev);
1928
1929         if (internals->mode == BONDING_MODE_8023AD) {
1930                 if (internals->mode4.dedicated_queues.enabled == 1) {
1931                         internals->mode4.dedicated_queues.rx_qid =
1932                                         eth_dev->data->nb_rx_queues;
1933                         internals->mode4.dedicated_queues.tx_qid =
1934                                         eth_dev->data->nb_tx_queues;
1935                 }
1936         }
1937
1938
1939         /* Reconfigure each slave device if starting bonded device */
1940         for (i = 0; i < internals->slave_count; i++) {
1941                 struct rte_eth_dev *slave_ethdev =
1942                                 &(rte_eth_devices[internals->slaves[i].port_id]);
1943                 if (slave_configure(eth_dev, slave_ethdev) != 0) {
1944                         RTE_BOND_LOG(ERR,
1945                                 "bonded port (%d) failed to reconfigure slave device (%d)",
1946                                 eth_dev->data->port_id,
1947                                 internals->slaves[i].port_id);
1948                         return -1;
1949                 }
1950                 /* We will need to poll for link status if any slave doesn't
1951                  * support interrupts
1952                  */
1953                 if (internals->slaves[i].link_status_poll_enabled)
1954                         internals->link_status_polling_enabled = 1;
1955         }
1956         /* start polling if needed */
1957         if (internals->link_status_polling_enabled) {
1958                 rte_eal_alarm_set(
1959                         internals->link_status_polling_interval_ms * 1000,
1960                         bond_ethdev_slave_link_status_change_monitor,
1961                         (void *)&rte_eth_devices[internals->port_id]);
1962         }
1963
1964         if (internals->user_defined_primary_port)
1965                 bond_ethdev_primary_set(internals, internals->primary_port);
1966
1967         if (internals->mode == BONDING_MODE_8023AD)
1968                 bond_mode_8023ad_start(eth_dev);
1969
1970         if (internals->mode == BONDING_MODE_TLB ||
1971                         internals->mode == BONDING_MODE_ALB)
1972                 bond_tlb_enable(internals);
1973
1974         return 0;
1975 }
1976
1977 static void
1978 bond_ethdev_free_queues(struct rte_eth_dev *dev)
1979 {
1980         uint8_t i;
1981
1982         if (dev->data->rx_queues != NULL) {
1983                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1984                         rte_free(dev->data->rx_queues[i]);
1985                         dev->data->rx_queues[i] = NULL;
1986                 }
1987                 dev->data->nb_rx_queues = 0;
1988         }
1989
1990         if (dev->data->tx_queues != NULL) {
1991                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
1992                         rte_free(dev->data->tx_queues[i]);
1993                         dev->data->tx_queues[i] = NULL;
1994                 }
1995                 dev->data->nb_tx_queues = 0;
1996         }
1997 }
1998
1999 void
2000 bond_ethdev_stop(struct rte_eth_dev *eth_dev)
2001 {
2002         struct bond_dev_private *internals = eth_dev->data->dev_private;
2003         uint8_t i;
2004
2005         if (internals->mode == BONDING_MODE_8023AD) {
2006                 struct port *port;
2007                 void *pkt = NULL;
2008
2009                 bond_mode_8023ad_stop(eth_dev);
2010
2011                 /* Discard all messages to/from mode 4 state machines */
2012                 for (i = 0; i < internals->active_slave_count; i++) {
2013                         port = &mode_8023ad_ports[internals->active_slaves[i]];
2014
2015                         RTE_ASSERT(port->rx_ring != NULL);
2016                         while (rte_ring_dequeue(port->rx_ring, &pkt) != -ENOENT)
2017                                 rte_pktmbuf_free(pkt);
2018
2019                         RTE_ASSERT(port->tx_ring != NULL);
2020                         while (rte_ring_dequeue(port->tx_ring, &pkt) != -ENOENT)
2021                                 rte_pktmbuf_free(pkt);
2022                 }
2023         }
2024
2025         if (internals->mode == BONDING_MODE_TLB ||
2026                         internals->mode == BONDING_MODE_ALB) {
2027                 bond_tlb_disable(internals);
2028                 for (i = 0; i < internals->active_slave_count; i++)
2029                         tlb_last_obytets[internals->active_slaves[i]] = 0;
2030         }
2031
2032         internals->active_slave_count = 0;
2033         internals->link_status_polling_enabled = 0;
2034         for (i = 0; i < internals->slave_count; i++)
2035                 internals->slaves[i].last_link_status = 0;
2036
2037         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
2038         eth_dev->data->dev_started = 0;
2039 }
2040
2041 void
2042 bond_ethdev_close(struct rte_eth_dev *dev)
2043 {
2044         struct bond_dev_private *internals = dev->data->dev_private;
2045         uint8_t bond_port_id = internals->port_id;
2046         int skipped = 0;
2047
2048         RTE_LOG(INFO, EAL, "Closing bonded device %s\n", dev->device->name);
2049         while (internals->slave_count != skipped) {
2050                 uint8_t port_id = internals->slaves[skipped].port_id;
2051
2052                 rte_eth_dev_stop(port_id);
2053
2054                 if (rte_eth_bond_slave_remove(bond_port_id, port_id) != 0) {
2055                         RTE_LOG(ERR, EAL,
2056                                 "Failed to remove port %d from bonded device "
2057                                 "%s\n", port_id, dev->device->name);
2058                         skipped++;
2059                 }
2060         }
2061         bond_ethdev_free_queues(dev);
2062         rte_bitmap_reset(internals->vlan_filter_bmp);
2063 }
2064
2065 /* forward declaration */
2066 static int bond_ethdev_configure(struct rte_eth_dev *dev);
2067
2068 static void
2069 bond_ethdev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
2070 {
2071         struct bond_dev_private *internals = dev->data->dev_private;
2072
2073         uint16_t max_nb_rx_queues = UINT16_MAX;
2074         uint16_t max_nb_tx_queues = UINT16_MAX;
2075
2076         dev_info->max_mac_addrs = 1;
2077
2078         dev_info->max_rx_pktlen = internals->candidate_max_rx_pktlen ?
2079                         internals->candidate_max_rx_pktlen :
2080                         ETHER_MAX_JUMBO_FRAME_LEN;
2081
2082         /* Max number of tx/rx queues that the bonded device can support is the
2083          * minimum of the values reported by the bonded slaves, as all slaves
2084          * must be capable of supporting the same number of tx/rx queues.
2085          */
2086         if (internals->slave_count > 0) {
2087                 struct rte_eth_dev_info slave_info;
2088                 uint8_t idx;
2089
2090                 for (idx = 0; idx < internals->slave_count; idx++) {
2091                         rte_eth_dev_info_get(internals->slaves[idx].port_id,
2092                                         &slave_info);
2093
2094                         if (slave_info.max_rx_queues < max_nb_rx_queues)
2095                                 max_nb_rx_queues = slave_info.max_rx_queues;
2096
2097                         if (slave_info.max_tx_queues < max_nb_tx_queues)
2098                                 max_nb_tx_queues = slave_info.max_tx_queues;
2099                 }
2100         }
2101
2102         dev_info->max_rx_queues = max_nb_rx_queues;
2103         dev_info->max_tx_queues = max_nb_tx_queues;
2104
2105         /**
2106          * If dedicated hw queues are enabled for the bonded device in LACP
2107          * mode then we need to reduce the maximum number of data path queues by 1.
2108          */
2109         if (internals->mode == BONDING_MODE_8023AD &&
2110                 internals->mode4.dedicated_queues.enabled == 1) {
2111                 dev_info->max_rx_queues--;
2112                 dev_info->max_tx_queues--;
2113         }
2114
2115         dev_info->min_rx_bufsize = 0;
2116
2117         dev_info->rx_offload_capa = internals->rx_offload_capa;
2118         dev_info->tx_offload_capa = internals->tx_offload_capa;
2119         dev_info->flow_type_rss_offloads = internals->flow_type_rss_offloads;
2120
2121         dev_info->reta_size = internals->reta_size;
2122 }
2123
2124 static int
2125 bond_ethdev_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
2126 {
2127         int res;
2128         uint8_t i;
2129         struct bond_dev_private *internals = dev->data->dev_private;
2130
2131         /* don't do this while a slave is being added */
2132         rte_spinlock_lock(&internals->lock);
2133
2134         if (on)
2135                 rte_bitmap_set(internals->vlan_filter_bmp, vlan_id);
2136         else
2137                 rte_bitmap_clear(internals->vlan_filter_bmp, vlan_id);
2138
2139         for (i = 0; i < internals->slave_count; i++) {
2140                 uint8_t port_id = internals->slaves[i].port_id;
2141
2142                 res = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
2143                 if (res == -ENOTSUP)
2144                         RTE_LOG(WARNING, PMD,
2145                                 "Setting VLAN filter on slave port %u not supported.\n",
2146                                 port_id);
2147         }
2148
2149         rte_spinlock_unlock(&internals->lock);
2150         return 0;
2151 }
2152
2153 static int
2154 bond_ethdev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
2155                 uint16_t nb_rx_desc, unsigned int socket_id __rte_unused,
2156                 const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mb_pool)
2157 {
2158         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)
2159                         rte_zmalloc_socket(NULL, sizeof(struct bond_rx_queue),
2160                                         0, dev->data->numa_node);
2161         if (bd_rx_q == NULL)
2162                 return -1;
2163
2164         bd_rx_q->queue_id = rx_queue_id;
2165         bd_rx_q->dev_private = dev->data->dev_private;
2166
2167         bd_rx_q->nb_rx_desc = nb_rx_desc;
2168
2169         memcpy(&(bd_rx_q->rx_conf), rx_conf, sizeof(struct rte_eth_rxconf));
2170         bd_rx_q->mb_pool = mb_pool;
2171
2172         dev->data->rx_queues[rx_queue_id] = bd_rx_q;
2173
2174         return 0;
2175 }
2176
2177 static int
2178 bond_ethdev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
2179                 uint16_t nb_tx_desc, unsigned int socket_id __rte_unused,
2180                 const struct rte_eth_txconf *tx_conf)
2181 {
2182         struct bond_tx_queue *bd_tx_q  = (struct bond_tx_queue *)
2183                         rte_zmalloc_socket(NULL, sizeof(struct bond_tx_queue),
2184                                         0, dev->data->numa_node);
2185
2186         if (bd_tx_q == NULL)
2187                 return -1;
2188
2189         bd_tx_q->queue_id = tx_queue_id;
2190         bd_tx_q->dev_private = dev->data->dev_private;
2191
2192         bd_tx_q->nb_tx_desc = nb_tx_desc;
2193         memcpy(&(bd_tx_q->tx_conf), tx_conf, sizeof(bd_tx_q->tx_conf));
2194
2195         dev->data->tx_queues[tx_queue_id] = bd_tx_q;
2196
2197         return 0;
2198 }
2199
2200 static void
2201 bond_ethdev_rx_queue_release(void *queue)
2202 {
2203         if (queue == NULL)
2204                 return;
2205
2206         rte_free(queue);
2207 }
2208
2209 static void
2210 bond_ethdev_tx_queue_release(void *queue)
2211 {
2212         if (queue == NULL)
2213                 return;
2214
2215         rte_free(queue);
2216 }
2217
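/*
 * Alarm callback that polls the link status of slaves which do not support
 * LSC interrupts. Any status change is fed through the same LSC callback as
 * interrupt-capable slaves, and the alarm re-arms itself while polling is
 * still required.
 */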
2218 static void
2219 bond_ethdev_slave_link_status_change_monitor(void *cb_arg)
2220 {
2221         struct rte_eth_dev *bonded_ethdev, *slave_ethdev;
2222         struct bond_dev_private *internals;
2223
2224         /* Default value for polling slave found is true as we don't want to
2225          * disable the polling thread if we cannot get the lock */
2226         int i, polling_slave_found = 1;
2227
2228         if (cb_arg == NULL)
2229                 return;
2230
2231         bonded_ethdev = (struct rte_eth_dev *)cb_arg;
2232         internals = (struct bond_dev_private *)bonded_ethdev->data->dev_private;
2233
2234         if (!bonded_ethdev->data->dev_started ||
2235                 !internals->link_status_polling_enabled)
2236                 return;
2237
2238         /* If device is currently being configured then don't check slaves link
2239          * status, wait until next period */
2240         if (rte_spinlock_trylock(&internals->lock)) {
2241                 if (internals->slave_count > 0)
2242                         polling_slave_found = 0;
2243
2244                 for (i = 0; i < internals->slave_count; i++) {
2245                         if (!internals->slaves[i].link_status_poll_enabled)
2246                                 continue;
2247
2248                         slave_ethdev = &rte_eth_devices[internals->slaves[i].port_id];
2249                         polling_slave_found = 1;
2250
2251                         /* Update slave link status */
2252                         (*slave_ethdev->dev_ops->link_update)(slave_ethdev,
2253                                         internals->slaves[i].link_status_wait_to_complete);
2254
2255                         /* if link status has changed since last checked then call lsc
2256                          * event callback */
2257                         if (slave_ethdev->data->dev_link.link_status !=
2258                                         internals->slaves[i].last_link_status) {
2259                                 internals->slaves[i].last_link_status =
2260                                                 slave_ethdev->data->dev_link.link_status;
2261
2262                                 bond_ethdev_lsc_event_callback(internals->slaves[i].port_id,
2263                                                 RTE_ETH_EVENT_INTR_LSC,
2264                                                 &bonded_ethdev->data->port_id,
2265                                                 NULL);
2266                         }
2267                 }
2268                 rte_spinlock_unlock(&internals->lock);
2269         }
2270
2271         if (polling_slave_found)
2272                 /* Set alarm to continue monitoring link status of slave ethdev's */
2273                 rte_eal_alarm_set(internals->link_status_polling_interval_ms * 1000,
2274                                 bond_ethdev_slave_link_status_change_monitor, cb_arg);
2275 }
2276
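/*
 * Report the bonded device link. Speed is aggregated per mode: broadcast
 * advertises the slowest active slave (every slave must carry every packet),
 * active-backup advertises the current primary, and the remaining modes
 * advertise the sum of all active slaves as the theoretical maximum. Mode 4
 * additionally copies the duplex/autoneg properties pinned by
 * link_properties_set().
 */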
2277 static int
2278 bond_ethdev_link_update(struct rte_eth_dev *ethdev, int wait_to_complete)
2279 {
2280         void (*link_update)(uint8_t port_id, struct rte_eth_link *eth_link);
2281
2282         struct bond_dev_private *bond_ctx;
2283         struct rte_eth_link slave_link;
2284
2285         uint32_t idx;
2286
2287         bond_ctx = ethdev->data->dev_private;
2288
2289         ethdev->data->dev_link.link_speed = ETH_SPEED_NUM_NONE;
2290
2291         if (ethdev->data->dev_started == 0 ||
2292                         bond_ctx->active_slave_count == 0) {
2293                 ethdev->data->dev_link.link_status = ETH_LINK_DOWN;
2294                 return 0;
2295         }
2296
2297         ethdev->data->dev_link.link_status = ETH_LINK_UP;
2298
2299         if (wait_to_complete)
2300                 link_update = rte_eth_link_get;
2301         else
2302                 link_update = rte_eth_link_get_nowait;
2303
2304         switch (bond_ctx->mode) {
2305         case BONDING_MODE_BROADCAST:
2306                 /**
2307                  * Setting link speed to UINT32_MAX to ensure we pick up the
2308                  * value of the first active slave
2309                  */
2310                 ethdev->data->dev_link.link_speed = UINT32_MAX;
2311
2312                 /**
2313                  * link speed is the minimum value of all the slaves' link
2314                  * speeds, as packet loss will occur on a slave if transmission
2315                  * at a rate greater than its link speed is attempted
2316                  */
2317                 for (idx = 0; idx < bond_ctx->active_slave_count; idx++) {
2318                         link_update(bond_ctx->active_slaves[idx], &slave_link);
2319
2320                         if (slave_link.link_speed <
2321                                         ethdev->data->dev_link.link_speed)
2322                                 ethdev->data->dev_link.link_speed =
2323                                                 slave_link.link_speed;
2324                 }
2325                 break;
2326         case BONDING_MODE_ACTIVE_BACKUP:
2327                 /* Current primary slave */
2328                 link_update(bond_ctx->current_primary_port, &slave_link);
2329
2330                 ethdev->data->dev_link.link_speed = slave_link.link_speed;
2331                 break;
2332         case BONDING_MODE_8023AD:
2333                 ethdev->data->dev_link.link_autoneg =
2334                                 bond_ctx->mode4.slave_link.link_autoneg;
2335                 ethdev->data->dev_link.link_duplex =
2336                                 bond_ctx->mode4.slave_link.link_duplex;
2337                 /* fall through to update link speed */
2338         case BONDING_MODE_ROUND_ROBIN:
2339         case BONDING_MODE_BALANCE:
2340         case BONDING_MODE_TLB:
2341         case BONDING_MODE_ALB:
2342         default:
2343                 /**
2344                  * In these modes the maximum theoretical link speed is the sum
2345                  * of all the slaves' link speeds
2346                  */
2347                 ethdev->data->dev_link.link_speed = ETH_SPEED_NUM_NONE;
2348
2349                 for (idx = 0; idx < bond_ctx->active_slave_count; idx++) {
2350                         link_update(bond_ctx->active_slaves[idx], &slave_link);
2351
2352                         ethdev->data->dev_link.link_speed +=
2353                                         slave_link.link_speed;
2354                 }
2355         }
2356
2357
2358         return 0;
2359 }
2360
2361
2362 static void
2363 bond_ethdev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
2364 {
2365         struct bond_dev_private *internals = dev->data->dev_private;
2366         struct rte_eth_stats slave_stats;
2367         int i, j;
2368
2369         for (i = 0; i < internals->slave_count; i++) {
2370                 rte_eth_stats_get(internals->slaves[i].port_id, &slave_stats);
2371
2372                 stats->ipackets += slave_stats.ipackets;
2373                 stats->opackets += slave_stats.opackets;
2374                 stats->ibytes += slave_stats.ibytes;
2375                 stats->obytes += slave_stats.obytes;
2376                 stats->imissed += slave_stats.imissed;
2377                 stats->ierrors += slave_stats.ierrors;
2378                 stats->oerrors += slave_stats.oerrors;
2379                 stats->rx_nombuf += slave_stats.rx_nombuf;
2380
2381                 for (j = 0; j < RTE_ETHDEV_QUEUE_STAT_CNTRS; j++) {
2382                         stats->q_ipackets[j] += slave_stats.q_ipackets[j];
2383                         stats->q_opackets[j] += slave_stats.q_opackets[j];
2384                         stats->q_ibytes[j] += slave_stats.q_ibytes[j];
2385                         stats->q_obytes[j] += slave_stats.q_obytes[j];
2386                         stats->q_errors[j] += slave_stats.q_errors[j];
2387                 }
2388
2389         }
2390 }
2391
2392 static void
2393 bond_ethdev_stats_reset(struct rte_eth_dev *dev)
2394 {
2395         struct bond_dev_private *internals = dev->data->dev_private;
2396         int i;
2397
2398         for (i = 0; i < internals->slave_count; i++)
2399                 rte_eth_stats_reset(internals->slaves[i].port_id);
2400 }
2401
2402 static void
2403 bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev)
2404 {
2405         struct bond_dev_private *internals = eth_dev->data->dev_private;
2406         int i;
2407
2408         internals->promiscuous_en = 1;
2409
2410         switch (internals->mode) {
2411         /* Promiscuous mode is propagated to all slaves */
2412         case BONDING_MODE_ROUND_ROBIN:
2413         case BONDING_MODE_BALANCE:
2414         case BONDING_MODE_BROADCAST:
2415                 for (i = 0; i < internals->slave_count; i++)
2416                         rte_eth_promiscuous_enable(internals->slaves[i].port_id);
2417                 break;
2418         /* In mode 4, promiscuous mode is managed when a slave is added/removed */
2419         case BONDING_MODE_8023AD:
2420                 break;
2421         /* Promiscuous mode is propagated only to primary slave */
2422         case BONDING_MODE_ACTIVE_BACKUP:
2423         case BONDING_MODE_TLB:
2424         case BONDING_MODE_ALB:
2425         default:
2426                 rte_eth_promiscuous_enable(internals->current_primary_port);
2427         }
2428 }
2429
2430 static void
2431 bond_ethdev_promiscuous_disable(struct rte_eth_dev *dev)
2432 {
2433         struct bond_dev_private *internals = dev->data->dev_private;
2434         int i;
2435
2436         internals->promiscuous_en = 0;
2437
2438         switch (internals->mode) {
2439         /* Promiscuous mode is propagated to all slaves */
2440         case BONDING_MODE_ROUND_ROBIN:
2441         case BONDING_MODE_BALANCE:
2442         case BONDING_MODE_BROADCAST:
2443                 for (i = 0; i < internals->slave_count; i++)
2444                         rte_eth_promiscuous_disable(internals->slaves[i].port_id);
2445                 break;
2446         /* In mode 4, promiscuous mode is managed when a slave is added/removed */
2447         case BONDING_MODE_8023AD:
2448                 break;
2449         /* Promiscuous mode is propagated only to primary slave */
2450         case BONDING_MODE_ACTIVE_BACKUP:
2451         case BONDING_MODE_TLB:
2452         case BONDING_MODE_ALB:
2453         default:
2454                 rte_eth_promiscuous_disable(internals->current_primary_port);
2455         }
2456 }
2457
2458 static void
2459 bond_ethdev_delayed_lsc_propagation(void *arg)
2460 {
2461         if (arg == NULL)
2462                 return;
2463
2464         _rte_eth_dev_callback_process((struct rte_eth_dev *)arg,
2465                         RTE_ETH_EVENT_INTR_LSC, NULL, NULL);
2466 }
2467
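/*
 * Link status change handler invoked for each slave; it is typically
 * registered against the slave ports via rte_eth_dev_callback_register()
 * when a slave is added (see rte_eth_bond_api.c). It activates/deactivates
 * the slave, re-elects the primary port if needed, refreshes the bonded link
 * properties and propagates the event upwards, honouring the configured
 * link up/down delays.
 */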
2468 int
2469 bond_ethdev_lsc_event_callback(uint8_t port_id, enum rte_eth_event_type type,
2470                 void *param, void *ret_param __rte_unused)
2471 {
2472         struct rte_eth_dev *bonded_eth_dev;
2473         struct bond_dev_private *internals;
2474         struct rte_eth_link link;
2475         int rc = -1;
2476
2477         int i, valid_slave = 0;
2478         uint8_t active_pos;
2479         uint8_t lsc_flag = 0;
2480
2481         if (type != RTE_ETH_EVENT_INTR_LSC || param == NULL)
2482                 return rc;
2483
2484         bonded_eth_dev = &rte_eth_devices[*(uint8_t *)param];
2485
2486         if (check_for_bonded_ethdev(bonded_eth_dev))
2487                 return rc;
2488
2489         internals = bonded_eth_dev->data->dev_private;
2490
2491         /* If the device isn't started don't handle interrupts */
2492         if (!bonded_eth_dev->data->dev_started)
2493                 return rc;
2494
2495         /* verify that port_id is a valid slave of bonded port */
2496         for (i = 0; i < internals->slave_count; i++) {
2497                 if (internals->slaves[i].port_id == port_id) {
2498                         valid_slave = 1;
2499                         break;
2500                 }
2501         }
2502
2503         if (!valid_slave)
2504                 return rc;
2505
2506         /* Search for port in active port list */
2507         active_pos = find_slave_by_id(internals->active_slaves,
2508                         internals->active_slave_count, port_id);
2509
2510         rte_eth_link_get_nowait(port_id, &link);
2511         if (link.link_status) {
2512                 if (active_pos < internals->active_slave_count)
2513                         return rc;
2514
2515                 /* if no active slave ports then set this port to be primary port */
2516                 if (internals->active_slave_count < 1) {
2517                         /* If first active slave, then change link status */
2518                         bonded_eth_dev->data->dev_link.link_status = ETH_LINK_UP;
2519                         internals->current_primary_port = port_id;
2520                         lsc_flag = 1;
2521
2522                         mac_address_slaves_update(bonded_eth_dev);
2523                 }
2524
2525                 activate_slave(bonded_eth_dev, port_id);
2526
2527                 /* If user has defined the primary port then default to using it */
2528                 if (internals->user_defined_primary_port &&
2529                                 internals->primary_port == port_id)
2530                         bond_ethdev_primary_set(internals, port_id);
2531         } else {
2532                 if (active_pos == internals->active_slave_count)
2533                         return rc;
2534
2535                 /* Remove from active slave list */
2536                 deactivate_slave(bonded_eth_dev, port_id);
2537
2538                 /* Update primary id, take first active slave from list or if none
2539                  * available fall back to the configured primary port */
2540                 if (port_id == internals->current_primary_port) {
2541                         if (internals->active_slave_count > 0)
2542                                 bond_ethdev_primary_set(internals,
2543                                                 internals->active_slaves[0]);
2544                         else
2545                                 internals->current_primary_port = internals->primary_port;
2546                 }
2547         }
2548
2549         /**
2550          * Update bonded device link properties after any change to active
2551          * slaves
2552          */
2553         bond_ethdev_link_update(bonded_eth_dev, 0);
2554
2555         if (lsc_flag) {
2556                 /* Cancel any possible outstanding interrupts if delays are enabled */
2557                 if (internals->link_up_delay_ms > 0 ||
2558                         internals->link_down_delay_ms > 0)
2559                         rte_eal_alarm_cancel(bond_ethdev_delayed_lsc_propagation,
2560                                         bonded_eth_dev);
2561
2562                 if (bonded_eth_dev->data->dev_link.link_status) {
2563                         if (internals->link_up_delay_ms > 0)
2564                                 rte_eal_alarm_set(internals->link_up_delay_ms * 1000,
2565                                                 bond_ethdev_delayed_lsc_propagation,
2566                                                 (void *)bonded_eth_dev);
2567                         else
2568                                 _rte_eth_dev_callback_process(bonded_eth_dev,
2569                                                 RTE_ETH_EVENT_INTR_LSC,
2570                                                 NULL, NULL);
2571
2572                 } else {
2573                         if (internals->link_down_delay_ms > 0)
2574                                 rte_eal_alarm_set(internals->link_down_delay_ms * 1000,
2575                                                 bond_ethdev_delayed_lsc_propagation,
2576                                                 (void *)bonded_eth_dev);
2577                         else
2578                                 _rte_eth_dev_callback_process(bonded_eth_dev,
2579                                                 RTE_ETH_EVENT_INTR_LSC,
2580                                                 NULL, NULL);
2581                 }
2582         }
2583         return 0;
2584 }
2585
2586 static int
2587 bond_ethdev_rss_reta_update(struct rte_eth_dev *dev,
2588                 struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2589 {
2590         unsigned i, j;
2591         int result = 0;
2592         int slave_reta_size;
2593         unsigned reta_count;
2594         struct bond_dev_private *internals = dev->data->dev_private;
2595
2596         if (reta_size != internals->reta_size)
2597                 return -EINVAL;
2598
2599          /* Copy RETA table */
2600         reta_count = reta_size / RTE_RETA_GROUP_SIZE;
2601
2602         for (i = 0; i < reta_count; i++) {
2603                 internals->reta_conf[i].mask = reta_conf[i].mask;
2604                 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2605                         if ((reta_conf[i].mask >> j) & 0x01)
2606                                 internals->reta_conf[i].reta[j] = reta_conf[i].reta[j];
2607         }
2608
2609         /* Fill rest of array */
2610         for (; i < RTE_DIM(internals->reta_conf); i += reta_count)
2611                 memcpy(&internals->reta_conf[i], &internals->reta_conf[0],
2612                                 sizeof(internals->reta_conf[0]) * reta_count);
2613
2614         /* Propagate RETA over slaves */
2615         for (i = 0; i < internals->slave_count; i++) {
2616                 slave_reta_size = internals->slaves[i].reta_size;
2617                 result = rte_eth_dev_rss_reta_update(internals->slaves[i].port_id,
2618                                 &internals->reta_conf[0], slave_reta_size);
2619                 if (result < 0)
2620                         return result;
2621         }
2622
2623         return 0;
2624 }
2625
2626 static int
2627 bond_ethdev_rss_reta_query(struct rte_eth_dev *dev,
2628                 struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2629 {
2630         int i, j;
2631         struct bond_dev_private *internals = dev->data->dev_private;
2632
2633         if (reta_size != internals->reta_size)
2634                 return -EINVAL;
2635
2636          /* Copy RETA table */
2637         for (i = 0; i < reta_size / RTE_RETA_GROUP_SIZE; i++)
2638                 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2639                         if ((reta_conf[i].mask >> j) & 0x01)
2640                                 reta_conf[i].reta[j] = internals->reta_conf[i].reta[j];
2641
2642         return 0;
2643 }
2644
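/*
 * Update the RSS hash configuration: the requested hash functions are masked
 * against the hash types the bonded device supports (flow_type_rss_offloads)
 * and the resulting configuration, including any new key, is propagated to
 * every slave.
 */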
2645 static int
2646 bond_ethdev_rss_hash_update(struct rte_eth_dev *dev,
2647                 struct rte_eth_rss_conf *rss_conf)
2648 {
2649         int i, result = 0;
2650         struct bond_dev_private *internals = dev->data->dev_private;
2651         struct rte_eth_rss_conf bond_rss_conf;
2652
2653         memcpy(&bond_rss_conf, rss_conf, sizeof(struct rte_eth_rss_conf));
2654
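             /* Keep only the hash functions that every slave can offload */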
2655         bond_rss_conf.rss_hf &= internals->flow_type_rss_offloads;
2656
2657         if (bond_rss_conf.rss_hf != 0)
2658                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf = bond_rss_conf.rss_hf;
2659
2660         if (bond_rss_conf.rss_key && bond_rss_conf.rss_key_len <
2661                         sizeof(internals->rss_key)) {
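                     /* A key length of zero selects the default 40-byte RSS key */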
2662                 if (bond_rss_conf.rss_key_len == 0)
2663                         bond_rss_conf.rss_key_len = 40;
2664                 internals->rss_key_len = bond_rss_conf.rss_key_len;
2665                 memcpy(internals->rss_key, bond_rss_conf.rss_key,
2666                                 internals->rss_key_len);
2667         }
2668
2669         for (i = 0; i < internals->slave_count; i++) {
2670                 result = rte_eth_dev_rss_hash_update(internals->slaves[i].port_id,
2671                                 &bond_rss_conf);
2672                 if (result < 0)
2673                         return result;
2674         }
2675
2676         return 0;
2677 }
2678
2679 static int
2680 bond_ethdev_rss_hash_conf_get(struct rte_eth_dev *dev,
2681                 struct rte_eth_rss_conf *rss_conf)
2682 {
2683         struct bond_dev_private *internals = dev->data->dev_private;
2684
2685         rss_conf->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
2686         rss_conf->rss_key_len = internals->rss_key_len;
2687         if (rss_conf->rss_key)
2688                 memcpy(rss_conf->rss_key, internals->rss_key, internals->rss_key_len);
2689
2690         return 0;
2691 }
2692
2693 const struct eth_dev_ops default_dev_ops = {
2694         .dev_start            = bond_ethdev_start,
2695         .dev_stop             = bond_ethdev_stop,
2696         .dev_close            = bond_ethdev_close,
2697         .dev_configure        = bond_ethdev_configure,
2698         .dev_infos_get        = bond_ethdev_info,
2699         .vlan_filter_set      = bond_ethdev_vlan_filter_set,
2700         .rx_queue_setup       = bond_ethdev_rx_queue_setup,
2701         .tx_queue_setup       = bond_ethdev_tx_queue_setup,
2702         .rx_queue_release     = bond_ethdev_rx_queue_release,
2703         .tx_queue_release     = bond_ethdev_tx_queue_release,
2704         .link_update          = bond_ethdev_link_update,
2705         .stats_get            = bond_ethdev_stats_get,
2706         .stats_reset          = bond_ethdev_stats_reset,
2707         .promiscuous_enable   = bond_ethdev_promiscuous_enable,
2708         .promiscuous_disable  = bond_ethdev_promiscuous_disable,
2709         .reta_update          = bond_ethdev_rss_reta_update,
2710         .reta_query           = bond_ethdev_rss_reta_query,
2711         .rss_hash_update      = bond_ethdev_rss_hash_update,
2712         .rss_hash_conf_get    = bond_ethdev_rss_hash_conf_get
2713 };
2714
2715 static int
2716 bond_alloc(struct rte_vdev_device *dev, uint8_t mode)
2717 {
2718         const char *name = rte_vdev_device_name(dev);
2719         uint8_t socket_id = dev->device.numa_node;
2720         struct bond_dev_private *internals = NULL;
2721         struct rte_eth_dev *eth_dev = NULL;
2722         uint32_t vlan_filter_bmp_size;
2723
2724         /* now do all the data allocation - for the eth_dev structure, dummy pci driver
2725          * and internal (private) data
2726          */
2727
2728         /* reserve an ethdev entry */
2729         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internals));
2730         if (eth_dev == NULL) {
2731                 RTE_BOND_LOG(ERR, "Unable to allocate rte_eth_dev");
2732                 goto err;
2733         }
2734
2735         internals = eth_dev->data->dev_private;
2736         eth_dev->data->nb_rx_queues = (uint16_t)1;
2737         eth_dev->data->nb_tx_queues = (uint16_t)1;
2738
2739         eth_dev->data->mac_addrs = rte_zmalloc_socket(name, ETHER_ADDR_LEN, 0,
2740                         socket_id);
2741         if (eth_dev->data->mac_addrs == NULL) {
2742                 RTE_BOND_LOG(ERR, "Unable to malloc mac_addrs");
2743                 goto err;
2744         }
2745
2746         eth_dev->dev_ops = &default_dev_ops;
2747         eth_dev->data->dev_flags = RTE_ETH_DEV_INTR_LSC |
2748                 RTE_ETH_DEV_DETACHABLE;
2749
2750         rte_spinlock_init(&internals->lock);
2751
2752         internals->port_id = eth_dev->data->port_id;
2753         internals->mode = BONDING_MODE_INVALID;
2754         internals->current_primary_port = RTE_MAX_ETHPORTS + 1;
2755         internals->balance_xmit_policy = BALANCE_XMIT_POLICY_LAYER2;
2756         internals->xmit_hash = xmit_l2_hash;
2757         internals->user_defined_mac = 0;
2758
2759         internals->link_status_polling_enabled = 0;
2760
2761         internals->link_status_polling_interval_ms =
2762                 DEFAULT_POLLING_INTERVAL_10_MS;
2763         internals->link_down_delay_ms = 0;
2764         internals->link_up_delay_ms = 0;
2765
2766         internals->slave_count = 0;
2767         internals->active_slave_count = 0;
2768         internals->rx_offload_capa = 0;
2769         internals->tx_offload_capa = 0;
2770         internals->candidate_max_rx_pktlen = 0;
2771         internals->max_rx_pktlen = 0;
2772
2773         /* Initially allow to choose any offload type */
2774         internals->flow_type_rss_offloads = ETH_RSS_PROTO_MASK;
2775
2776         memset(internals->active_slaves, 0, sizeof(internals->active_slaves));
2777         memset(internals->slaves, 0, sizeof(internals->slaves));
2778
2779         /* Set mode 4 default configuration */
2780         bond_mode_8023ad_setup(eth_dev, NULL);
2781         if (bond_ethdev_mode_set(eth_dev, mode)) {
2782                 RTE_BOND_LOG(ERR, "Failed to set bonded device %d mode to %d",
2783                                  eth_dev->data->port_id, mode);
2784                 goto err;
2785         }
2786
2787         vlan_filter_bmp_size =
2788                 rte_bitmap_get_memory_footprint(ETHER_MAX_VLAN_ID + 1);
2789         internals->vlan_filter_bmpmem = rte_malloc(name, vlan_filter_bmp_size,
2790                                                    RTE_CACHE_LINE_SIZE);
2791         if (internals->vlan_filter_bmpmem == NULL) {
2792                 RTE_BOND_LOG(ERR,
2793                              "Failed to allocate vlan bitmap for bonded device %u\n",
2794                              eth_dev->data->port_id);
2795                 goto err;
2796         }
2797
2798         internals->vlan_filter_bmp = rte_bitmap_init(ETHER_MAX_VLAN_ID + 1,
2799                         internals->vlan_filter_bmpmem, vlan_filter_bmp_size);
2800         if (internals->vlan_filter_bmp == NULL) {
2801                 RTE_BOND_LOG(ERR,
2802                              "Failed to init vlan bitmap for bonded device %u\n",
2803                              eth_dev->data->port_id);
2804                 rte_free(internals->vlan_filter_bmpmem);
2805                 goto err;
2806         }
2807
2808         return eth_dev->data->port_id;
2809
2810 err:
2811         rte_free(internals);
2812         if (eth_dev != NULL) {
2813                 rte_free(eth_dev->data->mac_addrs);
2814                 rte_eth_dev_release_port(eth_dev);
2815         }
2816         return -1;
2817 }
2818
2819 static int
2820 bond_probe(struct rte_vdev_device *dev)
2821 {
2822         const char *name;
2823         struct bond_dev_private *internals;
2824         struct rte_kvargs *kvlist;
2825         uint8_t bonding_mode, socket_id;
2826         int  arg_count, port_id;
2827
2828         if (!dev)
2829                 return -EINVAL;
2830
2831         name = rte_vdev_device_name(dev);
2832         RTE_LOG(INFO, EAL, "Initializing pmd_bond for %s\n", name);
2833
2834         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev),
2835                 pmd_bond_init_valid_arguments);
2836         if (kvlist == NULL)
2837                 return -1;
2838
2839         /* Parse link bonding mode */
2840         if (rte_kvargs_count(kvlist, PMD_BOND_MODE_KVARG) == 1) {
2841                 if (rte_kvargs_process(kvlist, PMD_BOND_MODE_KVARG,
2842                                 &bond_ethdev_parse_slave_mode_kvarg,
2843                                 &bonding_mode) != 0) {
2844                         RTE_LOG(ERR, EAL, "Invalid mode for bonded device %s\n",
2845                                         name);
2846                         goto parse_error;
2847                 }
2848         } else {
2849                 RTE_LOG(ERR, EAL, "Mode must be specified exactly once for bonded "
2850                                 "device %s\n", name);
2851                 goto parse_error;
2852         }
2853
2854         /* Parse socket id to create bonding device on */
2855         arg_count = rte_kvargs_count(kvlist, PMD_BOND_SOCKET_ID_KVARG);
2856         if (arg_count == 1) {
2857                 if (rte_kvargs_process(kvlist, PMD_BOND_SOCKET_ID_KVARG,
2858                                 &bond_ethdev_parse_socket_id_kvarg, &socket_id)
2859                                 != 0) {
2860                         RTE_LOG(ERR, EAL, "Invalid socket Id specified for "
2861                                         "bonded device %s\n", name);
2862                         goto parse_error;
2863                 }
2864         } else if (arg_count > 1) {
2865                 RTE_LOG(ERR, EAL, "Socket Id can be specified only once for "
2866                                 "bonded device %s\n", name);
2867                 goto parse_error;
2868         } else {
2869                 socket_id = rte_socket_id();
2870         }
2871
2872         dev->device.numa_node = socket_id;
2873
2874         /* Create link bonding eth device */
2875         port_id = bond_alloc(dev, bonding_mode);
2876         if (port_id < 0) {
2877                 RTE_LOG(ERR, EAL, "Failed to create bonded device %s in mode %u on "
2878                                 "socket %u.\n", name, bonding_mode, socket_id);
2879                 goto parse_error;
2880         }
2881         internals = rte_eth_devices[port_id].data->dev_private;
2882         internals->kvlist = kvlist;
2883
2884         RTE_LOG(INFO, EAL, "Created bonded device %s on port %d in mode %u on "
2885                         "socket %u.\n", name, port_id, bonding_mode, socket_id);
2886         return 0;
2887
2888 parse_error:
2889         rte_kvargs_free(kvlist);
2890
2891         return -1;
2892 }
2893
2894 static int
2895 bond_remove(struct rte_vdev_device *dev)
2896 {
2897         struct rte_eth_dev *eth_dev;
2898         struct bond_dev_private *internals;
2899         const char *name;
2900
2901         if (!dev)
2902                 return -EINVAL;
2903
2904         name = rte_vdev_device_name(dev);
2905         RTE_LOG(INFO, EAL, "Uninitializing pmd_bond for %s\n", name);
2906
2907         /* now free all allocated data - the eth_dev structure,
2908          * dummy pci driver and internal (private) data
2909          */
2910
2911         /* find an ethdev entry */
2912         eth_dev = rte_eth_dev_allocated(name);
2913         if (eth_dev == NULL)
2914                 return -ENODEV;
2915
2916         RTE_ASSERT(eth_dev->device == &dev->device);
2917
2918         internals = eth_dev->data->dev_private;
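             /* Refuse to remove the bonded device while slaves are still attached */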
2919         if (internals->slave_count != 0)
2920                 return -EBUSY;
2921
2922         if (eth_dev->data->dev_started == 1) {
2923                 bond_ethdev_stop(eth_dev);
2924                 bond_ethdev_close(eth_dev);
2925         }
2926
2927         eth_dev->dev_ops = NULL;
2928         eth_dev->rx_pkt_burst = NULL;
2929         eth_dev->tx_pkt_burst = NULL;
2930
2931         internals = eth_dev->data->dev_private;
2932         rte_bitmap_free(internals->vlan_filter_bmp);
2933         rte_free(internals->vlan_filter_bmpmem);
2934         rte_free(eth_dev->data->dev_private);
2935         rte_free(eth_dev->data->mac_addrs);
2936
2937         rte_eth_dev_release_port(eth_dev);
2938
2939         return 0;
2940 }
2941
2942 /* this part resolves the slave port ids after all the other pdevs and vdevs
2943  * have been allocated */
2944 static int
2945 bond_ethdev_configure(struct rte_eth_dev *dev)
2946 {
2947         const char *name = dev->device->name;
2948         struct bond_dev_private *internals = dev->data->dev_private;
2949         struct rte_kvargs *kvlist = internals->kvlist;
2950         int arg_count;
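             /* port_id is the device's index in the global rte_eth_devices array */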
2951         uint8_t port_id = dev - rte_eth_devices;
2952
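             /* Default 40-byte RSS key used when RSS is enabled on the bonded device */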
2953         static const uint8_t default_rss_key[40] = {
2954                 0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2, 0x41, 0x67, 0x25, 0x3D,
2955                 0x43, 0xA3, 0x8F, 0xB0, 0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
2956                 0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C, 0x6A, 0x42, 0xB7, 0x3B,
2957                 0xBE, 0xAC, 0x01, 0xFA
2958         };
2959
2960         unsigned i, j;
2961
2962         /* If RSS is enabled, fill table and key with default values */
2963         if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
2964                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = internals->rss_key;
2965                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len = 0;
2966                 memcpy(internals->rss_key, default_rss_key, 40);
2967
2968                 for (i = 0; i < RTE_DIM(internals->reta_conf); i++) {
2969                         internals->reta_conf[i].mask = ~0LL;
2970                         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2971                                 internals->reta_conf[i].reta[j] = j % dev->data->nb_rx_queues;
2972                 }
2973         }
2974
2975         /* set the max_rx_pktlen */
2976         internals->max_rx_pktlen = internals->candidate_max_rx_pktlen;
2977
2978         /*
2979          * if no kvlist, it means that this bonded device has been created
2980          * through the bonding api.
2981          */
2982         if (!kvlist)
2983                 return 0;
2984
2985         /* Parse MAC address for bonded device */
2986         arg_count = rte_kvargs_count(kvlist, PMD_BOND_MAC_ADDR_KVARG);
2987         if (arg_count == 1) {
2988                 struct ether_addr bond_mac;
2989
2990                 if (rte_kvargs_process(kvlist, PMD_BOND_MAC_ADDR_KVARG,
2991                                 &bond_ethdev_parse_bond_mac_addr_kvarg, &bond_mac) < 0) {
2992                         RTE_LOG(INFO, EAL, "Invalid mac address for bonded device %s\n",
2993                                         name);
2994                         return -1;
2995                 }
2996
2997                 /* Set MAC address */
2998                 if (rte_eth_bond_mac_address_set(port_id, &bond_mac) != 0) {
2999                         RTE_LOG(ERR, EAL,
3000                                         "Failed to set mac address on bonded device %s\n",
3001                                         name);
3002                         return -1;
3003                 }
3004         } else if (arg_count > 1) {
3005                 RTE_LOG(ERR, EAL,
3006                                 "MAC address can be specified only once for bonded device %s\n",
3007                                 name);
3008                 return -1;
3009         }
3010
3011         /* Parse/set balance mode transmit policy */
3012         arg_count = rte_kvargs_count(kvlist, PMD_BOND_XMIT_POLICY_KVARG);
3013         if (arg_count == 1) {
3014                 uint8_t xmit_policy;
3015
3016                 if (rte_kvargs_process(kvlist, PMD_BOND_XMIT_POLICY_KVARG,
3017                                 &bond_ethdev_parse_balance_xmit_policy_kvarg, &xmit_policy) !=
3018                                                 0) {
3019                         RTE_LOG(INFO, EAL,
3020                                         "Invalid xmit policy specified for bonded device %s\n",
3021                                         name);
3022                         return -1;
3023                 }
3024
3025                 /* Set balance mode transmit policy */
3026                 if (rte_eth_bond_xmit_policy_set(port_id, xmit_policy) != 0) {
3027                         RTE_LOG(ERR, EAL,
3028                                         "Failed to set balance xmit policy on bonded device %s\n",
3029                                         name);
3030                         return -1;
3031                 }
3032         } else if (arg_count > 1) {
3033                 RTE_LOG(ERR, EAL,
3034                                 "Transmit policy can be specified only once for bonded device"
3035                                 " %s\n", name);
3036                 return -1;
3037         }
3038
3039         /* Parse/add slave ports to bonded device */
3040         if (rte_kvargs_count(kvlist, PMD_BOND_SLAVE_PORT_KVARG) > 0) {
3041                 struct bond_ethdev_slave_ports slave_ports;
3042                 unsigned i;
3043
3044                 memset(&slave_ports, 0, sizeof(slave_ports));
3045
3046                 if (rte_kvargs_process(kvlist, PMD_BOND_SLAVE_PORT_KVARG,
3047                                 &bond_ethdev_parse_slave_port_kvarg, &slave_ports) != 0) {
3048                         RTE_LOG(ERR, EAL,
3049                                         "Failed to parse slave ports for bonded device %s\n",
3050                                         name);
3051                         return -1;
3052                 }
3053
3054                 for (i = 0; i < slave_ports.slave_count; i++) {
3055                         if (rte_eth_bond_slave_add(port_id, slave_ports.slaves[i]) != 0) {
3056                                 RTE_LOG(ERR, EAL,
3057                                                 "Failed to add port %d as slave to bonded device %s\n",
3058                                                 slave_ports.slaves[i], name);
3059                         }
3060                 }
3061
3062         } else {
3063                 RTE_LOG(INFO, EAL, "No slaves specified for bonded device %s\n", name);
3064                 return -1;
3065         }
3066
3067         /* Parse/set primary slave port id */
3068         arg_count = rte_kvargs_count(kvlist, PMD_BOND_PRIMARY_SLAVE_KVARG);
3069         if (arg_count == 1) {
3070                 uint8_t primary_slave_port_id;
3071
3072                 if (rte_kvargs_process(kvlist,
3073                                 PMD_BOND_PRIMARY_SLAVE_KVARG,
3074                                 &bond_ethdev_parse_primary_slave_port_id_kvarg,
3075                                 &primary_slave_port_id) < 0) {
3076                         RTE_LOG(INFO, EAL,
3077                                         "Invalid primary slave port id specified for bonded device"
3078                                         " %s\n", name);
3079                         return -1;
3080                 }
3081
3082                 /* Set the primary slave port id */
3083                 if (rte_eth_bond_primary_set(port_id, (uint8_t)primary_slave_port_id)
3084                                 != 0) {
3085                         RTE_LOG(ERR, EAL,
3086                                         "Failed to set primary slave port %d on bonded device %s\n",
3087                                         primary_slave_port_id, name);
3088                         return -1;
3089                 }
3090         } else if (arg_count > 1) {
3091                 RTE_LOG(INFO, EAL,
3092                                 "Primary slave can be specified only once for bonded device"
3093                                 " %s\n", name);
3094                 return -1;
3095         }
3096
3097         /* Parse link status monitor polling interval */
3098         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LSC_POLL_PERIOD_KVARG);
3099         if (arg_count == 1) {
3100                 uint32_t lsc_poll_interval_ms;
3101
3102                 if (rte_kvargs_process(kvlist,
3103                                 PMD_BOND_LSC_POLL_PERIOD_KVARG,
3104                                 &bond_ethdev_parse_time_ms_kvarg,
3105                                 &lsc_poll_interval_ms) < 0) {
3106                         RTE_LOG(INFO, EAL,
3107                                         "Invalid lsc polling interval value specified for bonded"
3108                                         " device %s\n", name);
3109                         return -1;
3110                 }
3111
3112                 if (rte_eth_bond_link_monitoring_set(port_id, lsc_poll_interval_ms)
3113                                 != 0) {
3114                         RTE_LOG(ERR, EAL,
3115                                         "Failed to set lsc monitor polling interval (%u ms) on"
3116                                         " bonded device %s\n", lsc_poll_interval_ms, name);
3117                         return -1;
3118                 }
3119         } else if (arg_count > 1) {
3120                 RTE_LOG(INFO, EAL,
3121                                 "LSC polling interval can be specified only once for bonded"
3122                                 " device %s\n", name);
3123                 return -1;
3124         }
3125
3126         /* Parse link up interrupt propagation delay */
3127         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_UP_PROP_DELAY_KVARG);
3128         if (arg_count == 1) {
3129                 uint32_t link_up_delay_ms;
3130
3131                 if (rte_kvargs_process(kvlist,
3132                                 PMD_BOND_LINK_UP_PROP_DELAY_KVARG,
3133                                 &bond_ethdev_parse_time_ms_kvarg,
3134                                 &link_up_delay_ms) < 0) {
3135                         RTE_LOG(INFO, EAL,
3136                                         "Invalid link up propagation delay value specified for"
3137                                         " bonded device %s\n", name);
3138                         return -1;
3139                 }
3140
3141                 /* Set the link up propagation delay */
3142                 if (rte_eth_bond_link_up_prop_delay_set(port_id, link_up_delay_ms)
3143                                 != 0) {
3144                         RTE_LOG(ERR, EAL,
3145                                         "Failed to set link up propagation delay (%u ms) on bonded"
3146                                         " device %s\n", link_up_delay_ms, name);
3147                         return -1;
3148                 }
3149         } else if (arg_count > 1) {
3150                 RTE_LOG(INFO, EAL,
3151                                 "Link up propagation delay can be specified only once for"
3152                                 " bonded device %s\n", name);
3153                 return -1;
3154         }
3155
3156         /* Parse link down interrupt propagation delay */
3157         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG);
3158         if (arg_count == 1) {
3159                 uint32_t link_down_delay_ms;
3160
3161                 if (rte_kvargs_process(kvlist,
3162                                 PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG,
3163                                 &bond_ethdev_parse_time_ms_kvarg,
3164                                 &link_down_delay_ms) < 0) {
3165                         RTE_LOG(INFO, EAL,
3166                                         "Invalid link down propagation delay value specified for"
3167                                         " bonded device %s\n", name);
3168                         return -1;
3169                 }
3170
3171                 /* Set the link down propagation delay */
3172                 if (rte_eth_bond_link_down_prop_delay_set(port_id, link_down_delay_ms)
3173                                 != 0) {
3174                         RTE_LOG(ERR, EAL,
3175                                         "Failed to set link down propagation delay (%u ms) on"
3176                                         " bonded device %s\n", link_down_delay_ms, name);
3177                         return -1;
3178                 }
3179         } else if (arg_count > 1) {
3180                 RTE_LOG(INFO, EAL,
3181                                 "Link down propagation delay can be specified only once for"
3182                                 " bonded device %s\n", name);
3183                 return -1;
3184         }
3185
3186         return 0;
3187 }
3188
3189 struct rte_vdev_driver pmd_bond_drv = {
3190         .probe = bond_probe,
3191         .remove = bond_remove,
3192 };
3193
3194 RTE_PMD_REGISTER_VDEV(net_bonding, pmd_bond_drv);
3195 RTE_PMD_REGISTER_ALIAS(net_bonding, eth_bond);
3196
3197 RTE_PMD_REGISTER_PARAM_STRING(net_bonding,
3198         "slave=<ifc> "
3199         "primary=<ifc> "
3200         "mode=[0-6] "
3201         "xmit_policy=[l2 | l23 | l34] "
3202         "socket_id=<int> "
3203         "mac=<mac addr> "
3204         "lsc_poll_period_ms=<int> "
3205         "up_delay=<int> "
3206         "down_delay=<int>");
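
     /*
      * Example devargs (hypothetical PCI addresses and vdev name): create an
      * active-backup (mode 1) bonded device over two physical ports:
      *
      *   --vdev 'net_bonding0,mode=1,slave=0000:00:01.0,slave=0000:00:02.0'
      */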