net/bonding: use ptype flags for LACP Rx filtering
drivers/net/bonding/rte_eth_bond_pmd.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stdlib.h>
#include <netinet/in.h>

#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_ethdev_vdev.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_ip.h>
#include <rte_ip_frag.h>
#include <rte_devargs.h>
#include <rte_kvargs.h>
#include <rte_vdev.h>
#include <rte_alarm.h>
#include <rte_cycles.h>

#include "rte_eth_bond.h"
#include "rte_eth_bond_private.h"
#include "rte_eth_bond_8023ad_private.h"

#define REORDER_PERIOD_MS 10
#define DEFAULT_POLLING_INTERVAL_10_MS (10)

#define HASH_L4_PORTS(h) ((h)->src_port ^ (h)->dst_port)

/* Table for statistics in mode 5 TLB */
static uint64_t tlb_last_obytets[RTE_MAX_ETHPORTS];

static inline size_t
get_vlan_offset(struct ether_hdr *eth_hdr, uint16_t *proto)
{
        size_t vlan_offset = 0;

        if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
                struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);

                vlan_offset = sizeof(struct vlan_hdr);
                *proto = vlan_hdr->eth_proto;

                if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
                        vlan_hdr = vlan_hdr + 1;
                        *proto = vlan_hdr->eth_proto;
                        vlan_offset += sizeof(struct vlan_hdr);
                }
        }
        return vlan_offset;
}

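/*
 * Default Rx burst: poll each active slave in turn, appending whatever it
 * returns to bufs[] until either all slaves have been polled or nb_pkts
 * packets have been gathered.
 */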
static uint16_t
bond_ethdev_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_dev_private *internals;

        uint16_t num_rx_slave = 0;
        uint16_t num_rx_total = 0;

        int i;

        /* Cast to the structure containing the bonded device's queue id and
         * private data */
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;

        internals = bd_rx_q->dev_private;

        for (i = 0; i < internals->active_slave_count && nb_pkts; i++) {
                /* The offset into *bufs advances as packets are received
                 * from the preceding slaves */
                num_rx_slave = rte_eth_rx_burst(internals->active_slaves[i],
                                bd_rx_q->queue_id, bufs + num_rx_total, nb_pkts);
                if (num_rx_slave) {
                        num_rx_total += num_rx_slave;
                        nb_pkts -= num_rx_slave;
                }
        }

        return num_rx_total;
}

static uint16_t
bond_ethdev_rx_burst_active_backup(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;

        /* Cast to the structure containing the bonded device's queue id and
         * private data */
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;

        internals = bd_rx_q->dev_private;

        return rte_eth_rx_burst(internals->current_primary_port,
                        bd_rx_q->queue_id, bufs, nb_pkts);
}

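/*
 * A frame is treated as LACP traffic only if it is untagged (vlan_tci == 0)
 * and carries the slow protocols ethertype with a LACP or marker subtype.
 */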
static inline uint8_t
is_lacp_packets(uint16_t ethertype, uint8_t subtype, uint16_t vlan_tci)
{
        const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);

        return !vlan_tci && (ethertype == ether_type_slow_be &&
                (subtype == SLOW_SUBTYPE_MARKER || subtype == SLOW_SUBTYPE_LACP));
}

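/*
 * Rx burst for mode 4 (802.3ad). Data packets are delivered to the
 * application while slow protocol frames are diverted to the LACP state
 * machine. The mbuf packet_type flags set by the slave PMD are used as a
 * cheap pre-filter: anything classified beyond plain L2 Ethernet cannot be
 * a slow frame, so only pure L2 packets need their headers inspected.
 */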
static uint16_t
bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        /* Cast to the structure containing the bonded device's queue id and
         * private data */
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
        struct bond_dev_private *internals = bd_rx_q->dev_private;
        struct ether_addr bond_mac;

        struct ether_hdr *hdr;

        const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
        uint16_t num_rx_total = 0;      /* Total number of received packets */
        uint8_t slaves[RTE_MAX_ETHPORTS];
        uint8_t slave_count, idx;

        uint8_t collecting;  /* current slave collecting status */
        const uint8_t promisc = internals->promiscuous_en;
        uint8_t i, j, k;
        uint8_t subtype;

        rte_eth_macaddr_get(internals->port_id, &bond_mac);
        /* Copy slave list to protect against slave up/down changes during rx
         * bursting */
        slave_count = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * slave_count);

        idx = internals->active_slave;
        if (idx >= slave_count) {
                internals->active_slave = 0;
                idx = 0;
        }
        for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
                j = num_rx_total;
                collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
                                         COLLECTING);

                /* Read packets from this slave */
                num_rx_total += rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id,
                                &bufs[num_rx_total], nb_pkts - num_rx_total);

                for (k = j; k < 2 && k < num_rx_total; k++)
                        rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));

                /* Handle slow protocol packets. */
                while (j < num_rx_total) {

                        /* If the ptype marks the packet as anything beyond
                         * plain L2 Ethernet, it cannot be a slow protocol
                         * frame, so skip it */
                        if ((bufs[j]->packet_type & ~RTE_PTYPE_L2_ETHER) != 0) {
                                j++;
                                continue;
                        }

                        if (j + 3 < num_rx_total)
                                rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *));

                        hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
                        subtype = ((struct slow_protocol_frame *)hdr)->slow_protocol.subtype;

                        /* Remove the packet from the array if it is a slow
                         * packet, if the slave is not in collecting state, or
                         * if the bonded interface is not in promiscuous mode
                         * and the destination address does not match. */
                        if (unlikely(is_lacp_packets(hdr->ether_type, subtype, bufs[j]->vlan_tci) ||
                                !collecting || (!promisc &&
                                        !is_multicast_ether_addr(&hdr->d_addr) &&
                                        !is_same_ether_addr(&bond_mac, &hdr->d_addr)))) {

                                if (hdr->ether_type == ether_type_slow_be) {
                                        bond_mode_8023ad_handle_slow_pkt(
                                            internals, slaves[idx], bufs[j]);
                                } else
                                        rte_pktmbuf_free(bufs[j]);

                                /* Packet is managed by mode 4 or dropped, shift the array */
                                num_rx_total--;
                                if (j < num_rx_total) {
                                        memmove(&bufs[j], &bufs[j + 1], sizeof(bufs[0]) *
                                                (num_rx_total - j));
                                }
                        } else
                                j++;
                }
                if (unlikely(++idx == slave_count))
                        idx = 0;
        }

        internals->active_slave = idx;
        return num_rx_total;
}

#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
uint32_t burstnumberRX;
uint32_t burstnumberTX;

#ifdef RTE_LIBRTE_BOND_DEBUG_ALB

static void
arp_op_name(uint16_t arp_op, char *buf)
{
        switch (arp_op) {
        case ARP_OP_REQUEST:
                snprintf(buf, sizeof("ARP Request"), "%s", "ARP Request");
                return;
        case ARP_OP_REPLY:
                snprintf(buf, sizeof("ARP Reply"), "%s", "ARP Reply");
                return;
        case ARP_OP_REVREQUEST:
                snprintf(buf, sizeof("Reverse ARP Request"), "%s",
                                "Reverse ARP Request");
                return;
        case ARP_OP_REVREPLY:
                snprintf(buf, sizeof("Reverse ARP Reply"), "%s",
                                "Reverse ARP Reply");
                return;
        case ARP_OP_INVREQUEST:
                snprintf(buf, sizeof("Peer Identify Request"), "%s",
                                "Peer Identify Request");
                return;
        case ARP_OP_INVREPLY:
                snprintf(buf, sizeof("Peer Identify Reply"), "%s",
                                "Peer Identify Reply");
                return;
        default:
                break;
        }
        snprintf(buf, sizeof("Unknown"), "%s", "Unknown");
        return;
}
#endif
#define MaxIPv4String   16
static void
ipv4_addr_to_dot(uint32_t be_ipv4_addr, char *buf, uint8_t buf_size)
{
        uint32_t ipv4_addr;

        ipv4_addr = rte_be_to_cpu_32(be_ipv4_addr);
        snprintf(buf, buf_size, "%d.%d.%d.%d", (ipv4_addr >> 24) & 0xFF,
                (ipv4_addr >> 16) & 0xFF, (ipv4_addr >> 8) & 0xFF,
                ipv4_addr & 0xFF);
}

#define MAX_CLIENTS_NUMBER      128
uint8_t active_clients;
struct client_stats_t {
        uint8_t port;
        uint32_t ipv4_addr;
        uint32_t ipv4_rx_packets;
        uint32_t ipv4_tx_packets;
};
struct client_stats_t client_stats[MAX_CLIENTS_NUMBER];

static void
update_client_stats(uint32_t addr, uint8_t port, uint32_t *TXorRXindicator)
{
        int i = 0;

        for (; i < MAX_CLIENTS_NUMBER; i++) {
                if ((client_stats[i].ipv4_addr == addr) && (client_stats[i].port == port)) {
                        /* Update the RX or TX packet count for this known client */
                        if (TXorRXindicator == &burstnumberRX)
                                client_stats[i].ipv4_rx_packets++;
                        else
                                client_stats[i].ipv4_tx_packets++;
                        return;
                }
        }
        /* We have a new client. Insert it into the table and update its
         * stats, unless the table is already full. */
        if (active_clients >= MAX_CLIENTS_NUMBER)
                return;
        if (TXorRXindicator == &burstnumberRX)
                client_stats[active_clients].ipv4_rx_packets++;
        else
                client_stats[active_clients].ipv4_tx_packets++;
        client_stats[active_clients].ipv4_addr = addr;
        client_stats[active_clients].port = port;
        active_clients++;
}

#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
#define MODE6_DEBUG(info, src_ip, dst_ip, eth_h, arp_op, port, burstnumber)     \
                RTE_LOG(DEBUG, PMD, \
                "%s " \
                "port:%d " \
                "SrcMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
                "SrcIP:%s " \
                "DstMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
                "DstIP:%s " \
                "%s " \
                "%d\n", \
                info, \
                port, \
                eth_h->s_addr.addr_bytes[0], \
                eth_h->s_addr.addr_bytes[1], \
                eth_h->s_addr.addr_bytes[2], \
                eth_h->s_addr.addr_bytes[3], \
                eth_h->s_addr.addr_bytes[4], \
                eth_h->s_addr.addr_bytes[5], \
                src_ip, \
                eth_h->d_addr.addr_bytes[0], \
                eth_h->d_addr.addr_bytes[1], \
                eth_h->d_addr.addr_bytes[2], \
                eth_h->d_addr.addr_bytes[3], \
                eth_h->d_addr.addr_bytes[4], \
                eth_h->d_addr.addr_bytes[5], \
                dst_ip, \
                arp_op, \
                ++burstnumber)
#endif

static void
mode6_debug(const char __attribute__((unused)) *info, struct ether_hdr *eth_h,
                uint8_t port, uint32_t __attribute__((unused)) *burstnumber)
{
        struct ipv4_hdr *ipv4_h;
#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
        struct arp_hdr *arp_h;
        char dst_ip[16];
        char ArpOp[24];
        char buf[16];
#endif
        char src_ip[16];

        uint16_t ether_type = eth_h->ether_type;
        uint16_t offset = get_vlan_offset(eth_h, &ether_type);

#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
        snprintf(buf, 16, "%s", info);
#endif

        if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
                ipv4_h = (struct ipv4_hdr *)((char *)(eth_h + 1) + offset);
                ipv4_addr_to_dot(ipv4_h->src_addr, src_ip, MaxIPv4String);
#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
                ipv4_addr_to_dot(ipv4_h->dst_addr, dst_ip, MaxIPv4String);
                MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, "", port, *burstnumber);
#endif
                update_client_stats(ipv4_h->src_addr, port, burstnumber);
        }
#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
        else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
                arp_h = (struct arp_hdr *)((char *)(eth_h + 1) + offset);
                ipv4_addr_to_dot(arp_h->arp_data.arp_sip, src_ip, MaxIPv4String);
                ipv4_addr_to_dot(arp_h->arp_data.arp_tip, dst_ip, MaxIPv4String);
                arp_op_name(rte_be_to_cpu_16(arp_h->arp_op), ArpOp);
                MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, ArpOp, port, *burstnumber);
        }
#endif
}
#endif

static uint16_t
bond_ethdev_rx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        /* This is an Rx path, so cast to the Rx queue structure */
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
        struct bond_dev_private *internals = bd_rx_q->dev_private;
        struct ether_hdr *eth_h;
        uint16_t ether_type, offset;
        uint16_t nb_recv_pkts;
        int i;

        nb_recv_pkts = bond_ethdev_rx_burst(queue, bufs, nb_pkts);

        for (i = 0; i < nb_recv_pkts; i++) {
                eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
                ether_type = eth_h->ether_type;
                offset = get_vlan_offset(eth_h, &ether_type);

                if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                        mode6_debug("RX ARP:", eth_h, bufs[i]->port, &burstnumberRX);
#endif
                        bond_mode_alb_arp_recv(eth_h, offset, internals);
                }
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4))
                        mode6_debug("RX IPv4:", eth_h, bufs[i]->port, &burstnumberRX);
#endif
        }

        return nb_recv_pkts;
}

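/*
 * Round-robin Tx: spread the burst across the active slaves, starting from
 * where the previous burst left off. Note that slave_idx below is a single
 * static variable, so the rotation position is shared by all bonded devices
 * and queues.
 */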
static uint16_t
bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
        uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };

        uint8_t num_of_slaves;
        uint8_t slaves[RTE_MAX_ETHPORTS];

        uint16_t num_tx_total = 0, num_tx_slave;

        static int slave_idx = 0;
        int i, cslave_idx = 0, tx_fail_total = 0;

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * num_of_slaves);

        if (num_of_slaves < 1)
                return num_tx_total;

        /* Populate each slave's buffer with the packets to be sent on it */
        for (i = 0; i < nb_pkts; i++) {
                cslave_idx = (slave_idx + i) % num_of_slaves;
                slave_bufs[cslave_idx][(slave_nb_pkts[cslave_idx])++] = bufs[i];
        }

        /* Increment the current slave index so the next call to tx burst
         * starts on the next slave */
        slave_idx = ++cslave_idx;

        /* Send packet burst on each slave device */
        for (i = 0; i < num_of_slaves; i++) {
                if (slave_nb_pkts[i] > 0) {
                        num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                        slave_bufs[i], slave_nb_pkts[i]);

                        /* if tx burst fails move packets to end of bufs */
                        if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
                                int tx_fail_slave = slave_nb_pkts[i] - num_tx_slave;

                                tx_fail_total += tx_fail_slave;

                                memcpy(&bufs[nb_pkts - tx_fail_total],
                                                &slave_bufs[i][num_tx_slave],
                                                tx_fail_slave * sizeof(bufs[0]));
                        }
                        num_tx_total += num_tx_slave;
                }
        }

        return num_tx_total;
}

static uint16_t
bond_ethdev_tx_burst_active_backup(void *queue,
                struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        if (internals->active_slave_count < 1)
                return 0;

        return rte_eth_tx_burst(internals->current_primary_port, bd_tx_q->queue_id,
                        bufs, nb_pkts);
}

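/*
 * Transmit hash helpers used by the balance and 802.3ad Tx policies: fold
 * L2 (MAC), L3 (IPv4/IPv6) and L4 (TCP/UDP port) fields into a slave index.
 */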
static inline uint16_t
ether_hash(struct ether_hdr *eth_hdr)
{
        unaligned_uint16_t *word_src_addr =
                (unaligned_uint16_t *)eth_hdr->s_addr.addr_bytes;
        unaligned_uint16_t *word_dst_addr =
                (unaligned_uint16_t *)eth_hdr->d_addr.addr_bytes;

        return (word_src_addr[0] ^ word_dst_addr[0]) ^
                        (word_src_addr[1] ^ word_dst_addr[1]) ^
                        (word_src_addr[2] ^ word_dst_addr[2]);
}

static inline uint32_t
ipv4_hash(struct ipv4_hdr *ipv4_hdr)
{
        return ipv4_hdr->src_addr ^ ipv4_hdr->dst_addr;
}

static inline uint32_t
ipv6_hash(struct ipv6_hdr *ipv6_hdr)
{
        unaligned_uint32_t *word_src_addr =
                (unaligned_uint32_t *)&(ipv6_hdr->src_addr[0]);
        unaligned_uint32_t *word_dst_addr =
                (unaligned_uint32_t *)&(ipv6_hdr->dst_addr[0]);

        return (word_src_addr[0] ^ word_dst_addr[0]) ^
                        (word_src_addr[1] ^ word_dst_addr[1]) ^
                        (word_src_addr[2] ^ word_dst_addr[2]) ^
                        (word_src_addr[3] ^ word_dst_addr[3]);
}

uint16_t
xmit_l2_hash(const struct rte_mbuf *buf, uint8_t slave_count)
{
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);

        uint32_t hash = ether_hash(eth_hdr);

        hash ^= hash >> 8;

        return hash % slave_count;
}

uint16_t
xmit_l23_hash(const struct rte_mbuf *buf, uint8_t slave_count)
{
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
        uint16_t proto = eth_hdr->ether_type;
        size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);
        uint32_t hash, l3hash = 0;

        hash = ether_hash(eth_hdr);

        if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
                struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                l3hash = ipv4_hash(ipv4_hdr);

        } else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
                struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                l3hash = ipv6_hash(ipv6_hdr);
        }

        hash = hash ^ l3hash;
        hash ^= hash >> 16;
        hash ^= hash >> 8;

        return hash % slave_count;
}

uint16_t
xmit_l34_hash(const struct rte_mbuf *buf, uint8_t slave_count)
{
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
        uint16_t proto = eth_hdr->ether_type;
        size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);

        struct udp_hdr *udp_hdr = NULL;
        struct tcp_hdr *tcp_hdr = NULL;
        uint32_t hash, l3hash = 0, l4hash = 0;

        if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
                struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                size_t ip_hdr_offset;

                l3hash = ipv4_hash(ipv4_hdr);

                /* there is no L4 header in a fragmented packet */
                if (likely(rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr) == 0)) {
                        ip_hdr_offset = (ipv4_hdr->version_ihl & IPV4_HDR_IHL_MASK) *
                                        IPV4_IHL_MULTIPLIER;

                        if (ipv4_hdr->next_proto_id == IPPROTO_TCP) {
                                tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr +
                                                ip_hdr_offset);
                                l4hash = HASH_L4_PORTS(tcp_hdr);
                        } else if (ipv4_hdr->next_proto_id == IPPROTO_UDP) {
                                udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr +
                                                ip_hdr_offset);
                                l4hash = HASH_L4_PORTS(udp_hdr);
                        }
                }
        } else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
                struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                l3hash = ipv6_hash(ipv6_hdr);

                if (ipv6_hdr->proto == IPPROTO_TCP) {
                        tcp_hdr = (struct tcp_hdr *)(ipv6_hdr + 1);
                        l4hash = HASH_L4_PORTS(tcp_hdr);
                } else if (ipv6_hdr->proto == IPPROTO_UDP) {
                        udp_hdr = (struct udp_hdr *)(ipv6_hdr + 1);
                        l4hash = HASH_L4_PORTS(udp_hdr);
                }
        }

        hash = l3hash ^ l4hash;
        hash ^= hash >> 16;
        hash ^= hash >> 8;

        return hash % slave_count;
}

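/*
 * Bandwidth bookkeeping for mode 5 (TLB): slaves are periodically reordered
 * by the share of link bandwidth they have left, so new traffic is steered
 * to the least loaded slave first.
 */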
struct bwg_slave {
        uint64_t bwg_left_int;
        uint64_t bwg_left_remainder;
        uint8_t slave;
};

void
bond_tlb_activate_slave(struct bond_dev_private *internals)
{
        int i;

        for (i = 0; i < internals->active_slave_count; i++) {
                tlb_last_obytets[internals->active_slaves[i]] = 0;
        }
}

static int
bandwidth_cmp(const void *a, const void *b)
{
        const struct bwg_slave *bwg_a = a;
        const struct bwg_slave *bwg_b = b;
        int64_t diff = (int64_t)bwg_b->bwg_left_int - (int64_t)bwg_a->bwg_left_int;
        int64_t diff2 = (int64_t)bwg_b->bwg_left_remainder -
                        (int64_t)bwg_a->bwg_left_remainder;
        if (diff > 0)
                return 1;
        else if (diff < 0)
                return -1;
        else if (diff2 > 0)
                return 1;
        else if (diff2 < 0)
                return -1;
        else
                return 0;
}

static void
bandwidth_left(uint8_t port_id, uint64_t load, uint8_t update_idx,
                struct bwg_slave *bwg_slave)
{
        struct rte_eth_link link_status;

        rte_eth_link_get_nowait(port_id, &link_status);
        uint64_t link_bwg = link_status.link_speed * 1000000ULL / 8;
        if (link_bwg == 0)
                return;
        link_bwg = link_bwg * (update_idx+1) * REORDER_PERIOD_MS;
        bwg_slave->bwg_left_int = (link_bwg - 1000*load) / link_bwg;
        bwg_slave->bwg_left_remainder = (link_bwg - 1000*load) % link_bwg;
}

static void
bond_ethdev_update_tlb_slave_cb(void *arg)
{
        struct bond_dev_private *internals = arg;
        struct rte_eth_stats slave_stats;
        struct bwg_slave bwg_array[RTE_MAX_ETHPORTS];
        uint8_t slave_count;
        uint64_t tx_bytes;

        uint8_t update_stats = 0;
        uint8_t i, slave_id;

        internals->slave_update_idx++;

        if (internals->slave_update_idx >= REORDER_PERIOD_MS)
                update_stats = 1;

        for (i = 0; i < internals->active_slave_count; i++) {
                slave_id = internals->active_slaves[i];
                rte_eth_stats_get(slave_id, &slave_stats);
                tx_bytes = slave_stats.obytes - tlb_last_obytets[slave_id];
                bandwidth_left(slave_id, tx_bytes,
                                internals->slave_update_idx, &bwg_array[i]);
                bwg_array[i].slave = slave_id;

                if (update_stats) {
                        tlb_last_obytets[slave_id] = slave_stats.obytes;
                }
        }

        if (update_stats == 1)
                internals->slave_update_idx = 0;

        slave_count = i;
        qsort(bwg_array, slave_count, sizeof(bwg_array[0]), bandwidth_cmp);
        for (i = 0; i < slave_count; i++)
                internals->tlb_slaves_order[i] = bwg_array[i].slave;

        rte_eal_alarm_set(REORDER_PERIOD_MS * 1000, bond_ethdev_update_tlb_slave_cb,
                        (struct bond_dev_private *)internals);
}

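/*
 * TLB Tx: walk the slaves in bandwidth order and, for packets still carrying
 * the primary slave's source MAC, rewrite the source address to that of the
 * slave actually transmitting them.
 */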
static uint16_t
bond_ethdev_tx_burst_tlb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
        struct bond_dev_private *internals = bd_tx_q->dev_private;

        struct rte_eth_dev *primary_port =
                        &rte_eth_devices[internals->primary_port];
        uint16_t num_tx_total = 0;
        uint8_t i, j;

        uint8_t num_of_slaves = internals->active_slave_count;
        uint8_t slaves[RTE_MAX_ETHPORTS];

        struct ether_hdr *ether_hdr;
        struct ether_addr primary_slave_addr;
        struct ether_addr active_slave_addr;

        if (num_of_slaves < 1)
                return num_tx_total;

        memcpy(slaves, internals->tlb_slaves_order,
                                sizeof(internals->tlb_slaves_order[0]) * num_of_slaves);

        ether_addr_copy(primary_port->data->mac_addrs, &primary_slave_addr);

        if (nb_pkts > 3) {
                for (i = 0; i < 3; i++)
                        rte_prefetch0(rte_pktmbuf_mtod(bufs[i], void*));
        }

        for (i = 0; i < num_of_slaves; i++) {
                rte_eth_macaddr_get(slaves[i], &active_slave_addr);
                for (j = num_tx_total; j < nb_pkts; j++) {
                        if (j + 3 < nb_pkts)
                                rte_prefetch0(rte_pktmbuf_mtod(bufs[j+3], void*));

                        ether_hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
                        if (is_same_ether_addr(&ether_hdr->s_addr, &primary_slave_addr))
                                ether_addr_copy(&active_slave_addr, &ether_hdr->s_addr);
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                        mode6_debug("TX IPv4:", ether_hdr, slaves[i], &burstnumberTX);
#endif
                }

                num_tx_total += rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                bufs + num_tx_total, nb_pkts - num_tx_total);

                if (num_tx_total == nb_pkts)
                        break;
        }

        return num_tx_total;
}

void
bond_tlb_disable(struct bond_dev_private *internals)
{
        rte_eal_alarm_cancel(bond_ethdev_update_tlb_slave_cb, internals);
}

void
bond_tlb_enable(struct bond_dev_private *internals)
{
        bond_ethdev_update_tlb_slave_cb(internals);
}

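/*
 * ALB (mode 6) Tx: ARP packets are assigned to slaves through the client
 * table so that peers learn per-slave MAC addresses, generated ARP updates
 * are sent out of band, and all other traffic falls back to the TLB policy.
 */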
static uint16_t
bond_ethdev_tx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
        struct bond_dev_private *internals = bd_tx_q->dev_private;

        struct ether_hdr *eth_h;
        uint16_t ether_type, offset;

        struct client_data *client_info;

        /*
         * We create transmit buffers for every slave and one additional to send
         * through tlb. In the worst case every packet will be sent on one port.
         */
        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS + 1][nb_pkts];
        uint16_t slave_bufs_pkts[RTE_MAX_ETHPORTS + 1] = { 0 };

        /*
         * We create separate transmit buffers for update packets as they won't
         * be counted in num_tx_total.
         */
        struct rte_mbuf *update_bufs[RTE_MAX_ETHPORTS][ALB_HASH_TABLE_SIZE];
        uint16_t update_bufs_pkts[RTE_MAX_ETHPORTS] = { 0 };

        struct rte_mbuf *upd_pkt;
        size_t pkt_size;

        uint16_t num_send, num_not_send = 0;
        uint16_t num_tx_total = 0;
        uint8_t slave_idx;

        int i, j;

        /* Search tx buffer for ARP packets and forward them to alb */
        for (i = 0; i < nb_pkts; i++) {
                eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
                ether_type = eth_h->ether_type;
                offset = get_vlan_offset(eth_h, &ether_type);

                if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
                        slave_idx = bond_mode_alb_arp_xmit(eth_h, offset, internals);

                        /* Change src mac in eth header */
                        rte_eth_macaddr_get(slave_idx, &eth_h->s_addr);

                        /* Add packet to slave tx buffer */
                        slave_bufs[slave_idx][slave_bufs_pkts[slave_idx]] = bufs[i];
                        slave_bufs_pkts[slave_idx]++;
                } else {
                        /* If packet is not ARP, send it with TLB policy */
                        slave_bufs[RTE_MAX_ETHPORTS][slave_bufs_pkts[RTE_MAX_ETHPORTS]] =
                                        bufs[i];
                        slave_bufs_pkts[RTE_MAX_ETHPORTS]++;
                }
        }

        /* Update connected client ARP tables */
        if (internals->mode6.ntt) {
                for (i = 0; i < ALB_HASH_TABLE_SIZE; i++) {
                        client_info = &internals->mode6.client_table[i];

                        if (client_info->in_use) {
                                /* Allocate new packet to send ARP update on current slave */
                                upd_pkt = rte_pktmbuf_alloc(internals->mode6.mempool);
                                if (upd_pkt == NULL) {
                                        RTE_LOG(ERR, PMD, "Failed to allocate ARP packet from pool\n");
                                        continue;
                                }
                                pkt_size = sizeof(struct ether_hdr) + sizeof(struct arp_hdr)
                                                + client_info->vlan_count * sizeof(struct vlan_hdr);
                                upd_pkt->data_len = pkt_size;
                                upd_pkt->pkt_len = pkt_size;

                                slave_idx = bond_mode_alb_arp_upd(client_info, upd_pkt,
                                                internals);

                                /* Add packet to update tx buffer */
                                update_bufs[slave_idx][update_bufs_pkts[slave_idx]] = upd_pkt;
                                update_bufs_pkts[slave_idx]++;
                        }
                }
                internals->mode6.ntt = 0;
        }

        /* Send ARP packets on proper slaves */
        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (slave_bufs_pkts[i] > 0) {
                        num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id,
                                        slave_bufs[i], slave_bufs_pkts[i]);
                        /* If tx burst fails, move the unsent packets to the
                         * end of bufs; the copy must read from the tail of
                         * this slave's buffer, not from bufs itself */
                        for (j = 0; j < slave_bufs_pkts[i] - num_send; j++) {
                                bufs[nb_pkts - 1 - num_not_send - j] =
                                                slave_bufs[i][slave_bufs_pkts[i] - 1 - j];
                        }

                        num_tx_total += num_send;
                        num_not_send += slave_bufs_pkts[i] - num_send;

#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
        /* Print TX stats including update packets */
                        for (j = 0; j < slave_bufs_pkts[i]; j++) {
                                eth_h = rte_pktmbuf_mtod(slave_bufs[i][j], struct ether_hdr *);
                                mode6_debug("TX ARP:", eth_h, i, &burstnumberTX);
                        }
#endif
                }
        }

        /* Send update packets on proper slaves */
        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (update_bufs_pkts[i] > 0) {
                        num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id, update_bufs[i],
                                        update_bufs_pkts[i]);
                        for (j = num_send; j < update_bufs_pkts[i]; j++) {
                                rte_pktmbuf_free(update_bufs[i][j]);
                        }
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                        for (j = 0; j < update_bufs_pkts[i]; j++) {
                                eth_h = rte_pktmbuf_mtod(update_bufs[i][j], struct ether_hdr *);
                                mode6_debug("TX ARPupd:", eth_h, i, &burstnumberTX);
                        }
#endif
                }
        }

        /* Send non-ARP packets using tlb policy */
        if (slave_bufs_pkts[RTE_MAX_ETHPORTS] > 0) {
                num_send = bond_ethdev_tx_burst_tlb(queue,
                                slave_bufs[RTE_MAX_ETHPORTS],
                                slave_bufs_pkts[RTE_MAX_ETHPORTS]);

                /* Move the packets TLB could not send to the end of bufs */
                for (j = 0; j < slave_bufs_pkts[RTE_MAX_ETHPORTS] - num_send; j++) {
                        bufs[nb_pkts - 1 - num_not_send - j] =
                                        slave_bufs[RTE_MAX_ETHPORTS][slave_bufs_pkts[RTE_MAX_ETHPORTS] - 1 - j];
                }

                num_tx_total += num_send;
        }

        return num_tx_total;
}

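/*
 * Balance (mode 2) Tx: the configured xmit_hash policy picks an output slave
 * per packet; unsent packets are moved to the tail of bufs[] so the caller
 * can retry or free them.
 */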
static uint16_t
bond_ethdev_tx_burst_balance(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        uint8_t num_of_slaves;
        uint8_t slaves[RTE_MAX_ETHPORTS];

        uint16_t num_tx_total = 0, num_tx_slave = 0, tx_fail_total = 0;

        int i, op_slave_id;

        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
        uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * num_of_slaves);

        if (num_of_slaves < 1)
                return num_tx_total;

        /* Populate each slave's buffer with the packets to be sent on it */
        for (i = 0; i < nb_pkts; i++) {
                /* Select output slave using hash based on xmit policy */
                op_slave_id = internals->xmit_hash(bufs[i], num_of_slaves);

                /* Populate slave mbuf arrays with mbufs for that slave */
                slave_bufs[op_slave_id][slave_nb_pkts[op_slave_id]++] = bufs[i];
        }

        /* Send packet burst on each slave device */
        for (i = 0; i < num_of_slaves; i++) {
                if (slave_nb_pkts[i] > 0) {
                        num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                        slave_bufs[i], slave_nb_pkts[i]);

                        /* if tx burst fails move packets to end of bufs */
                        if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
                                int slave_tx_fail_count = slave_nb_pkts[i] - num_tx_slave;

                                tx_fail_total += slave_tx_fail_count;
                                memcpy(&bufs[nb_pkts - tx_fail_total],
                                                &slave_bufs[i][num_tx_slave],
                                                slave_tx_fail_count * sizeof(bufs[0]));
                        }

                        num_tx_total += num_tx_slave;
                }
        }

        return num_tx_total;
}

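/*
 * Mode 4 (802.3ad) Tx: slow protocol frames queued by the LACP state machine
 * on each slave's tx_ring are sent first, then data packets are hashed only
 * across slaves currently in the DISTRIBUTING state.
 */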
static uint16_t
bond_ethdev_tx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        uint8_t num_of_slaves;
        uint8_t slaves[RTE_MAX_ETHPORTS];
        /* positions in slaves, not ID */
        uint8_t distributing_offsets[RTE_MAX_ETHPORTS];
        uint8_t distributing_count;

        uint16_t num_tx_slave, num_tx_total = 0, num_tx_fail_total = 0;
        uint16_t i, j, op_slave_idx;
        const uint16_t buffs_size = nb_pkts + BOND_MODE_8023AX_SLAVE_TX_PKTS + 1;

        /* Allocate additional slots for the slow protocol packets of 802.3ad mode. */
        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][buffs_size];
        void *slow_pkts[BOND_MODE_8023AX_SLAVE_TX_PKTS] = { NULL };

        /* Total amount of packets in slave_bufs */
        uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
        /* Slow packets placed in each slave */
        uint8_t slave_slow_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        if (num_of_slaves < 1)
                return num_tx_total;

        memcpy(slaves, internals->active_slaves, sizeof(slaves[0]) * num_of_slaves);

        distributing_count = 0;
        for (i = 0; i < num_of_slaves; i++) {
                struct port *port = &mode_8023ad_ports[slaves[i]];

                slave_slow_nb_pkts[i] = rte_ring_dequeue_burst(port->tx_ring,
                                slow_pkts, BOND_MODE_8023AX_SLAVE_TX_PKTS,
                                NULL);
                slave_nb_pkts[i] = slave_slow_nb_pkts[i];

                for (j = 0; j < slave_slow_nb_pkts[i]; j++)
                        slave_bufs[i][j] = slow_pkts[j];

                if (ACTOR_STATE(port, DISTRIBUTING))
                        distributing_offsets[distributing_count++] = i;
        }

        if (likely(distributing_count > 0)) {
                /* Populate each slave's buffer with the packets to be sent on it */
                for (i = 0; i < nb_pkts; i++) {
                        /* Select output slave using hash based on xmit policy */
                        op_slave_idx = internals->xmit_hash(bufs[i], distributing_count);

                        /* Populate slave mbuf arrays with mbufs for that slave. Use only
                         * slaves that are currently distributing. */
                        uint8_t slave_offset = distributing_offsets[op_slave_idx];
                        slave_bufs[slave_offset][slave_nb_pkts[slave_offset]] = bufs[i];
                        slave_nb_pkts[slave_offset]++;
                }
        }

        /* Send packet burst on each slave device */
        for (i = 0; i < num_of_slaves; i++) {
                if (slave_nb_pkts[i] == 0)
                        continue;

                num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                slave_bufs[i], slave_nb_pkts[i]);

                /* If tx burst fails drop slow packets */
                for ( ; num_tx_slave < slave_slow_nb_pkts[i]; num_tx_slave++)
                        rte_pktmbuf_free(slave_bufs[i][num_tx_slave]);

                num_tx_total += num_tx_slave - slave_slow_nb_pkts[i];
                num_tx_fail_total += slave_nb_pkts[i] - num_tx_slave;

                /* If tx burst fails move packets to end of bufs */
                if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
                        uint16_t j = nb_pkts - num_tx_fail_total;
                        for ( ; num_tx_slave < slave_nb_pkts[i]; j++, num_tx_slave++)
                                bufs[j] = slave_bufs[i][num_tx_slave];
                }
        }

        return num_tx_total;
}

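/*
 * Broadcast (mode 3) Tx: every packet is sent on every active slave, so the
 * mbuf reference count is raised by (number of slaves - 1) before
 * transmission and surplus references are freed on partial failure.
 */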
static uint16_t
bond_ethdev_tx_burst_broadcast(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        uint8_t tx_failed_flag = 0, num_of_slaves;
        uint8_t slaves[RTE_MAX_ETHPORTS];

        uint16_t max_nb_of_tx_pkts = 0;

        int slave_tx_total[RTE_MAX_ETHPORTS];
        int i, most_successful_tx_slave = -1;

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * num_of_slaves);

        if (num_of_slaves < 1)
                return 0;

        /* Increment reference count on mbufs */
        for (i = 0; i < nb_pkts; i++)
                rte_mbuf_refcnt_update(bufs[i], num_of_slaves - 1);

        /* Transmit burst on each active slave */
        for (i = 0; i < num_of_slaves; i++) {
                slave_tx_total[i] = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                        bufs, nb_pkts);

                if (unlikely(slave_tx_total[i] < nb_pkts))
                        tx_failed_flag = 1;

                /* record the value and slave index for the slave which transmits the
                 * maximum number of packets */
                if (slave_tx_total[i] > max_nb_of_tx_pkts) {
                        max_nb_of_tx_pkts = slave_tx_total[i];
                        most_successful_tx_slave = i;
                }
        }

        /* if slaves fail to transmit packets from burst, the calling application
         * is not expected to know about multiple references to packets so we must
         * handle failures of all packets except those of the most successful slave
         */
        if (unlikely(tx_failed_flag))
                for (i = 0; i < num_of_slaves; i++)
                        if (i != most_successful_tx_slave)
                                while (slave_tx_total[i] < nb_pkts)
                                        rte_pktmbuf_free(bufs[slave_tx_total[i]++]);

        return max_nb_of_tx_pkts;
}

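/*
 * Helpers to propagate and validate link properties (speed/duplex) between
 * the bonded device and its slaves, plus MAC address accessors.
 */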
void
link_properties_set(struct rte_eth_dev *bonded_eth_dev,
                struct rte_eth_link *slave_dev_link)
{
        struct rte_eth_link *bonded_dev_link = &bonded_eth_dev->data->dev_link;
        struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;

        if (slave_dev_link->link_status &&
                bonded_eth_dev->data->dev_started) {
                bonded_dev_link->link_duplex = slave_dev_link->link_duplex;
                bonded_dev_link->link_speed = slave_dev_link->link_speed;

                internals->link_props_set = 1;
        }
}

void
link_properties_reset(struct rte_eth_dev *bonded_eth_dev)
{
        struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;

        memset(&(bonded_eth_dev->data->dev_link), 0,
                        sizeof(bonded_eth_dev->data->dev_link));

        internals->link_props_set = 0;
}

int
link_properties_valid(struct rte_eth_link *bonded_dev_link,
                struct rte_eth_link *slave_dev_link)
{
        if (bonded_dev_link->link_duplex != slave_dev_link->link_duplex ||
                bonded_dev_link->link_speed != slave_dev_link->link_speed)
                return -1;

        return 0;
}

int
mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr)
{
        struct ether_addr *mac_addr;

        if (eth_dev == NULL) {
                RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified");
                return -1;
        }

        if (dst_mac_addr == NULL) {
                RTE_BOND_LOG(ERR, "NULL pointer MAC specified");
                return -1;
        }

        mac_addr = eth_dev->data->mac_addrs;

        ether_addr_copy(mac_addr, dst_mac_addr);
        return 0;
}

int
mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr)
{
        struct ether_addr *mac_addr;

        if (eth_dev == NULL) {
                RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified");
                return -1;
        }

        if (new_mac_addr == NULL) {
                RTE_BOND_LOG(ERR, "NULL pointer MAC specified");
                return -1;
        }

        mac_addr = eth_dev->data->mac_addrs;

        /* If the new MAC is different from the current MAC then update */
        if (memcmp(mac_addr, new_mac_addr, sizeof(*mac_addr)) != 0)
                memcpy(mac_addr, new_mac_addr, sizeof(*mac_addr));

        return 0;
}

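/*
 * Per-mode MAC address policy: modes that transmit on all slaves share the
 * bonded MAC across every slave, 802.3ad manages addresses itself, and the
 * primary-based modes keep the bonded MAC on the primary only, restoring
 * each other slave's persisted address.
 */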
int
mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev)
{
        struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
        int i;

        /* Update slave devices MAC addresses */
        if (internals->slave_count < 1)
                return -1;

        switch (internals->mode) {
        case BONDING_MODE_ROUND_ROBIN:
        case BONDING_MODE_BALANCE:
        case BONDING_MODE_BROADCAST:
                for (i = 0; i < internals->slave_count; i++) {
                        if (mac_address_set(&rte_eth_devices[internals->slaves[i].port_id],
                                        bonded_eth_dev->data->mac_addrs)) {
                                RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
                                                internals->slaves[i].port_id);
                                return -1;
                        }
                }
                break;
        case BONDING_MODE_8023AD:
                bond_mode_8023ad_mac_address_update(bonded_eth_dev);
                break;
        case BONDING_MODE_ACTIVE_BACKUP:
        case BONDING_MODE_TLB:
        case BONDING_MODE_ALB:
        default:
                for (i = 0; i < internals->slave_count; i++) {
                        if (internals->slaves[i].port_id ==
                                        internals->current_primary_port) {
                                /* Apply the bonded MAC to the current primary,
                                 * which may differ from the original primary */
                                if (mac_address_set(
                                                &rte_eth_devices[internals->current_primary_port],
                                                bonded_eth_dev->data->mac_addrs)) {
                                        RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
                                                        internals->current_primary_port);
                                        return -1;
                                }
                        } else {
                                if (mac_address_set(
                                                &rte_eth_devices[internals->slaves[i].port_id],
                                                &internals->slaves[i].persisted_mac_addr)) {
                                        RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
                                                        internals->slaves[i].port_id);
                                        return -1;
                                }
                        }
                }
        }

        return 0;
}

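/*
 * Select the Rx/Tx burst implementations for the requested bonding mode.
 *
 * A minimal usage sketch from the application side (illustrative only, not
 * part of this driver; assumes the slave ports slave0/slave1 are already
 * probed):
 *
 *	int port = rte_eth_bond_create("net_bond0", BONDING_MODE_BALANCE,
 *				       rte_socket_id());
 *	if (port >= 0) {
 *		rte_eth_bond_slave_add(port, slave0);
 *		rte_eth_bond_slave_add(port, slave1);
 *	}
 */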
int
bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode)
{
        struct bond_dev_private *internals;

        internals = eth_dev->data->dev_private;

        switch (mode) {
        case BONDING_MODE_ROUND_ROBIN:
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_round_robin;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
                break;
        case BONDING_MODE_ACTIVE_BACKUP:
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_active_backup;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
                break;
        case BONDING_MODE_BALANCE:
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_balance;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
                break;
        case BONDING_MODE_BROADCAST:
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_broadcast;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
                break;
        case BONDING_MODE_8023AD:
                if (bond_mode_8023ad_enable(eth_dev) != 0)
                        return -1;

                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_8023ad;
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_8023ad;
                RTE_LOG(WARNING, PMD,
                                "Mode 4 requires the application to invoke the TX and RX "
                                "burst functions at least every 100ms.\n");
                break;
        case BONDING_MODE_TLB:
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_tlb;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
                break;
        case BONDING_MODE_ALB:
                if (bond_mode_alb_enable(eth_dev) != 0)
                        return -1;

                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_alb;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_alb;
                break;
        default:
                return -1;
        }

        internals->mode = mode;

        return 0;
}

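/*
 * Configure a slave to mirror the bonded device's configuration: stop it,
 * inherit RSS and VLAN filter settings, then recreate its Rx/Tx queues and
 * restart it.
 */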
1331 int
1332 slave_configure(struct rte_eth_dev *bonded_eth_dev,
1333                 struct rte_eth_dev *slave_eth_dev)
1334 {
1335         struct bond_rx_queue *bd_rx_q;
1336         struct bond_tx_queue *bd_tx_q;
1337
1338         int errval;
1339         uint16_t q_id;
1340
1341         /* Stop slave */
1342         rte_eth_dev_stop(slave_eth_dev->data->port_id);
1343
1344         /* Enable interrupts on slave device if supported */
1345         if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1346                 slave_eth_dev->data->dev_conf.intr_conf.lsc = 1;
1347
1348         /* If RSS is enabled for bonding, try to enable it for slaves  */
1349         if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) {
1350                 if (bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len
1351                                 != 0) {
1352                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len =
1353                                         bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len;
1354                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key =
1355                                         bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1356                 } else {
1357                         slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = NULL;
1358                 }
1359
1360                 slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf =
1361                                 bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1362                 slave_eth_dev->data->dev_conf.rxmode.mq_mode =
1363                                 bonded_eth_dev->data->dev_conf.rxmode.mq_mode;
1364         }
1365
1366         slave_eth_dev->data->dev_conf.rxmode.hw_vlan_filter =
1367                         bonded_eth_dev->data->dev_conf.rxmode.hw_vlan_filter;
1368
1369         /* Configure device */
1370         errval = rte_eth_dev_configure(slave_eth_dev->data->port_id,
1371                         bonded_eth_dev->data->nb_rx_queues,
1372                         bonded_eth_dev->data->nb_tx_queues,
1373                         &(slave_eth_dev->data->dev_conf));
1374         if (errval != 0) {
1375                 RTE_BOND_LOG(ERR, "Cannot configure slave device: port %u, err (%d)",
1376                                 slave_eth_dev->data->port_id, errval);
1377                 return errval;
1378         }
1379
1380         /* Setup Rx Queues */
1381         for (q_id = 0; q_id < bonded_eth_dev->data->nb_rx_queues; q_id++) {
1382                 bd_rx_q = (struct bond_rx_queue *)bonded_eth_dev->data->rx_queues[q_id];
1383
1384                 errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id, q_id,
1385                                 bd_rx_q->nb_rx_desc,
1386                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1387                                 &(bd_rx_q->rx_conf), bd_rx_q->mb_pool);
1388                 if (errval != 0) {
1389                         RTE_BOND_LOG(ERR,
1390                                         "rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)",
1391                                         slave_eth_dev->data->port_id, q_id, errval);
1392                         return errval;
1393                 }
1394         }
1395
1396         /* Setup Tx Queues */
1397         for (q_id = 0; q_id < bonded_eth_dev->data->nb_tx_queues; q_id++) {
1398                 bd_tx_q = (struct bond_tx_queue *)bonded_eth_dev->data->tx_queues[q_id];
1399
1400                 errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id, q_id,
1401                                 bd_tx_q->nb_tx_desc,
1402                                 rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
1403                                 &bd_tx_q->tx_conf);
1404                 if (errval != 0) {
1405                         RTE_BOND_LOG(ERR,
1406                                         "rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)",
1407                                         slave_eth_dev->data->port_id, q_id, errval);
1408                         return errval;
1409                 }
1410         }
1411
1412         /* Start device */
1413         errval = rte_eth_dev_start(slave_eth_dev->data->port_id);
1414         if (errval != 0) {
1415                 RTE_BOND_LOG(ERR, "rte_eth_dev_start: port=%u, err (%d)",
1416                                 slave_eth_dev->data->port_id, errval);
1417                 return -1;
1418         }
1419
1420         /* If RSS is enabled for bonding, synchronize RETA */
1421         if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
1422                 int i;
1423                 struct bond_dev_private *internals;
1424
1425                 internals = bonded_eth_dev->data->dev_private;
1426
1427                 for (i = 0; i < internals->slave_count; i++) {
1428                         if (internals->slaves[i].port_id == slave_eth_dev->data->port_id) {
1429                                 errval = rte_eth_dev_rss_reta_update(
1430                                                 slave_eth_dev->data->port_id,
1431                                                 &internals->reta_conf[0],
1432                                                 internals->slaves[i].reta_size);
1433                                 if (errval != 0) {
1434                                         RTE_LOG(WARNING, PMD,
1435                                                         "rte_eth_dev_rss_reta_update on slave port %d failed (err %d)."
1436                                                         " RSS configuration for bonding may be inconsistent.\n",
1437                                                         slave_eth_dev->data->port_id, errval);
1438                                 }
1439                                 break;
1440                         }
1441                 }
1442         }
1443
1444         /* If lsc interrupt is set, check initial slave's link status */
1445         if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC) {
1446                 slave_eth_dev->dev_ops->link_update(slave_eth_dev, 0);
1447                 bond_ethdev_lsc_event_callback(slave_eth_dev->data->port_id,
1448                         RTE_ETH_EVENT_INTR_LSC, &bonded_eth_dev->data->port_id,
1449                         NULL);
1450         }
1451
1452         return 0;
1453 }
1454
1455 void
1456 slave_remove(struct bond_dev_private *internals,
1457                 struct rte_eth_dev *slave_eth_dev)
1458 {
1459         uint8_t i;
1460
1461         for (i = 0; i < internals->slave_count; i++)
1462                 if (internals->slaves[i].port_id ==
1463                                 slave_eth_dev->data->port_id)
1464                         break;
1465
1466         if (i < (internals->slave_count - 1))
1467                 memmove(&internals->slaves[i], &internals->slaves[i + 1],
1468                                 sizeof(internals->slaves[0]) *
1469                                 (internals->slave_count - i - 1));
1470
1471         internals->slave_count--;
1472
1473         /* force reconfiguration of slave interfaces */
1474         _rte_eth_dev_reset(slave_eth_dev);
1475 }
1476
1477 static void
1478 bond_ethdev_slave_link_status_change_monitor(void *cb_arg);
1479
1480 void
1481 slave_add(struct bond_dev_private *internals,
1482                 struct rte_eth_dev *slave_eth_dev)
1483 {
1484         struct bond_slave_details *slave_details =
1485                         &internals->slaves[internals->slave_count];
1486
1487         slave_details->port_id = slave_eth_dev->data->port_id;
1488         slave_details->last_link_status = 0;
1489
1490         /* Mark slave devices that don't support interrupts so we can
1491          * compensate when we start the bond
1492          */
1493         if (!(slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) {
1494                 slave_details->link_status_poll_enabled = 1;
1495         }
1496
1497         slave_details->link_status_wait_to_complete = 0;
1498         /* Persist the slave's current MAC address so it can be restored later */
1499         memcpy(&(slave_details->persisted_mac_addr), slave_eth_dev->data->mac_addrs,
1500                         sizeof(struct ether_addr));
1501 }
1502
1503 void
1504 bond_ethdev_primary_set(struct bond_dev_private *internals,
1505                 uint8_t slave_port_id)
1506 {
1507         int i;
1508
1509         if (internals->active_slave_count < 1)
1510                 internals->current_primary_port = slave_port_id;
1511         else
1512                 /* Search bonded device slave ports for new proposed primary port */
1513                 for (i = 0; i < internals->active_slave_count; i++) {
1514                         if (internals->active_slaves[i] == slave_port_id)
1515                                 internals->current_primary_port = slave_port_id;
1516                 }
1517 }
1518
1519 static void
1520 bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev);
1521
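/*
 * dev_start handler for the bonded device: derives the bond MAC address from
 * the primary slave (unless user-defined), reconfigures and starts every
 * slave, and arms link-status polling for slaves without LSC interrupt
 * support.
 */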
1522 static int
1523 bond_ethdev_start(struct rte_eth_dev *eth_dev)
1524 {
1525         struct bond_dev_private *internals;
1526         int i;
1527
1528         /* slave eth dev will be started by bonded device */
1529         if (check_for_bonded_ethdev(eth_dev)) {
1530                 RTE_BOND_LOG(ERR, "User tried to explicitly start a slave eth_dev (%d)",
1531                                 eth_dev->data->port_id);
1532                 return -1;
1533         }
1534
1535         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1536         eth_dev->data->dev_started = 1;
1537
1538         internals = eth_dev->data->dev_private;
1539
1540         if (internals->slave_count == 0) {
1541                 RTE_BOND_LOG(ERR, "Cannot start port since there are no slave devices");
1542                 return -1;
1543         }
1544
1545         if (internals->user_defined_mac == 0) {
1546                 struct ether_addr *new_mac_addr = NULL;
1547
1548                 for (i = 0; i < internals->slave_count; i++)
1549                         if (internals->slaves[i].port_id == internals->primary_port)
1550                                 new_mac_addr = &internals->slaves[i].persisted_mac_addr;
1551
1552                 if (new_mac_addr == NULL)
1553                         return -1;
1554
1555                 if (mac_address_set(eth_dev, new_mac_addr) != 0) {
1556                         RTE_BOND_LOG(ERR, "bonded port (%d) failed to update MAC address",
1557                                         eth_dev->data->port_id);
1558                         return -1;
1559                 }
1560         }
1561
1562         /* Update all slave devices' MACs */
1563         if (mac_address_slaves_update(eth_dev) != 0)
1564                 return -1;
1565
1566         /* If bonded device is configured in promiscuous mode then re-apply config */
1567         if (internals->promiscuous_en)
1568                 bond_ethdev_promiscuous_enable(eth_dev);
1569
1570         /* Reconfigure each slave device if starting bonded device */
1571         for (i = 0; i < internals->slave_count; i++) {
1572                 if (slave_configure(eth_dev,
1573                                 &(rte_eth_devices[internals->slaves[i].port_id])) != 0) {
1574                         RTE_BOND_LOG(ERR,
1575                                         "bonded port (%d) failed to reconfigure slave device (%d)",
1576                                         eth_dev->data->port_id, internals->slaves[i].port_id);
1577                         return -1;
1578                 }
1579                 /* We will need to poll for link status if any slave doesn't
1580                  * support interrupts
1581                  */
1582                 if (internals->slaves[i].link_status_poll_enabled)
1583                         internals->link_status_polling_enabled = 1;
1584         }
1585         /* start polling if needed */
1586         if (internals->link_status_polling_enabled) {
1587                 rte_eal_alarm_set(
1588                         internals->link_status_polling_interval_ms * 1000,
1589                         bond_ethdev_slave_link_status_change_monitor,
1590                         (void *)&rte_eth_devices[internals->port_id]);
1591         }
1592
1593         if (internals->user_defined_primary_port)
1594                 bond_ethdev_primary_set(internals, internals->primary_port);
1595
1596         if (internals->mode == BONDING_MODE_8023AD)
1597                 bond_mode_8023ad_start(eth_dev);
1598
1599         if (internals->mode == BONDING_MODE_TLB ||
1600                         internals->mode == BONDING_MODE_ALB)
1601                 bond_tlb_enable(internals);
1602
1603         return 0;
1604 }
1605
1606 static void
1607 bond_ethdev_free_queues(struct rte_eth_dev *dev)
1608 {
1609         uint8_t i;
1610
1611         if (dev->data->rx_queues != NULL) {
1612                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1613                         rte_free(dev->data->rx_queues[i]);
1614                         dev->data->rx_queues[i] = NULL;
1615                 }
1616                 dev->data->nb_rx_queues = 0;
1617         }
1618
1619         if (dev->data->tx_queues != NULL) {
1620                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
1621                         rte_free(dev->data->tx_queues[i]);
1622                         dev->data->tx_queues[i] = NULL;
1623                 }
1624                 dev->data->nb_tx_queues = 0;
1625         }
1626 }
1627
1628 void
1629 bond_ethdev_stop(struct rte_eth_dev *eth_dev)
1630 {
1631         struct bond_dev_private *internals = eth_dev->data->dev_private;
1632         uint8_t i;
1633
1634         if (internals->mode == BONDING_MODE_8023AD) {
1635                 struct port *port;
1636                 void *pkt = NULL;
1637
1638                 bond_mode_8023ad_stop(eth_dev);
1639
1640                 /* Discard all messages to/from mode 4 state machines */
1641                 for (i = 0; i < internals->active_slave_count; i++) {
1642                         port = &mode_8023ad_ports[internals->active_slaves[i]];
1643
1644                         RTE_ASSERT(port->rx_ring != NULL);
1645                         while (rte_ring_dequeue(port->rx_ring, &pkt) != -ENOENT)
1646                                 rte_pktmbuf_free(pkt);
1647
1648                         RTE_ASSERT(port->tx_ring != NULL);
1649                         while (rte_ring_dequeue(port->tx_ring, &pkt) != -ENOENT)
1650                                 rte_pktmbuf_free(pkt);
1651                 }
1652         }
1653
1654         if (internals->mode == BONDING_MODE_TLB ||
1655                         internals->mode == BONDING_MODE_ALB) {
1656                 bond_tlb_disable(internals);
1657                 for (i = 0; i < internals->active_slave_count; i++)
1658                         tlb_last_obytets[internals->active_slaves[i]] = 0;
1659         }
1660
1661         internals->active_slave_count = 0;
1662         internals->link_status_polling_enabled = 0;
1663         for (i = 0; i < internals->slave_count; i++)
1664                 internals->slaves[i].last_link_status = 0;
1665
1666         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1667         eth_dev->data->dev_started = 0;
1668 }
1669
1670 void
1671 bond_ethdev_close(struct rte_eth_dev *dev)
1672 {
1673         struct bond_dev_private *internals = dev->data->dev_private;
1674         uint8_t bond_port_id = internals->port_id;
1675         int skipped = 0;
1676
1677         RTE_LOG(INFO, EAL, "Closing bonded device %s\n", dev->device->name);
1678         while (internals->slave_count != skipped) {
1679                 uint8_t port_id = internals->slaves[skipped].port_id;
1680
1681                 rte_eth_dev_stop(port_id);
1682
1683                 if (rte_eth_bond_slave_remove(bond_port_id, port_id) != 0) {
1684                         RTE_LOG(ERR, EAL,
1685                                 "Failed to remove port %d from bonded device "
1686                                 "%s\n", port_id, dev->device->name);
1687                         skipped++;
1688                 }
1689         }
1690         bond_ethdev_free_queues(dev);
1691         rte_bitmap_reset(internals->vlan_filter_bmp);
1692 }
1693
1694 /* forward declaration */
1695 static int bond_ethdev_configure(struct rte_eth_dev *dev);
1696
1697 static void
1698 bond_ethdev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
1699 {
1700         struct bond_dev_private *internals = dev->data->dev_private;
1701         uint16_t max_nb_rx_queues = UINT16_MAX;
1702         uint16_t max_nb_tx_queues = UINT16_MAX;
1703
1704         dev_info->max_mac_addrs = 1;
1705
1706         dev_info->max_rx_pktlen = internals->candidate_max_rx_pktlen
1707                                   ? internals->candidate_max_rx_pktlen
1708                                   : ETHER_MAX_JUMBO_FRAME_LEN;
1709
1710         if (internals->slave_count > 0) {
1711                 /* Max number of tx/rx queues that the bonded device can
1712                  * support is the minimum values of the bonded slaves, as
1713                  * all slaves must be capable of supporting the same number
1714                  * of tx/rx queues.
1715                  */
1716                 struct rte_eth_dev_info slave_info;
1717                 uint8_t idx;
1718
1719                 for (idx = 0; idx < internals->slave_count; idx++) {
1720                         rte_eth_dev_info_get(internals->slaves[idx].port_id,
1721                                         &slave_info);
1722
1723                         if (slave_info.max_rx_queues < max_nb_rx_queues)
1724                                 max_nb_rx_queues = slave_info.max_rx_queues;
1725
1726                         if (slave_info.max_tx_queues < max_nb_tx_queues)
1727                                 max_nb_tx_queues = slave_info.max_tx_queues;
1728                 }
1729         }
1730
1731         dev_info->max_rx_queues = max_nb_rx_queues;
1732         dev_info->max_tx_queues = max_nb_tx_queues;
1733
1734         dev_info->min_rx_bufsize = 0;
1735
1736         dev_info->rx_offload_capa = internals->rx_offload_capa;
1737         dev_info->tx_offload_capa = internals->tx_offload_capa;
1738         dev_info->flow_type_rss_offloads = internals->flow_type_rss_offloads;
1739
1740         dev_info->reta_size = internals->reta_size;
1741 }
1742
1743 static int
1744 bond_ethdev_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
1745 {
1746         int res;
1747         uint8_t i;
1748         struct bond_dev_private *internals = dev->data->dev_private;
1749
1750         /* don't do this while a slave is being added */
1751         rte_spinlock_lock(&internals->lock);
1752
1753         if (on)
1754                 rte_bitmap_set(internals->vlan_filter_bmp, vlan_id);
1755         else
1756                 rte_bitmap_clear(internals->vlan_filter_bmp, vlan_id);
1757
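        /* Propagate the filter to every current slave; the bitmap above keeps
         * the state so that slaves attached later can be brought in sync. */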
1758         for (i = 0; i < internals->slave_count; i++) {
1759                 uint8_t port_id = internals->slaves[i].port_id;
1760
1761                 res = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
1762                 if (res == -ENOTSUP)
1763                         RTE_LOG(WARNING, PMD,
1764                                 "Setting VLAN filter on slave port %u not supported.\n",
1765                                 port_id);
1766         }
1767
1768         rte_spinlock_unlock(&internals->lock);
1769         return 0;
1770 }
1771
1772 static int
1773 bond_ethdev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1774                 uint16_t nb_rx_desc, unsigned int socket_id __rte_unused,
1775                 const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mb_pool)
1776 {
1777         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)
1778                         rte_zmalloc_socket(NULL, sizeof(struct bond_rx_queue),
1779                                         0, dev->data->numa_node);
1780         if (bd_rx_q == NULL)
1781                 return -1;
1782
1783         bd_rx_q->queue_id = rx_queue_id;
1784         bd_rx_q->dev_private = dev->data->dev_private;
1785
1786         bd_rx_q->nb_rx_desc = nb_rx_desc;
1787
1788         memcpy(&(bd_rx_q->rx_conf), rx_conf, sizeof(struct rte_eth_rxconf));
1789         bd_rx_q->mb_pool = mb_pool;
1790
1791         dev->data->rx_queues[rx_queue_id] = bd_rx_q;
1792
1793         return 0;
1794 }
1795
1796 static int
1797 bond_ethdev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1798                 uint16_t nb_tx_desc, unsigned int socket_id __rte_unused,
1799                 const struct rte_eth_txconf *tx_conf)
1800 {
1801         struct bond_tx_queue *bd_tx_q  = (struct bond_tx_queue *)
1802                         rte_zmalloc_socket(NULL, sizeof(struct bond_tx_queue),
1803                                         0, dev->data->numa_node);
1804
1805         if (bd_tx_q == NULL)
1806                 return -1;
1807
1808         bd_tx_q->queue_id = tx_queue_id;
1809         bd_tx_q->dev_private = dev->data->dev_private;
1810
1811         bd_tx_q->nb_tx_desc = nb_tx_desc;
1812         memcpy(&(bd_tx_q->tx_conf), tx_conf, sizeof(bd_tx_q->tx_conf));
1813
1814         dev->data->tx_queues[tx_queue_id] = bd_tx_q;
1815
1816         return 0;
1817 }
1818
1819 static void
1820 bond_ethdev_rx_queue_release(void *queue)
1821 {
1822         if (queue == NULL)
1823                 return;
1824
1825         rte_free(queue);
1826 }
1827
1828 static void
1829 bond_ethdev_tx_queue_release(void *queue)
1830 {
1831         if (queue == NULL)
1832                 return;
1833
1834         rte_free(queue);
1835 }
1836
1837 static void
1838 bond_ethdev_slave_link_status_change_monitor(void *cb_arg)
1839 {
1840         struct rte_eth_dev *bonded_ethdev, *slave_ethdev;
1841         struct bond_dev_private *internals;
1842
1843         /* Default value for polling slave found is true as we don't want to
1844          * disable the polling thread if we cannot get the lock */
1845         int i, polling_slave_found = 1;
1846
1847         if (cb_arg == NULL)
1848                 return;
1849
1850         bonded_ethdev = (struct rte_eth_dev *)cb_arg;
1851         internals = (struct bond_dev_private *)bonded_ethdev->data->dev_private;
1852
1853         if (!bonded_ethdev->data->dev_started ||
1854                 !internals->link_status_polling_enabled)
1855                 return;
1856
1857         /* If device is currently being configured then don't check slaves'
1858          * link status; wait until next period */
1859         if (rte_spinlock_trylock(&internals->lock)) {
1860                 if (internals->slave_count > 0)
1861                         polling_slave_found = 0;
1862
1863                 for (i = 0; i < internals->slave_count; i++) {
1864                         if (!internals->slaves[i].link_status_poll_enabled)
1865                                 continue;
1866
1867                         slave_ethdev = &rte_eth_devices[internals->slaves[i].port_id];
1868                         polling_slave_found = 1;
1869
1870                         /* Update slave link status */
1871                         (*slave_ethdev->dev_ops->link_update)(slave_ethdev,
1872                                         internals->slaves[i].link_status_wait_to_complete);
1873
1874                         /* if link status has changed since last checked then call lsc
1875                          * event callback */
1876                         if (slave_ethdev->data->dev_link.link_status !=
1877                                         internals->slaves[i].last_link_status) {
1878                                 internals->slaves[i].last_link_status =
1879                                                 slave_ethdev->data->dev_link.link_status;
1880
1881                                 bond_ethdev_lsc_event_callback(internals->slaves[i].port_id,
1882                                                 RTE_ETH_EVENT_INTR_LSC,
1883                                                 &bonded_ethdev->data->port_id,
1884                                                 NULL);
1885                         }
1886                 }
1887                 rte_spinlock_unlock(&internals->lock);
1888         }
1889
1890         if (polling_slave_found)
1891                 /* Set alarm to continue monitoring link status of slave ethdev's */
1892                 rte_eal_alarm_set(internals->link_status_polling_interval_ms * 1000,
1893                                 bond_ethdev_slave_link_status_change_monitor, cb_arg);
1894 }
1895
1896 static int
1897 bond_ethdev_link_update(struct rte_eth_dev *bonded_eth_dev,
1898                 int wait_to_complete)
1899 {
1900         struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1901
1902         if (!bonded_eth_dev->data->dev_started ||
1903                 internals->active_slave_count == 0) {
1904                 bonded_eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1905                 return 0;
1906         } else {
1907                 struct rte_eth_dev *slave_eth_dev;
1908                 int i, link_up = 0;
1909
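                /* The bonded link is reported up as soon as any active slave
                 * reports link up. */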
1910                 for (i = 0; i < internals->active_slave_count; i++) {
1911                         slave_eth_dev = &rte_eth_devices[internals->active_slaves[i]];
1912
1913                         (*slave_eth_dev->dev_ops->link_update)(slave_eth_dev,
1914                                         wait_to_complete);
1915                         if (slave_eth_dev->data->dev_link.link_status == ETH_LINK_UP) {
1916                                 link_up = 1;
1917                                 break;
1918                         }
1919                 }
1920
1921                 bonded_eth_dev->data->dev_link.link_status = link_up;
1922         }
1923
1924         return 0;
1925 }
1926
1927 static void
1928 bond_ethdev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1929 {
1930         struct bond_dev_private *internals = dev->data->dev_private;
1931         struct rte_eth_stats slave_stats;
1932         int i, j;
1933
1934         for (i = 0; i < internals->slave_count; i++) {
1935                 rte_eth_stats_get(internals->slaves[i].port_id, &slave_stats);
1936
1937                 stats->ipackets += slave_stats.ipackets;
1938                 stats->opackets += slave_stats.opackets;
1939                 stats->ibytes += slave_stats.ibytes;
1940                 stats->obytes += slave_stats.obytes;
1941                 stats->imissed += slave_stats.imissed;
1942                 stats->ierrors += slave_stats.ierrors;
1943                 stats->oerrors += slave_stats.oerrors;
1944                 stats->rx_nombuf += slave_stats.rx_nombuf;
1945
1946                 for (j = 0; j < RTE_ETHDEV_QUEUE_STAT_CNTRS; j++) {
1947                         stats->q_ipackets[j] += slave_stats.q_ipackets[j];
1948                         stats->q_opackets[j] += slave_stats.q_opackets[j];
1949                         stats->q_ibytes[j] += slave_stats.q_ibytes[j];
1950                         stats->q_obytes[j] += slave_stats.q_obytes[j];
1951                         stats->q_errors[j] += slave_stats.q_errors[j];
1952                 }
1953
1954         }
1955 }
1956
1957 static void
1958 bond_ethdev_stats_reset(struct rte_eth_dev *dev)
1959 {
1960         struct bond_dev_private *internals = dev->data->dev_private;
1961         int i;
1962
1963         for (i = 0; i < internals->slave_count; i++)
1964                 rte_eth_stats_reset(internals->slaves[i].port_id);
1965 }
1966
1967 static void
1968 bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev)
1969 {
1970         struct bond_dev_private *internals = eth_dev->data->dev_private;
1971         int i;
1972
1973         internals->promiscuous_en = 1;
1974
1975         switch (internals->mode) {
1976         /* Promiscuous mode is propagated to all slaves */
1977         case BONDING_MODE_ROUND_ROBIN:
1978         case BONDING_MODE_BALANCE:
1979         case BONDING_MODE_BROADCAST:
1980                 for (i = 0; i < internals->slave_count; i++)
1981                         rte_eth_promiscuous_enable(internals->slaves[i].port_id);
1982                 break;
1983         /* In mode 4, promiscuous mode is managed when a slave is added/removed */
1984         case BONDING_MODE_8023AD:
1985                 break;
1986         /* Promiscuous mode is propagated only to primary slave */
1987         case BONDING_MODE_ACTIVE_BACKUP:
1988         case BONDING_MODE_TLB:
1989         case BONDING_MODE_ALB:
1990         default:
1991                 rte_eth_promiscuous_enable(internals->current_primary_port);
1992         }
1993 }
1994
1995 static void
1996 bond_ethdev_promiscuous_disable(struct rte_eth_dev *dev)
1997 {
1998         struct bond_dev_private *internals = dev->data->dev_private;
1999         int i;
2000
2001         internals->promiscuous_en = 0;
2002
2003         switch (internals->mode) {
2004         /* Promiscuous mode is propagated to all slaves */
2005         case BONDING_MODE_ROUND_ROBIN:
2006         case BONDING_MODE_BALANCE:
2007         case BONDING_MODE_BROADCAST:
2008                 for (i = 0; i < internals->slave_count; i++)
2009                         rte_eth_promiscuous_disable(internals->slaves[i].port_id);
2010                 break;
2011         /* In mode 4, promiscuous mode is managed when a slave is added/removed */
2012         case BONDING_MODE_8023AD:
2013                 break;
2014         /* Promiscuous mode is propagated only to primary slave */
2015         case BONDING_MODE_ACTIVE_BACKUP:
2016         case BONDING_MODE_TLB:
2017         case BONDING_MODE_ALB:
2018         default:
2019                 rte_eth_promiscuous_disable(internals->current_primary_port);
2020         }
2021 }
2022
2023 static void
2024 bond_ethdev_delayed_lsc_propagation(void *arg)
2025 {
2026         if (arg == NULL)
2027                 return;
2028
2029         _rte_eth_dev_callback_process((struct rte_eth_dev *)arg,
2030                         RTE_ETH_EVENT_INTR_LSC, NULL, NULL);
2031 }
2032
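/*
 * LSC callback registered on each slave port. Activates/deactivates the
 * slave within the bond and, when the bonded link itself changes state,
 * propagates the event to the application, either immediately or after
 * link_up_delay_ms / link_down_delay_ms when delays are configured.
 */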
2033 int
2034 bond_ethdev_lsc_event_callback(uint8_t port_id, enum rte_eth_event_type type,
2035                 void *param, void *ret_param __rte_unused)
2036 {
2037         struct rte_eth_dev *bonded_eth_dev, *slave_eth_dev;
2038         struct bond_dev_private *internals;
2039         struct rte_eth_link link;
2040         int rc = -1;
2041
2042         int i, valid_slave = 0;
2043         uint8_t active_pos;
2044         uint8_t lsc_flag = 0;
2045
2046         if (type != RTE_ETH_EVENT_INTR_LSC || param == NULL)
2047                 return rc;
2048
2049         bonded_eth_dev = &rte_eth_devices[*(uint8_t *)param];
2050         slave_eth_dev = &rte_eth_devices[port_id];
2051
2052         if (check_for_bonded_ethdev(bonded_eth_dev))
2053                 return rc;
2054
2055         internals = bonded_eth_dev->data->dev_private;
2056
2057         /* If the device isn't started don't handle interrupts */
2058         if (!bonded_eth_dev->data->dev_started)
2059                 return rc;
2060
2061         /* verify that port_id is a valid slave of bonded port */
2062         for (i = 0; i < internals->slave_count; i++) {
2063                 if (internals->slaves[i].port_id == port_id) {
2064                         valid_slave = 1;
2065                         break;
2066                 }
2067         }
2068
2069         if (!valid_slave)
2070                 return rc;
2071
2072         /* Search for port in active port list */
2073         active_pos = find_slave_by_id(internals->active_slaves,
2074                         internals->active_slave_count, port_id);
2075
2076         rte_eth_link_get_nowait(port_id, &link);
2077         if (link.link_status) {
2078                 if (active_pos < internals->active_slave_count)
2079                         return rc;
2080
2081                 /* if no active slave ports then set this port to be primary port */
2082                 if (internals->active_slave_count < 1) {
2083                         /* If first active slave, then change link status */
2084                         bonded_eth_dev->data->dev_link.link_status = ETH_LINK_UP;
2085                         internals->current_primary_port = port_id;
2086                         lsc_flag = 1;
2087
2088                         mac_address_slaves_update(bonded_eth_dev);
2089
2090                         /* Inherit eth dev link properties from first active slave */
2091                         link_properties_set(bonded_eth_dev,
2092                                         &(slave_eth_dev->data->dev_link));
2093                 } else {
2094                         if (link_properties_valid(
2095                                 &bonded_eth_dev->data->dev_link, &link) != 0) {
2096                                 slave_eth_dev->data->dev_flags &=
2097                                         (~RTE_ETH_DEV_BONDED_SLAVE);
2098                                 RTE_LOG(ERR, PMD,
2099                                         "port %u invalid speed/duplex\n",
2100                                         port_id);
2101                                 return rc;
2102                         }
2103                 }
2104
2105                 activate_slave(bonded_eth_dev, port_id);
2106
2107                 /* If user has defined the primary port then default to using it */
2108                 if (internals->user_defined_primary_port &&
2109                                 internals->primary_port == port_id)
2110                         bond_ethdev_primary_set(internals, port_id);
2111         } else {
2112                 if (active_pos == internals->active_slave_count)
2113                         return rc;
2114
2115                 /* Remove from active slave list */
2116                 deactivate_slave(bonded_eth_dev, port_id);
2117
2118                 /* No active slaves, change link status to down and reset other
2119                  * link properties */
2120                 if (internals->active_slave_count < 1) {
2121                         lsc_flag = 1;
2122                         bonded_eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
2123
2124                         link_properties_reset(bonded_eth_dev);
2125                 }
2126
2127                 /* Update primary id: take the first active slave from the list,
2128                  * or fall back to the configured primary port if none are active */
2129                 if (port_id == internals->current_primary_port) {
2130                         if (internals->active_slave_count > 0)
2131                                 bond_ethdev_primary_set(internals,
2132                                                 internals->active_slaves[0]);
2133                         else
2134                                 internals->current_primary_port = internals->primary_port;
2135                 }
2136         }
2137
2138         if (lsc_flag) {
2139                 /* Cancel any possible outstanding interrupts if delays are enabled */
2140                 if (internals->link_up_delay_ms > 0 ||
2141                         internals->link_down_delay_ms > 0)
2142                         rte_eal_alarm_cancel(bond_ethdev_delayed_lsc_propagation,
2143                                         bonded_eth_dev);
2144
2145                 if (bonded_eth_dev->data->dev_link.link_status) {
2146                         if (internals->link_up_delay_ms > 0)
2147                                 rte_eal_alarm_set(internals->link_up_delay_ms * 1000,
2148                                                 bond_ethdev_delayed_lsc_propagation,
2149                                                 (void *)bonded_eth_dev);
2150                         else
2151                                 _rte_eth_dev_callback_process(bonded_eth_dev,
2152                                                 RTE_ETH_EVENT_INTR_LSC,
2153                                                 NULL, NULL);
2154
2155                 } else {
2156                         if (internals->link_down_delay_ms > 0)
2157                                 rte_eal_alarm_set(internals->link_down_delay_ms * 1000,
2158                                                 bond_ethdev_delayed_lsc_propagation,
2159                                                 (void *)bonded_eth_dev);
2160                         else
2161                                 _rte_eth_dev_callback_process(bonded_eth_dev,
2162                                                 RTE_ETH_EVENT_INTR_LSC,
2163                                                 NULL, NULL);
2164                 }
2165         }
2166         return 0;
2167 }
2168
2169 static int
2170 bond_ethdev_rss_reta_update(struct rte_eth_dev *dev,
2171                 struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2172 {
2173         unsigned i, j;
2174         int result = 0;
2175         int slave_reta_size;
2176         unsigned reta_count;
2177         struct bond_dev_private *internals = dev->data->dev_private;
2178
2179         if (reta_size != internals->reta_size)
2180                 return -EINVAL;
2181
2182          /* Copy RETA table */
2183         reta_count = reta_size / RTE_RETA_GROUP_SIZE;
2184
2185         for (i = 0; i < reta_count; i++) {
2186                 internals->reta_conf[i].mask = reta_conf[i].mask;
2187                 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2188                         if ((reta_conf[i].mask >> j) & 0x01)
2189                                 internals->reta_conf[i].reta[j] = reta_conf[i].reta[j];
2190         }
2191
2192         /* Fill the rest of the array by replicating the updated groups */
2193         for (; i < RTE_DIM(internals->reta_conf); i += reta_count)
2194                 memcpy(&internals->reta_conf[i], &internals->reta_conf[0],
2195                                 sizeof(internals->reta_conf[0]) * reta_count);
2196
2197         /* Propagate RETA over slaves */
2198         for (i = 0; i < internals->slave_count; i++) {
2199                 slave_reta_size = internals->slaves[i].reta_size;
2200                 result = rte_eth_dev_rss_reta_update(internals->slaves[i].port_id,
2201                                 &internals->reta_conf[0], slave_reta_size);
2202                 if (result < 0)
2203                         return result;
2204         }
2205
2206         return 0;
2207 }
2208
2209 static int
2210 bond_ethdev_rss_reta_query(struct rte_eth_dev *dev,
2211                 struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2212 {
2213         int i, j;
2214         struct bond_dev_private *internals = dev->data->dev_private;
2215
2216         if (reta_size != internals->reta_size)
2217                 return -EINVAL;
2218
2219          /* Copy RETA table */
2220         for (i = 0; i < reta_size / RTE_RETA_GROUP_SIZE; i++)
2221                 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2222                         if ((reta_conf[i].mask >> j) & 0x01)
2223                                 reta_conf[i].reta[j] = internals->reta_conf[i].reta[j];
2224
2225         return 0;
2226 }
2227
2228 static int
2229 bond_ethdev_rss_hash_update(struct rte_eth_dev *dev,
2230                 struct rte_eth_rss_conf *rss_conf)
2231 {
2232         int i, result = 0;
2233         struct bond_dev_private *internals = dev->data->dev_private;
2234         struct rte_eth_rss_conf bond_rss_conf;
2235
2236         memcpy(&bond_rss_conf, rss_conf, sizeof(struct rte_eth_rss_conf));
2237
2238         bond_rss_conf.rss_hf &= internals->flow_type_rss_offloads;
2239
2240         if (bond_rss_conf.rss_hf != 0)
2241                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf = bond_rss_conf.rss_hf;
2242
2243         if (bond_rss_conf.rss_key && bond_rss_conf.rss_key_len <
2244                         sizeof(internals->rss_key)) {
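                /* Treat a zero key length as the full 40-byte key (cf.
                 * default_rss_key in bond_ethdev_configure() below). */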
2245                 if (bond_rss_conf.rss_key_len == 0)
2246                         bond_rss_conf.rss_key_len = 40;
2247                 internals->rss_key_len = bond_rss_conf.rss_key_len;
2248                 memcpy(internals->rss_key, bond_rss_conf.rss_key,
2249                                 internals->rss_key_len);
2250         }
2251
2252         for (i = 0; i < internals->slave_count; i++) {
2253                 result = rte_eth_dev_rss_hash_update(internals->slaves[i].port_id,
2254                                 &bond_rss_conf);
2255                 if (result < 0)
2256                         return result;
2257         }
2258
2259         return 0;
2260 }
2261
2262 static int
2263 bond_ethdev_rss_hash_conf_get(struct rte_eth_dev *dev,
2264                 struct rte_eth_rss_conf *rss_conf)
2265 {
2266         struct bond_dev_private *internals = dev->data->dev_private;
2267
2268         rss_conf->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
2269         rss_conf->rss_key_len = internals->rss_key_len;
2270         if (rss_conf->rss_key)
2271                 memcpy(rss_conf->rss_key, internals->rss_key, internals->rss_key_len);
2272
2273         return 0;
2274 }
2275
2276 const struct eth_dev_ops default_dev_ops = {
2277         .dev_start            = bond_ethdev_start,
2278         .dev_stop             = bond_ethdev_stop,
2279         .dev_close            = bond_ethdev_close,
2280         .dev_configure        = bond_ethdev_configure,
2281         .dev_infos_get        = bond_ethdev_info,
2282         .vlan_filter_set      = bond_ethdev_vlan_filter_set,
2283         .rx_queue_setup       = bond_ethdev_rx_queue_setup,
2284         .tx_queue_setup       = bond_ethdev_tx_queue_setup,
2285         .rx_queue_release     = bond_ethdev_rx_queue_release,
2286         .tx_queue_release     = bond_ethdev_tx_queue_release,
2287         .link_update          = bond_ethdev_link_update,
2288         .stats_get            = bond_ethdev_stats_get,
2289         .stats_reset          = bond_ethdev_stats_reset,
2290         .promiscuous_enable   = bond_ethdev_promiscuous_enable,
2291         .promiscuous_disable  = bond_ethdev_promiscuous_disable,
2292         .reta_update          = bond_ethdev_rss_reta_update,
2293         .reta_query           = bond_ethdev_rss_reta_query,
2294         .rss_hash_update      = bond_ethdev_rss_hash_update,
2295         .rss_hash_conf_get    = bond_ethdev_rss_hash_conf_get
2296 };
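/* Installed as the bonded ethdev's ops table in bond_alloc() below. */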
2297
2298 static int
2299 bond_alloc(struct rte_vdev_device *dev, uint8_t mode)
2300 {
2301         const char *name = rte_vdev_device_name(dev);
2302         uint8_t socket_id = dev->device.numa_node;
2303         struct bond_dev_private *internals = NULL;
2304         struct rte_eth_dev *eth_dev = NULL;
2305         uint32_t vlan_filter_bmp_size;
2306
2307         /* now do all data allocation - for the eth_dev structure
2308          * and internal (private) data
2309          */
2310
2311         /* reserve an ethdev entry */
2312         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internals));
2313         if (eth_dev == NULL) {
2314                 RTE_BOND_LOG(ERR, "Unable to allocate rte_eth_dev");
2315                 goto err;
2316         }
2317
2318         internals = eth_dev->data->dev_private;
2319         eth_dev->data->nb_rx_queues = (uint16_t)1;
2320         eth_dev->data->nb_tx_queues = (uint16_t)1;
2321
2322         eth_dev->data->mac_addrs = rte_zmalloc_socket(name, ETHER_ADDR_LEN, 0,
2323                         socket_id);
2324         if (eth_dev->data->mac_addrs == NULL) {
2325                 RTE_BOND_LOG(ERR, "Unable to malloc mac_addrs");
2326                 goto err;
2327         }
2328
2329         eth_dev->dev_ops = &default_dev_ops;
2330         eth_dev->data->dev_flags = RTE_ETH_DEV_INTR_LSC |
2331                 RTE_ETH_DEV_DETACHABLE;
2332
2333         rte_spinlock_init(&internals->lock);
2334
2335         internals->port_id = eth_dev->data->port_id;
2336         internals->mode = BONDING_MODE_INVALID;
2337         internals->current_primary_port = RTE_MAX_ETHPORTS + 1;
2338         internals->balance_xmit_policy = BALANCE_XMIT_POLICY_LAYER2;
2339         internals->xmit_hash = xmit_l2_hash;
2340         internals->user_defined_mac = 0;
2341         internals->link_props_set = 0;
2342
2343         internals->link_status_polling_enabled = 0;
2344
2345         internals->link_status_polling_interval_ms =
2346                 DEFAULT_POLLING_INTERVAL_10_MS;
2347         internals->link_down_delay_ms = 0;
2348         internals->link_up_delay_ms = 0;
2349
2350         internals->slave_count = 0;
2351         internals->active_slave_count = 0;
2352         internals->rx_offload_capa = 0;
2353         internals->tx_offload_capa = 0;
2354         internals->candidate_max_rx_pktlen = 0;
2355         internals->max_rx_pktlen = 0;
2356
2357         /* Initially allow any RSS offload type to be chosen */
2358         internals->flow_type_rss_offloads = ETH_RSS_PROTO_MASK;
2359
2360         memset(internals->active_slaves, 0, sizeof(internals->active_slaves));
2361         memset(internals->slaves, 0, sizeof(internals->slaves));
2362
2363         /* Set mode 4 default configuration */
2364         bond_mode_8023ad_setup(eth_dev, NULL);
2365         if (bond_ethdev_mode_set(eth_dev, mode)) {
2366                 RTE_BOND_LOG(ERR, "Failed to set bonded device %d mode to %d",
2367                                  eth_dev->data->port_id, mode);
2368                 goto err;
2369         }
2370
2371         vlan_filter_bmp_size =
2372                 rte_bitmap_get_memory_footprint(ETHER_MAX_VLAN_ID + 1);
2373         internals->vlan_filter_bmpmem = rte_malloc(name, vlan_filter_bmp_size,
2374                                                    RTE_CACHE_LINE_SIZE);
2375         if (internals->vlan_filter_bmpmem == NULL) {
2376                 RTE_BOND_LOG(ERR,
2377                              "Failed to allocate vlan bitmap for bonded device %u\n",
2378                              eth_dev->data->port_id);
2379                 goto err;
2380         }
2381
2382         internals->vlan_filter_bmp = rte_bitmap_init(ETHER_MAX_VLAN_ID + 1,
2383                         internals->vlan_filter_bmpmem, vlan_filter_bmp_size);
2384         if (internals->vlan_filter_bmp == NULL) {
2385                 RTE_BOND_LOG(ERR,
2386                              "Failed to init vlan bitmap for bonded device %u\n",
2387                              eth_dev->data->port_id);
2388                 rte_free(internals->vlan_filter_bmpmem);
2389                 goto err;
2390         }
2391
2392         return eth_dev->data->port_id;
2393
2394 err:
2395         rte_free(internals);
2396         if (eth_dev != NULL) {
2397                 rte_free(eth_dev->data->mac_addrs);
2398                 rte_eth_dev_release_port(eth_dev);
2399         }
2400         return -1;
2401 }
2402
2403 static int
2404 bond_probe(struct rte_vdev_device *dev)
2405 {
2406         const char *name;
2407         struct bond_dev_private *internals;
2408         struct rte_kvargs *kvlist;
2409         uint8_t bonding_mode, socket_id;
2410         int  arg_count, port_id;
2411
2412         if (!dev)
2413                 return -EINVAL;
2414
2415         name = rte_vdev_device_name(dev);
2416         RTE_LOG(INFO, EAL, "Initializing pmd_bond for %s\n", name);
2417
2418         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev),
2419                 pmd_bond_init_valid_arguments);
2420         if (kvlist == NULL)
2421                 return -1;
2422
2423         /* Parse link bonding mode */
2424         if (rte_kvargs_count(kvlist, PMD_BOND_MODE_KVARG) == 1) {
2425                 if (rte_kvargs_process(kvlist, PMD_BOND_MODE_KVARG,
2426                                 &bond_ethdev_parse_slave_mode_kvarg,
2427                                 &bonding_mode) != 0) {
2428                         RTE_LOG(ERR, EAL, "Invalid mode for bonded device %s\n",
2429                                         name);
2430                         goto parse_error;
2431                 }
2432         } else {
2433                 RTE_LOG(ERR, EAL, "Mode must be specified exactly once for bonded "
2434                                 "device %s\n", name);
2435                 goto parse_error;
2436         }
2437
2438         /* Parse socket id to create bonding device on */
2439         arg_count = rte_kvargs_count(kvlist, PMD_BOND_SOCKET_ID_KVARG);
2440         if (arg_count == 1) {
2441                 if (rte_kvargs_process(kvlist, PMD_BOND_SOCKET_ID_KVARG,
2442                                 &bond_ethdev_parse_socket_id_kvarg, &socket_id)
2443                                 != 0) {
2444                         RTE_LOG(ERR, EAL, "Invalid socket Id specified for "
2445                                         "bonded device %s\n", name);
2446                         goto parse_error;
2447                 }
2448         } else if (arg_count > 1) {
2449                 RTE_LOG(ERR, EAL, "Socket Id can be specified only once for "
2450                                 "bonded device %s\n", name);
2451                 goto parse_error;
2452         } else {
2453                 socket_id = rte_socket_id();
2454         }
2455
2456         dev->device.numa_node = socket_id;
2457
2458         /* Create link bonding eth device */
2459         port_id = bond_alloc(dev, bonding_mode);
2460         if (port_id < 0) {
2461                 RTE_LOG(ERR, EAL, "Failed to create bonded device %s in mode %u on "
2462                                 "socket %u.\n", name, bonding_mode, socket_id);
2463                 goto parse_error;
2464         }
2465         internals = rte_eth_devices[port_id].data->dev_private;
2466         internals->kvlist = kvlist;
2467
2468         RTE_LOG(INFO, EAL, "Created bonded device %s on port %d in mode %u on "
2469                         "socket %u.\n", name, port_id, bonding_mode, socket_id);
2470         return 0;
2471
2472 parse_error:
2473         rte_kvargs_free(kvlist);
2474
2475         return -1;
2476 }
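/*
 * Usage sketch (EAL command line; device name and PCI addresses are
 * illustrative): bond_probe() consumes the mode/socket_id kvargs, while the
 * remaining arguments are parsed later in bond_ethdev_configure():
 *
 *   --vdev 'net_bonding0,mode=2,slave=0000:00:08.0,slave=0000:00:09.0'
 */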
2477
2478 static int
2479 bond_remove(struct rte_vdev_device *dev)
2480 {
2481         struct rte_eth_dev *eth_dev;
2482         struct bond_dev_private *internals;
2483         const char *name;
2484
2485         if (!dev)
2486                 return -EINVAL;
2487
2488         name = rte_vdev_device_name(dev);
2489         RTE_LOG(INFO, EAL, "Uninitializing pmd_bond for %s\n", name);
2490
2491         /* now free all data allocations - for the eth_dev structure
2492          * and internal (private) data
2493          */
2494
2495         /* find an ethdev entry */
2496         eth_dev = rte_eth_dev_allocated(name);
2497         if (eth_dev == NULL)
2498                 return -ENODEV;
2499
2500         RTE_ASSERT(eth_dev->device == &dev->device);
2501
2502         internals = eth_dev->data->dev_private;
2503         if (internals->slave_count != 0)
2504                 return -EBUSY;
2505
2506         if (eth_dev->data->dev_started == 1) {
2507                 bond_ethdev_stop(eth_dev);
2508                 bond_ethdev_close(eth_dev);
2509         }
2510
2511         eth_dev->dev_ops = NULL;
2512         eth_dev->rx_pkt_burst = NULL;
2513         eth_dev->tx_pkt_burst = NULL;
2514
2515         internals = eth_dev->data->dev_private;
2516         rte_bitmap_free(internals->vlan_filter_bmp);
2517         rte_free(internals->vlan_filter_bmpmem);
2518         rte_free(eth_dev->data->dev_private);
2519         rte_free(eth_dev->data->mac_addrs);
2520
2521         rte_eth_dev_release_port(eth_dev);
2522
2523         return 0;
2524 }
2525
2526 /* This part resolves the slave port ids after all the other pdevs and vdevs
2527  * have been allocated */
2528 static int
2529 bond_ethdev_configure(struct rte_eth_dev *dev)
2530 {
2531         const char *name = dev->device->name;
2532         struct bond_dev_private *internals = dev->data->dev_private;
2533         struct rte_kvargs *kvlist = internals->kvlist;
2534         int arg_count;
2535         uint8_t port_id = dev - rte_eth_devices;
2536
2537         static const uint8_t default_rss_key[40] = {
2538                 0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2, 0x41, 0x67, 0x25, 0x3D,
2539                 0x43, 0xA3, 0x8F, 0xB0, 0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
2540                 0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C, 0x6A, 0x42, 0xB7, 0x3B,
2541                 0xBE, 0xAC, 0x01, 0xFA
2542         };
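        /* Copied verbatim into internals->rss_key when the application
         * enables RSS without supplying its own key (see below). */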
2543
2544         unsigned i, j;
2545
2546         /* If RSS is enabled, fill table and key with default values */
2547         if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
2548                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = internals->rss_key;
2549                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len = 0;
2550                 memcpy(internals->rss_key, default_rss_key, 40);
2551
2552                 for (i = 0; i < RTE_DIM(internals->reta_conf); i++) {
2553                         internals->reta_conf[i].mask = ~0LL;
2554                         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2555                                 internals->reta_conf[i].reta[j] = j % dev->data->nb_rx_queues;
2556                 }
2557         }
2558
2559         /* set the max_rx_pktlen */
2560         internals->max_rx_pktlen = internals->candidate_max_rx_pktlen;
2561
2562         /*
2563          * If no kvlist, it means that this bonded device has been created
2564          * through the bonding API.
2565          */
2566         if (!kvlist)
2567                 return 0;
2568
2569         /* Parse MAC address for bonded device */
2570         arg_count = rte_kvargs_count(kvlist, PMD_BOND_MAC_ADDR_KVARG);
2571         if (arg_count == 1) {
2572                 struct ether_addr bond_mac;
2573
2574                 if (rte_kvargs_process(kvlist, PMD_BOND_MAC_ADDR_KVARG,
2575                                 &bond_ethdev_parse_bond_mac_addr_kvarg, &bond_mac) < 0) {
2576                         RTE_LOG(INFO, EAL, "Invalid mac address for bonded device %s\n",
2577                                         name);
2578                         return -1;
2579                 }
2580
2581                 /* Set MAC address */
2582                 if (rte_eth_bond_mac_address_set(port_id, &bond_mac) != 0) {
2583                         RTE_LOG(ERR, EAL,
2584                                         "Failed to set mac address on bonded device %s\n",
2585                                         name);
2586                         return -1;
2587                 }
2588         } else if (arg_count > 1) {
2589                 RTE_LOG(ERR, EAL,
2590                                 "MAC address can be specified only once for bonded device %s\n",
2591                                 name);
2592                 return -1;
2593         }
2594
2595         /* Parse/set balance mode transmit policy */
2596         arg_count = rte_kvargs_count(kvlist, PMD_BOND_XMIT_POLICY_KVARG);
2597         if (arg_count == 1) {
2598                 uint8_t xmit_policy;
2599
2600                 if (rte_kvargs_process(kvlist, PMD_BOND_XMIT_POLICY_KVARG,
2601                                 &bond_ethdev_parse_balance_xmit_policy_kvarg, &xmit_policy) !=
2602                                                 0) {
2603                         RTE_LOG(INFO, EAL,
2604                                         "Invalid xmit policy specified for bonded device %s\n",
2605                                         name);
2606                         return -1;
2607                 }
2608
2609                 /* Set balance mode transmit policy */
2610                 if (rte_eth_bond_xmit_policy_set(port_id, xmit_policy) != 0) {
2611                         RTE_LOG(ERR, EAL,
2612                                         "Failed to set balance xmit policy on bonded device %s\n",
2613                                         name);
2614                         return -1;
2615                 }
2616         } else if (arg_count > 1) {
2617                 RTE_LOG(ERR, EAL,
2618                                 "Transmit policy can be specified only once for bonded device"
2619                                 " %s\n", name);
2620                 return -1;
2621         }
2622
2623         /* Parse/add slave ports to bonded device */
2624         if (rte_kvargs_count(kvlist, PMD_BOND_SLAVE_PORT_KVARG) > 0) {
2625                 struct bond_ethdev_slave_ports slave_ports;
2626                 unsigned i;
2627
2628                 memset(&slave_ports, 0, sizeof(slave_ports));
2629
2630                 if (rte_kvargs_process(kvlist, PMD_BOND_SLAVE_PORT_KVARG,
2631                                 &bond_ethdev_parse_slave_port_kvarg, &slave_ports) != 0) {
2632                         RTE_LOG(ERR, EAL,
2633                                         "Failed to parse slave ports for bonded device %s\n",
2634                                         name);
2635                         return -1;
2636                 }
2637
2638                 for (i = 0; i < slave_ports.slave_count; i++) {
2639                         if (rte_eth_bond_slave_add(port_id, slave_ports.slaves[i]) != 0) {
2640                                 RTE_LOG(ERR, EAL,
2641                                                 "Failed to add port %d as slave to bonded device %s\n",
2642                                                 slave_ports.slaves[i], name);
2643                         }
2644                 }
2645
2646         } else {
2647                 RTE_LOG(INFO, EAL, "No slaves specified for bonded device %s\n", name);
2648                 return -1;
2649         }
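
        /*
         * "slave=" may be given multiple times, adding one port per
         * occurrence; an illustrative devargs fragment (hypothetical PCI
         * addresses):
         *   slave=0000:02:00.0,slave=0000:02:00.1
         */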
2650
2651         /* Parse/set primary slave port id */
2652         arg_count = rte_kvargs_count(kvlist, PMD_BOND_PRIMARY_SLAVE_KVARG);
2653         if (arg_count == 1) {
2654                 uint8_t primary_slave_port_id;
2655
2656                 if (rte_kvargs_process(kvlist,
2657                                 PMD_BOND_PRIMARY_SLAVE_KVARG,
2658                                 &bond_ethdev_parse_primary_slave_port_id_kvarg,
2659                                 &primary_slave_port_id) < 0) {
2660                         RTE_LOG(INFO, EAL,
2661                                         "Invalid primary slave port id specified for bonded device"
2662                                         " %s\n", name);
2663                         return -1;
2664                 }
2665
2666                 /* Set primary slave port id */
2667                 if (rte_eth_bond_primary_set(port_id, primary_slave_port_id)
2668                                 != 0) {
2669                         RTE_LOG(ERR, EAL,
2670                                         "Failed to set primary slave port %d on bonded device %s\n",
2671                                         primary_slave_port_id, name);
2672                         return -1;
2673                 }
2674         } else if (arg_count > 1) {
2675                 RTE_LOG(ERR, EAL,
2676                                 "Primary slave can be specified only once for bonded device"
2677                                 " %s\n", name);
2678                 return -1;
2679         }
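
        /*
         * Note: the primary slave is only meaningful in modes that use an
         * active slave (e.g. active backup), and it would normally refer to
         * one of the ports already added via "slave=" above.
         */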
2680
2681         /* Parse link status monitor polling interval */
2682         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LSC_POLL_PERIOD_KVARG);
2683         if (arg_count == 1) {
2684                 uint32_t lsc_poll_interval_ms;
2685
2686                 if (rte_kvargs_process(kvlist,
2687                                 PMD_BOND_LSC_POLL_PERIOD_KVARG,
2688                                 &bond_ethdev_parse_time_ms_kvarg,
2689                                 &lsc_poll_interval_ms) < 0) {
2690                         RTE_LOG(INFO, EAL,
2691                                         "Invalid lsc polling interval value specified for bonded"
2692                                         " device %s\n", name);
2693                         return -1;
2694                 }
2695
2696                 if (rte_eth_bond_link_monitoring_set(port_id, lsc_poll_interval_ms)
2697                                 != 0) {
2698                         RTE_LOG(ERR, EAL,
2699                                         "Failed to set lsc monitor polling interval (%u ms) on"
2700                                         " bonded device %s\n", lsc_poll_interval_ms, name);
2701                         return -1;
2702                 }
2703         } else if (arg_count > 1) {
2704                 RTE_LOG(ERR, EAL,
2705                                 "LSC polling interval can be specified only once for bonded"
2706                                 " device %s\n", name);
2707                 return -1;
2708         }
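
        /*
         * The LSC polling interval is used to check the link status of slave
         * devices that cannot generate link status change interrupts.
         */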
2709
2710         /* Parse link up interrupt propagation delay */
2711         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_UP_PROP_DELAY_KVARG);
2712         if (arg_count == 1) {
2713                 uint32_t link_up_delay_ms;
2714
2715                 if (rte_kvargs_process(kvlist,
2716                                 PMD_BOND_LINK_UP_PROP_DELAY_KVARG,
2717                                 &bond_ethdev_parse_time_ms_kvarg,
2718                                 &link_up_delay_ms) < 0) {
2719                         RTE_LOG(INFO, EAL,
2720                                         "Invalid link up propagation delay value specified for"
2721                                         " bonded device %s\n", name);
2722                         return -1;
2723                 }
2724
2725                 /* Set link up propagation delay */
2726                 if (rte_eth_bond_link_up_prop_delay_set(port_id, link_up_delay_ms)
2727                                 != 0) {
2728                         RTE_LOG(ERR, EAL,
2729                                         "Failed to set link up propagation delay (%u ms) on bonded"
2730                                         " device %s\n", link_up_delay_ms, name);
2731                         return -1;
2732                 }
2733         } else if (arg_count > 1) {
2734                 RTE_LOG(ERR, EAL,
2735                                 "Link up propagation delay can be specified only once for"
2736                                 " bonded device %s\n", name);
2737                 return -1;
2738         }
2739
2740         /* Parse link down interrupt propagation delay */
2741         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG);
2742         if (arg_count == 1) {
2743                 uint32_t link_down_delay_ms;
2744
2745                 if (rte_kvargs_process(kvlist,
2746                                 PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG,
2747                                 &bond_ethdev_parse_time_ms_kvarg,
2748                                 &link_down_delay_ms) < 0) {
2749                         RTE_LOG(INFO, EAL,
2750                                         "Invalid link down propagation delay value specified for"
2751                                         " bonded device %s\n", name);
2752                         return -1;
2753                 }
2754
2755                 /* Set link down propagation delay */
2756                 if (rte_eth_bond_link_down_prop_delay_set(port_id, link_down_delay_ms)
2757                                 != 0) {
2758                         RTE_LOG(ERR, EAL,
2759                                         "Failed to set link down propagation delay (%u ms) on"
2760                                         " bonded device %s\n", link_down_delay_ms, name);
2761                         return -1;
2762                 }
2763         } else if (arg_count > 1) {
2764                 RTE_LOG(ERR, EAL,
2765                                 "Link down propagation delay can be specified only once for"
2766                                 " bonded device %s\n", name);
2767                 return -1;
2768         }
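
        /*
         * Both propagation delays defer reporting a slave link transition to
         * the bonded device, which can be used to damp flapping links; when
         * the kvargs are absent no extra delay is configured.
         */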
2769
2770         return 0;
2771 }
2772
2773 static struct rte_vdev_driver pmd_bond_drv = {
2774         .probe = bond_probe,
2775         .remove = bond_remove,
2776 };
2777
2778 RTE_PMD_REGISTER_VDEV(net_bonding, pmd_bond_drv);
2779 RTE_PMD_REGISTER_ALIAS(net_bonding, eth_bond);
2780
2781 RTE_PMD_REGISTER_PARAM_STRING(net_bonding,
2782         "slave=<ifc> "
2783         "primary=<ifc> "
2784         "mode=[0-6] "
2785         "xmit_policy=[l2 | l23 | l34] "
2786         "socket_id=<int> "
2787         "mac=<mac addr> "
2788         "lsc_poll_period_ms=<int> "
2789         "up_delay=<int> "
2790         "down_delay=<int>");
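
/*
 * Putting it together, a bonded device can be created entirely from the EAL
 * command line; the values below are illustrative only (shown wrapped here,
 * but passed as a single argument):
 *
 *   --vdev 'net_bonding0,mode=1,slave=0000:02:00.0,slave=0000:02:00.1,
 *           primary=0000:02:00.0,up_delay=10,down_delay=50'
 *
 * The "eth_bond" alias registered above is accepted as a legacy prefix in
 * place of "net_bonding".
 */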