net/bonding: call through EAL on create/free
drivers/net/bonding/rte_eth_bond_pmd.c
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stdlib.h>
#include <netinet/in.h>

#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ethdev.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_ip.h>
#include <rte_ip_frag.h>
#include <rte_devargs.h>
#include <rte_kvargs.h>
#include <rte_vdev.h>
#include <rte_alarm.h>
#include <rte_cycles.h>

#include "rte_eth_bond.h"
#include "rte_eth_bond_private.h"
#include "rte_eth_bond_8023ad_private.h"

#define REORDER_PERIOD_MS 10
#define DEFAULT_POLLING_INTERVAL_10_MS (10)

#define HASH_L4_PORTS(h) ((h)->src_port ^ (h)->dst_port)

/* Table for statistics in mode 5 TLB */
static uint64_t tlb_last_obytets[RTE_MAX_ETHPORTS];

static inline size_t
get_vlan_offset(struct ether_hdr *eth_hdr, uint16_t *proto)
{
        size_t vlan_offset = 0;

        if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
                struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);

                vlan_offset = sizeof(struct vlan_hdr);
                *proto = vlan_hdr->eth_proto;

                if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == *proto) {
                        vlan_hdr = vlan_hdr + 1;
                        *proto = vlan_hdr->eth_proto;
                        vlan_offset += sizeof(struct vlan_hdr);
                }
        }
        return vlan_offset;
}
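/*
 * Illustrative usage note (added commentary, not in the original sources):
 * callers in this file locate the L3 header behind optional VLAN/QinQ tags
 * roughly as follows, mirroring xmit_l23_hash() below:
 *
 *	uint16_t proto = eth_hdr->ether_type;
 *	size_t off = get_vlan_offset(eth_hdr, &proto);
 *
 *	if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
 *		struct ipv4_hdr *ip4 = (struct ipv4_hdr *)
 *				((char *)(eth_hdr + 1) + off);
 *		// ... hash or inspect the IPv4 header ...
 *	}
 */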

static uint16_t
bond_ethdev_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_dev_private *internals;

        uint16_t num_rx_slave = 0;
        uint16_t num_rx_total = 0;

        int i;

        /* Cast to structure containing the bonded device's port id and queue id */
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;

        internals = bd_rx_q->dev_private;

        for (i = 0; i < internals->active_slave_count && nb_pkts; i++) {
                /* Offset into *bufs increases as packets are received
                 * from other slaves */
                num_rx_slave = rte_eth_rx_burst(internals->active_slaves[i],
                                bd_rx_q->queue_id, bufs + num_rx_total, nb_pkts);
                if (num_rx_slave) {
                        num_rx_total += num_rx_slave;
                        nb_pkts -= num_rx_slave;
                }
        }

        return num_rx_total;
}

static uint16_t
bond_ethdev_rx_burst_active_backup(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;

        /* Cast to structure containing the bonded device's port id and queue id */
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;

        internals = bd_rx_q->dev_private;

        return rte_eth_rx_burst(internals->current_primary_port,
                        bd_rx_q->queue_id, bufs, nb_pkts);
}

static inline uint8_t
is_lacp_packets(uint16_t ethertype, uint8_t subtype, uint16_t vlan_tci)
{
        const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);

        return !vlan_tci && (ethertype == ether_type_slow_be &&
                (subtype == SLOW_SUBTYPE_MARKER || subtype == SLOW_SUBTYPE_LACP));
}
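/*
 * Note (added commentary): ETHER_TYPE_SLOW (0x8809) carries the IEEE 802.3
 * "slow protocols"; subtype 1 is LACP and subtype 2 is the marker protocol.
 * Untagged frames of either subtype must be consumed by the bonding state
 * machine rather than delivered to the application, which is what the check
 * above selects for.
 */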

static uint16_t
bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        /* Cast to structure containing the bonded device's port id and queue id */
        struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue;
        struct bond_dev_private *internals = bd_rx_q->dev_private;
        struct ether_addr bond_mac;

        struct ether_hdr *hdr;

        const uint16_t ether_type_slow_be = rte_be_to_cpu_16(ETHER_TYPE_SLOW);
        uint16_t num_rx_total = 0;      /* Total number of received packets */
        uint8_t slaves[RTE_MAX_ETHPORTS];
        uint8_t slave_count, idx;

        uint8_t collecting;  /* current slave collecting status */
        const uint8_t promisc = internals->promiscuous_en;
        uint8_t i, j, k;
        uint8_t subtype;

        rte_eth_macaddr_get(internals->port_id, &bond_mac);
        /* Copy slave list to protect against slave up/down changes during rx
         * bursting */
        slave_count = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * slave_count);

        idx = internals->active_slave;
        if (idx >= slave_count) {
                internals->active_slave = 0;
                idx = 0;
        }
        for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) {
                j = num_rx_total;
                collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[idx]],
                                         COLLECTING);

                /* Read packets from this slave */
                num_rx_total += rte_eth_rx_burst(slaves[idx], bd_rx_q->queue_id,
                                &bufs[num_rx_total], nb_pkts - num_rx_total);

                for (k = j; k < 2 && k < num_rx_total; k++)
                        rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *));

                /* Handle slow protocol packets. */
                while (j < num_rx_total) {
                        if (j + 3 < num_rx_total)
                                rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *));

                        hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
                        subtype = ((struct slow_protocol_frame *)hdr)->slow_protocol.subtype;

                        /* Remove the packet from the array if it is a slow packet,
                         * if the slave is not in collecting state, or if the bonding
                         * interface is not in promiscuous mode and the destination
                         * address does not match. */
                        if (unlikely(is_lacp_packets(hdr->ether_type, subtype, bufs[j]->vlan_tci) ||
                                !collecting || (!promisc &&
                                        !is_multicast_ether_addr(&hdr->d_addr) &&
                                        !is_same_ether_addr(&bond_mac, &hdr->d_addr)))) {

                                if (hdr->ether_type == ether_type_slow_be) {
                                        bond_mode_8023ad_handle_slow_pkt(
                                            internals, slaves[idx], bufs[j]);
                                } else
                                        rte_pktmbuf_free(bufs[j]);

                                /* Packet is managed by mode 4 or dropped, shift the array */
                                num_rx_total--;
                                if (j < num_rx_total) {
                                        memmove(&bufs[j], &bufs[j + 1], sizeof(bufs[0]) *
                                                (num_rx_total - j));
                                }
                        } else
                                j++;
                }
                if (unlikely(++idx == slave_count))
                        idx = 0;
        }

        internals->active_slave = idx;
        return num_rx_total;
}
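/*
 * Note (added commentary): internals->active_slave persists the polling
 * position across calls, so successive rx bursts start from different
 * slaves and no single slave monopolises the front of the receive array.
 */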

#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
uint32_t burstnumberRX;
uint32_t burstnumberTX;

#ifdef RTE_LIBRTE_BOND_DEBUG_ALB

static void
arp_op_name(uint16_t arp_op, char *buf)
{
        switch (arp_op) {
        case ARP_OP_REQUEST:
                snprintf(buf, sizeof("ARP Request"), "%s", "ARP Request");
                return;
        case ARP_OP_REPLY:
                snprintf(buf, sizeof("ARP Reply"), "%s", "ARP Reply");
                return;
        case ARP_OP_REVREQUEST:
                snprintf(buf, sizeof("Reverse ARP Request"), "%s",
                                "Reverse ARP Request");
                return;
        case ARP_OP_REVREPLY:
                snprintf(buf, sizeof("Reverse ARP Reply"), "%s",
                                "Reverse ARP Reply");
                return;
        case ARP_OP_INVREQUEST:
                snprintf(buf, sizeof("Peer Identify Request"), "%s",
                                "Peer Identify Request");
                return;
        case ARP_OP_INVREPLY:
                snprintf(buf, sizeof("Peer Identify Reply"), "%s",
                                "Peer Identify Reply");
                return;
        default:
                break;
        }
        snprintf(buf, sizeof("Unknown"), "%s", "Unknown");
        return;
}
#endif
#define MaxIPv4String   16
static void
ipv4_addr_to_dot(uint32_t be_ipv4_addr, char *buf, uint8_t buf_size)
{
        uint32_t ipv4_addr;

        ipv4_addr = rte_be_to_cpu_32(be_ipv4_addr);
        snprintf(buf, buf_size, "%d.%d.%d.%d", (ipv4_addr >> 24) & 0xFF,
                (ipv4_addr >> 16) & 0xFF, (ipv4_addr >> 8) & 0xFF,
                ipv4_addr & 0xFF);
}

#define MAX_CLIENTS_NUMBER      128
uint8_t active_clients;
struct client_stats_t {
        uint8_t port;
        uint32_t ipv4_addr;
        uint32_t ipv4_rx_packets;
        uint32_t ipv4_tx_packets;
};
struct client_stats_t client_stats[MAX_CLIENTS_NUMBER];

static void
update_client_stats(uint32_t addr, uint8_t port, uint32_t *TXorRXindicator)
{
        int i = 0;

        for (; i < MAX_CLIENTS_NUMBER; i++) {
                if ((client_stats[i].ipv4_addr == addr) && (client_stats[i].port == port)) {
                        /* Just update the RX or TX packet count for this client */
                        if (TXorRXindicator == &burstnumberRX)
                                client_stats[i].ipv4_rx_packets++;
                        else
                                client_stats[i].ipv4_tx_packets++;
                        return;
                }
        }
        /* We have a new client. Insert it into the table and update the stats */
        if (TXorRXindicator == &burstnumberRX)
                client_stats[active_clients].ipv4_rx_packets++;
        else
                client_stats[active_clients].ipv4_tx_packets++;
        client_stats[active_clients].ipv4_addr = addr;
        client_stats[active_clients].port = port;
        active_clients++;
}

#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
#define MODE6_DEBUG(info, src_ip, dst_ip, eth_h, arp_op, port, burstnumber)     \
                RTE_LOG(DEBUG, PMD, \
                "%s " \
                "port:%d " \
                "SrcMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
                "SrcIP:%s " \
                "DstMAC:%02X:%02X:%02X:%02X:%02X:%02X " \
                "DstIP:%s " \
                "%s " \
                "%d\n", \
                info, \
                port, \
                eth_h->s_addr.addr_bytes[0], \
                eth_h->s_addr.addr_bytes[1], \
                eth_h->s_addr.addr_bytes[2], \
                eth_h->s_addr.addr_bytes[3], \
                eth_h->s_addr.addr_bytes[4], \
                eth_h->s_addr.addr_bytes[5], \
                src_ip, \
                eth_h->d_addr.addr_bytes[0], \
                eth_h->d_addr.addr_bytes[1], \
                eth_h->d_addr.addr_bytes[2], \
                eth_h->d_addr.addr_bytes[3], \
                eth_h->d_addr.addr_bytes[4], \
                eth_h->d_addr.addr_bytes[5], \
                dst_ip, \
                arp_op, \
                ++burstnumber)
#endif

static void
mode6_debug(const char __attribute__((unused)) *info, struct ether_hdr *eth_h,
                uint8_t port, uint32_t __attribute__((unused)) *burstnumber)
{
        struct ipv4_hdr *ipv4_h;
#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
        struct arp_hdr *arp_h;
        char dst_ip[16];
        char ArpOp[24];
        char buf[16];
#endif
        char src_ip[16];

        uint16_t ether_type = eth_h->ether_type;
        uint16_t offset = get_vlan_offset(eth_h, &ether_type);

#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
        snprintf(buf, 16, "%s", info);
#endif

        if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
                ipv4_h = (struct ipv4_hdr *)((char *)(eth_h + 1) + offset);
                ipv4_addr_to_dot(ipv4_h->src_addr, src_ip, MaxIPv4String);
#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
                ipv4_addr_to_dot(ipv4_h->dst_addr, dst_ip, MaxIPv4String);
                MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, "", port, *burstnumber);
#endif
                update_client_stats(ipv4_h->src_addr, port, burstnumber);
        }
#ifdef RTE_LIBRTE_BOND_DEBUG_ALB
        else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
                arp_h = (struct arp_hdr *)((char *)(eth_h + 1) + offset);
                ipv4_addr_to_dot(arp_h->arp_data.arp_sip, src_ip, MaxIPv4String);
                ipv4_addr_to_dot(arp_h->arp_data.arp_tip, dst_ip, MaxIPv4String);
                arp_op_name(rte_be_to_cpu_16(arp_h->arp_op), ArpOp);
                MODE6_DEBUG(buf, src_ip, dst_ip, eth_h, ArpOp, port, *burstnumber);
        }
#endif
}
#endif

static uint16_t
bond_ethdev_rx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
        struct bond_dev_private *internals = bd_tx_q->dev_private;
        struct ether_hdr *eth_h;
        uint16_t ether_type, offset;
        uint16_t nb_recv_pkts;
        int i;

        nb_recv_pkts = bond_ethdev_rx_burst(queue, bufs, nb_pkts);

        for (i = 0; i < nb_recv_pkts; i++) {
                eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
                ether_type = eth_h->ether_type;
                offset = get_vlan_offset(eth_h, &ether_type);

                if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                        mode6_debug("RX ARP:", eth_h, bufs[i]->port, &burstnumberRX);
#endif
                        bond_mode_alb_arp_recv(eth_h, offset, internals);
                }
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                else if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_IPv4))
                        mode6_debug("RX IPv4:", eth_h, bufs[i]->port, &burstnumberRX);
#endif
        }

        return nb_recv_pkts;
}

static uint16_t
bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
        uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };

        uint8_t num_of_slaves;
        uint8_t slaves[RTE_MAX_ETHPORTS];

        uint16_t num_tx_total = 0, num_tx_slave;

        static int slave_idx = 0;
        int i, cslave_idx = 0, tx_fail_total = 0;

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * num_of_slaves);

        if (num_of_slaves < 1)
                return num_tx_total;

        /* Populate the per-slave mbuf arrays with the packets to be sent on each slave */
        for (i = 0; i < nb_pkts; i++) {
                cslave_idx = (slave_idx + i) % num_of_slaves;
                slave_bufs[cslave_idx][(slave_nb_pkts[cslave_idx])++] = bufs[i];
        }

        /* Increment current slave index so the next call to tx burst starts on the
         * next slave */
        slave_idx = ++cslave_idx;

        /* Send packet burst on each slave device */
        for (i = 0; i < num_of_slaves; i++) {
                if (slave_nb_pkts[i] > 0) {
                        num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                        slave_bufs[i], slave_nb_pkts[i]);

                        /* if tx burst fails move packets to end of bufs */
                        if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
                                int tx_fail_slave = slave_nb_pkts[i] - num_tx_slave;

                                tx_fail_total += tx_fail_slave;

                                memcpy(&bufs[nb_pkts - tx_fail_total],
                                                &slave_bufs[i][num_tx_slave],
                                                tx_fail_slave * sizeof(bufs[0]));
                        }
                        num_tx_total += num_tx_slave;
                }
        }

        return num_tx_total;
}

static uint16_t
bond_ethdev_tx_burst_active_backup(void *queue,
                struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        if (internals->active_slave_count < 1)
                return 0;

        return rte_eth_tx_burst(internals->current_primary_port, bd_tx_q->queue_id,
                        bufs, nb_pkts);
}

static inline uint16_t
ether_hash(struct ether_hdr *eth_hdr)
{
        unaligned_uint16_t *word_src_addr =
                (unaligned_uint16_t *)eth_hdr->s_addr.addr_bytes;
        unaligned_uint16_t *word_dst_addr =
                (unaligned_uint16_t *)eth_hdr->d_addr.addr_bytes;

        return (word_src_addr[0] ^ word_dst_addr[0]) ^
                        (word_src_addr[1] ^ word_dst_addr[1]) ^
                        (word_src_addr[2] ^ word_dst_addr[2]);
}

static inline uint32_t
ipv4_hash(struct ipv4_hdr *ipv4_hdr)
{
        return ipv4_hdr->src_addr ^ ipv4_hdr->dst_addr;
}

static inline uint32_t
ipv6_hash(struct ipv6_hdr *ipv6_hdr)
{
        unaligned_uint32_t *word_src_addr =
                (unaligned_uint32_t *)&(ipv6_hdr->src_addr[0]);
        unaligned_uint32_t *word_dst_addr =
                (unaligned_uint32_t *)&(ipv6_hdr->dst_addr[0]);

        return (word_src_addr[0] ^ word_dst_addr[0]) ^
                        (word_src_addr[1] ^ word_dst_addr[1]) ^
                        (word_src_addr[2] ^ word_dst_addr[2]) ^
                        (word_src_addr[3] ^ word_dst_addr[3]);
}

uint16_t
xmit_l2_hash(const struct rte_mbuf *buf, uint8_t slave_count)
{
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);

        uint32_t hash = ether_hash(eth_hdr);

        return (hash ^= hash >> 8) % slave_count;
}

uint16_t
xmit_l23_hash(const struct rte_mbuf *buf, uint8_t slave_count)
{
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
        uint16_t proto = eth_hdr->ether_type;
        size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);
        uint32_t hash, l3hash = 0;

        hash = ether_hash(eth_hdr);

        if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
                struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                l3hash = ipv4_hash(ipv4_hdr);

        } else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
                struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                l3hash = ipv6_hash(ipv6_hdr);
        }

        hash = hash ^ l3hash;
        hash ^= hash >> 16;
        hash ^= hash >> 8;

        return hash % slave_count;
}

uint16_t
xmit_l34_hash(const struct rte_mbuf *buf, uint8_t slave_count)
{
        struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(buf, struct ether_hdr *);
        uint16_t proto = eth_hdr->ether_type;
        size_t vlan_offset = get_vlan_offset(eth_hdr, &proto);

        struct udp_hdr *udp_hdr = NULL;
        struct tcp_hdr *tcp_hdr = NULL;
        uint32_t hash, l3hash = 0, l4hash = 0;

        if (rte_cpu_to_be_16(ETHER_TYPE_IPv4) == proto) {
                struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                size_t ip_hdr_offset;

                l3hash = ipv4_hash(ipv4_hdr);

                /* there is no L4 header in fragmented packet */
                if (likely(rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr) == 0)) {
                        ip_hdr_offset = (ipv4_hdr->version_ihl & IPV4_HDR_IHL_MASK) *
                                        IPV4_IHL_MULTIPLIER;

                        if (ipv4_hdr->next_proto_id == IPPROTO_TCP) {
                                tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr +
                                                ip_hdr_offset);
                                l4hash = HASH_L4_PORTS(tcp_hdr);
                        } else if (ipv4_hdr->next_proto_id == IPPROTO_UDP) {
                                udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr +
                                                ip_hdr_offset);
                                l4hash = HASH_L4_PORTS(udp_hdr);
                        }
                }
        } else if (rte_cpu_to_be_16(ETHER_TYPE_IPv6) == proto) {
                struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *)
                                ((char *)(eth_hdr + 1) + vlan_offset);
                l3hash = ipv6_hash(ipv6_hdr);

                if (ipv6_hdr->proto == IPPROTO_TCP) {
                        tcp_hdr = (struct tcp_hdr *)(ipv6_hdr + 1);
                        l4hash = HASH_L4_PORTS(tcp_hdr);
                } else if (ipv6_hdr->proto == IPPROTO_UDP) {
                        udp_hdr = (struct udp_hdr *)(ipv6_hdr + 1);
                        l4hash = HASH_L4_PORTS(udp_hdr);
                }
        }

        hash = l3hash ^ l4hash;
        hash ^= hash >> 16;
        hash ^= hash >> 8;

        return hash % slave_count;
}
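/*
 * Illustrative note (added commentary, not in the original sources): an
 * application selects between the three hash policies above through the
 * public bonding API, e.g.
 *
 *	rte_eth_bond_xmit_policy_set(bond_port_id,
 *			BALANCE_XMIT_POLICY_LAYER34);
 *
 * after which internals->xmit_hash points at xmit_l34_hash(); the LAYER2
 * and LAYER23 policies map to xmit_l2_hash() and xmit_l23_hash().
 */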

struct bwg_slave {
        uint64_t bwg_left_int;
        uint64_t bwg_left_remainder;
        uint8_t slave;
};

void
bond_tlb_activate_slave(struct bond_dev_private *internals)
{
        int i;

        for (i = 0; i < internals->active_slave_count; i++) {
                tlb_last_obytets[internals->active_slaves[i]] = 0;
        }
}

static int
bandwidth_cmp(const void *a, const void *b)
{
        const struct bwg_slave *bwg_a = a;
        const struct bwg_slave *bwg_b = b;
        int64_t diff = (int64_t)bwg_b->bwg_left_int - (int64_t)bwg_a->bwg_left_int;
        int64_t diff2 = (int64_t)bwg_b->bwg_left_remainder -
                        (int64_t)bwg_a->bwg_left_remainder;
        if (diff > 0)
                return 1;
        else if (diff < 0)
                return -1;
        else if (diff2 > 0)
                return 1;
        else if (diff2 < 0)
                return -1;
        else
                return 0;
}
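/*
 * Note (added commentary): with this comparator, the qsort() call in
 * bond_ethdev_update_tlb_slave_cb() orders slaves by *descending*
 * remaining bandwidth, so the least loaded slave lands first in
 * internals->tlb_slaves_order and is tried first when transmitting.
 */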

static void
bandwidth_left(uint8_t port_id, uint64_t load, uint8_t update_idx,
                struct bwg_slave *bwg_slave)
{
        struct rte_eth_link link_status;

        rte_eth_link_get(port_id, &link_status);
        uint64_t link_bwg = link_status.link_speed * 1000000ULL / 8;
        if (link_bwg == 0)
                return;
        link_bwg = link_bwg * (update_idx+1) * REORDER_PERIOD_MS;
        bwg_slave->bwg_left_int = (link_bwg - 1000*load) / link_bwg;
        bwg_slave->bwg_left_remainder = (link_bwg - 1000*load) % link_bwg;
}

static void
bond_ethdev_update_tlb_slave_cb(void *arg)
{
        struct bond_dev_private *internals = arg;
        struct rte_eth_stats slave_stats;
        struct bwg_slave bwg_array[RTE_MAX_ETHPORTS];
        uint8_t slave_count;
        uint64_t tx_bytes;

        uint8_t update_stats = 0;
        uint8_t i, slave_id;

        internals->slave_update_idx++;

        if (internals->slave_update_idx >= REORDER_PERIOD_MS)
                update_stats = 1;

        for (i = 0; i < internals->active_slave_count; i++) {
                slave_id = internals->active_slaves[i];
                rte_eth_stats_get(slave_id, &slave_stats);
                tx_bytes = slave_stats.obytes - tlb_last_obytets[slave_id];
                bandwidth_left(slave_id, tx_bytes,
                                internals->slave_update_idx, &bwg_array[i]);
                bwg_array[i].slave = slave_id;

                if (update_stats) {
                        tlb_last_obytets[slave_id] = slave_stats.obytes;
                }
        }

        if (update_stats == 1)
                internals->slave_update_idx = 0;

        slave_count = i;
        qsort(bwg_array, slave_count, sizeof(bwg_array[0]), bandwidth_cmp);
        for (i = 0; i < slave_count; i++)
                internals->tlb_slaves_order[i] = bwg_array[i].slave;

        rte_eal_alarm_set(REORDER_PERIOD_MS * 1000, bond_ethdev_update_tlb_slave_cb,
                        (struct bond_dev_private *)internals);
}
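/*
 * Note (added commentary): the callback re-arms itself every
 * REORDER_PERIOD_MS (REORDER_PERIOD_MS * 1000 us = 10 ms), so once
 * bond_tlb_enable() seeds it the slave ordering is refreshed continuously
 * until bond_tlb_disable() cancels the alarm.
 */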

static uint16_t
bond_ethdev_tx_burst_tlb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
        struct bond_dev_private *internals = bd_tx_q->dev_private;

        struct rte_eth_dev *primary_port =
                        &rte_eth_devices[internals->primary_port];
        uint16_t num_tx_total = 0;
        uint8_t i, j;

        uint8_t num_of_slaves = internals->active_slave_count;
        uint8_t slaves[RTE_MAX_ETHPORTS];

        struct ether_hdr *ether_hdr;
        struct ether_addr primary_slave_addr;
        struct ether_addr active_slave_addr;

        if (num_of_slaves < 1)
                return num_tx_total;

        memcpy(slaves, internals->tlb_slaves_order,
                                sizeof(internals->tlb_slaves_order[0]) * num_of_slaves);

        ether_addr_copy(primary_port->data->mac_addrs, &primary_slave_addr);

        if (nb_pkts > 3) {
                for (i = 0; i < 3; i++)
                        rte_prefetch0(rte_pktmbuf_mtod(bufs[i], void*));
        }

        for (i = 0; i < num_of_slaves; i++) {
                rte_eth_macaddr_get(slaves[i], &active_slave_addr);
                for (j = num_tx_total; j < nb_pkts; j++) {
                        if (j + 3 < nb_pkts)
                                rte_prefetch0(rte_pktmbuf_mtod(bufs[j+3], void*));

                        ether_hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *);
                        if (is_same_ether_addr(&ether_hdr->s_addr, &primary_slave_addr))
                                ether_addr_copy(&active_slave_addr, &ether_hdr->s_addr);
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                        mode6_debug("TX IPv4:", ether_hdr, slaves[i], &burstnumberTX);
#endif
                }

                num_tx_total += rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                bufs + num_tx_total, nb_pkts - num_tx_total);

                if (num_tx_total == nb_pkts)
                        break;
        }

        return num_tx_total;
}

void
bond_tlb_disable(struct bond_dev_private *internals)
{
        rte_eal_alarm_cancel(bond_ethdev_update_tlb_slave_cb, internals);
}

void
bond_tlb_enable(struct bond_dev_private *internals)
{
        bond_ethdev_update_tlb_slave_cb(internals);
}

static uint16_t
bond_ethdev_tx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
        struct bond_dev_private *internals = bd_tx_q->dev_private;

        struct ether_hdr *eth_h;
        uint16_t ether_type, offset;

        struct client_data *client_info;

        /*
         * We create transmit buffers for every slave and one additional to send
         * through tlb. In the worst case every packet will be sent on one port.
         */
        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS + 1][nb_pkts];
        uint16_t slave_bufs_pkts[RTE_MAX_ETHPORTS + 1] = { 0 };

        /*
         * We create separate transmit buffers for update packets as they won't
         * be counted in num_tx_total.
         */
        struct rte_mbuf *update_bufs[RTE_MAX_ETHPORTS][ALB_HASH_TABLE_SIZE];
        uint16_t update_bufs_pkts[RTE_MAX_ETHPORTS] = { 0 };

        struct rte_mbuf *upd_pkt;
        size_t pkt_size;

        uint16_t num_send, num_not_send = 0;
        uint16_t num_tx_total = 0;
        uint8_t slave_idx;

        int i, j;

        /* Search tx buffer for ARP packets and forward them to alb */
        for (i = 0; i < nb_pkts; i++) {
                eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
                ether_type = eth_h->ether_type;
                offset = get_vlan_offset(eth_h, &ether_type);

                if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
                        slave_idx = bond_mode_alb_arp_xmit(eth_h, offset, internals);

                        /* Change src mac in eth header */
                        rte_eth_macaddr_get(slave_idx, &eth_h->s_addr);

                        /* Add packet to slave tx buffer */
                        slave_bufs[slave_idx][slave_bufs_pkts[slave_idx]] = bufs[i];
                        slave_bufs_pkts[slave_idx]++;
                } else {
                        /* If packet is not ARP, send it with TLB policy */
                        slave_bufs[RTE_MAX_ETHPORTS][slave_bufs_pkts[RTE_MAX_ETHPORTS]] =
                                        bufs[i];
                        slave_bufs_pkts[RTE_MAX_ETHPORTS]++;
                }
        }

        /* Update connected client ARP tables */
        if (internals->mode6.ntt) {
                for (i = 0; i < ALB_HASH_TABLE_SIZE; i++) {
                        client_info = &internals->mode6.client_table[i];

                        if (client_info->in_use) {
                                /* Allocate new packet to send ARP update on current slave */
                                upd_pkt = rte_pktmbuf_alloc(internals->mode6.mempool);
                                if (upd_pkt == NULL) {
                                        RTE_LOG(ERR, PMD, "Failed to allocate ARP packet from pool\n");
                                        continue;
                                }
                                pkt_size = sizeof(struct ether_hdr) + sizeof(struct arp_hdr)
                                                + client_info->vlan_count * sizeof(struct vlan_hdr);
                                upd_pkt->data_len = pkt_size;
                                upd_pkt->pkt_len = pkt_size;

                                slave_idx = bond_mode_alb_arp_upd(client_info, upd_pkt,
                                                internals);

                                /* Add packet to update tx buffer */
                                update_bufs[slave_idx][update_bufs_pkts[slave_idx]] = upd_pkt;
                                update_bufs_pkts[slave_idx]++;
                        }
                }
                internals->mode6.ntt = 0;
        }

        /* Send ARP packets on proper slaves */
        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (slave_bufs_pkts[i] > 0) {
                        num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id,
                                        slave_bufs[i], slave_bufs_pkts[i]);
                        for (j = 0; j < slave_bufs_pkts[i] - num_send; j++) {
                                bufs[nb_pkts - 1 - num_not_send - j] =
                                                slave_bufs[i][nb_pkts - 1 - j];
                        }

                        num_tx_total += num_send;
                        num_not_send += slave_bufs_pkts[i] - num_send;

#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                        /* Print TX stats including update packets */
                        for (j = 0; j < slave_bufs_pkts[i]; j++) {
                                eth_h = rte_pktmbuf_mtod(slave_bufs[i][j], struct ether_hdr *);
                                mode6_debug("TX ARP:", eth_h, i, &burstnumberTX);
                        }
#endif
                }
        }

        /* Send update packets on proper slaves */
        for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
                if (update_bufs_pkts[i] > 0) {
                        num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id, update_bufs[i],
                                        update_bufs_pkts[i]);
                        for (j = num_send; j < update_bufs_pkts[i]; j++) {
                                rte_pktmbuf_free(update_bufs[i][j]);
                        }
#if defined(RTE_LIBRTE_BOND_DEBUG_ALB) || defined(RTE_LIBRTE_BOND_DEBUG_ALB_L1)
                        for (j = 0; j < update_bufs_pkts[i]; j++) {
                                eth_h = rte_pktmbuf_mtod(update_bufs[i][j], struct ether_hdr *);
                                mode6_debug("TX ARPupd:", eth_h, i, &burstnumberTX);
                        }
#endif
                }
        }

        /* Send non-ARP packets using tlb policy */
        if (slave_bufs_pkts[RTE_MAX_ETHPORTS] > 0) {
                num_send = bond_ethdev_tx_burst_tlb(queue,
                                slave_bufs[RTE_MAX_ETHPORTS],
                                slave_bufs_pkts[RTE_MAX_ETHPORTS]);

                for (j = 0; j < slave_bufs_pkts[RTE_MAX_ETHPORTS]; j++) {
                        bufs[nb_pkts - 1 - num_not_send - j] =
                                        slave_bufs[RTE_MAX_ETHPORTS][nb_pkts - 1 - j];
                }

                num_tx_total += num_send;
        }

        return num_tx_total;
}

static uint16_t
bond_ethdev_tx_burst_balance(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        uint8_t num_of_slaves;
        uint8_t slaves[RTE_MAX_ETHPORTS];

        uint16_t num_tx_total = 0, num_tx_slave = 0, tx_fail_total = 0;

        int i, op_slave_id;

        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][nb_pkts];
        uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * num_of_slaves);

        if (num_of_slaves < 1)
                return num_tx_total;

        /* Populate the per-slave mbuf arrays with the packets to be sent on each slave */
        for (i = 0; i < nb_pkts; i++) {
                /* Select output slave using hash based on xmit policy */
                op_slave_id = internals->xmit_hash(bufs[i], num_of_slaves);

                /* Populate slave mbuf arrays with mbufs for that slave */
                slave_bufs[op_slave_id][slave_nb_pkts[op_slave_id]++] = bufs[i];
        }

        /* Send packet burst on each slave device */
        for (i = 0; i < num_of_slaves; i++) {
                if (slave_nb_pkts[i] > 0) {
                        num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                        slave_bufs[i], slave_nb_pkts[i]);

                        /* if tx burst fails move packets to end of bufs */
                        if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
                                int slave_tx_fail_count = slave_nb_pkts[i] - num_tx_slave;

                                tx_fail_total += slave_tx_fail_count;
                                memcpy(&bufs[nb_pkts - tx_fail_total],
                                                &slave_bufs[i][num_tx_slave],
                                                slave_tx_fail_count * sizeof(bufs[0]));
                        }

                        num_tx_total += num_tx_slave;
                }
        }

        return num_tx_total;
}

static uint16_t
bond_ethdev_tx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        uint8_t num_of_slaves;
        uint8_t slaves[RTE_MAX_ETHPORTS];
        /* positions in slaves, not ID */
        uint8_t distributing_offsets[RTE_MAX_ETHPORTS];
        uint8_t distributing_count;

        uint16_t num_tx_slave, num_tx_total = 0, num_tx_fail_total = 0;
        uint16_t i, j, op_slave_idx;
        const uint16_t buffs_size = nb_pkts + BOND_MODE_8023AX_SLAVE_TX_PKTS + 1;

        /* Allocate room for additional slow-protocol packets in 802.3ad mode. */
        struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][buffs_size];
        void *slow_pkts[BOND_MODE_8023AX_SLAVE_TX_PKTS] = { NULL };

        /* Total amount of packets in slave_bufs */
        uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };
        /* Slow packets placed in each slave */
        uint8_t slave_slow_nb_pkts[RTE_MAX_ETHPORTS] = { 0 };

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        if (num_of_slaves < 1)
                return num_tx_total;

        memcpy(slaves, internals->active_slaves, sizeof(slaves[0]) * num_of_slaves);

        distributing_count = 0;
        for (i = 0; i < num_of_slaves; i++) {
                struct port *port = &mode_8023ad_ports[slaves[i]];

                slave_slow_nb_pkts[i] = rte_ring_dequeue_burst(port->tx_ring,
                                slow_pkts, BOND_MODE_8023AX_SLAVE_TX_PKTS,
                                NULL);
                slave_nb_pkts[i] = slave_slow_nb_pkts[i];

                for (j = 0; j < slave_slow_nb_pkts[i]; j++)
                        slave_bufs[i][j] = slow_pkts[j];

                if (ACTOR_STATE(port, DISTRIBUTING))
                        distributing_offsets[distributing_count++] = i;
        }

        if (likely(distributing_count > 0)) {
                /* Populate the per-slave mbuf arrays with the packets to be sent */
                for (i = 0; i < nb_pkts; i++) {
                        /* Select output slave using hash based on xmit policy */
                        op_slave_idx = internals->xmit_hash(bufs[i], distributing_count);

                        /* Populate slave mbuf arrays with mbufs for that slave. Use only
                         * slaves that are currently distributing. */
                        uint8_t slave_offset = distributing_offsets[op_slave_idx];
                        slave_bufs[slave_offset][slave_nb_pkts[slave_offset]] = bufs[i];
                        slave_nb_pkts[slave_offset]++;
                }
        }

        /* Send packet burst on each slave device */
        for (i = 0; i < num_of_slaves; i++) {
                if (slave_nb_pkts[i] == 0)
                        continue;

                num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                slave_bufs[i], slave_nb_pkts[i]);

                /* If tx burst fails drop slow packets */
                for ( ; num_tx_slave < slave_slow_nb_pkts[i]; num_tx_slave++)
                        rte_pktmbuf_free(slave_bufs[i][num_tx_slave]);

                num_tx_total += num_tx_slave - slave_slow_nb_pkts[i];
                num_tx_fail_total += slave_nb_pkts[i] - num_tx_slave;

                /* If tx burst fails move packets to end of bufs */
                if (unlikely(num_tx_slave < slave_nb_pkts[i])) {
                        uint16_t j = nb_pkts - num_tx_fail_total;
                        for ( ; num_tx_slave < slave_nb_pkts[i]; j++, num_tx_slave++)
                                bufs[j] = slave_bufs[i][num_tx_slave];
                }
        }

        return num_tx_total;
}

static uint16_t
bond_ethdev_tx_burst_broadcast(void *queue, struct rte_mbuf **bufs,
                uint16_t nb_pkts)
{
        struct bond_dev_private *internals;
        struct bond_tx_queue *bd_tx_q;

        uint8_t tx_failed_flag = 0, num_of_slaves;
        uint8_t slaves[RTE_MAX_ETHPORTS];

        uint16_t max_nb_of_tx_pkts = 0;

        int slave_tx_total[RTE_MAX_ETHPORTS];
        int i, most_successful_tx_slave = -1;

        bd_tx_q = (struct bond_tx_queue *)queue;
        internals = bd_tx_q->dev_private;

        /* Copy slave list to protect against slave up/down changes during tx
         * bursting */
        num_of_slaves = internals->active_slave_count;
        memcpy(slaves, internals->active_slaves,
                        sizeof(internals->active_slaves[0]) * num_of_slaves);

        if (num_of_slaves < 1)
                return 0;

        /* Increment reference count on mbufs */
        for (i = 0; i < nb_pkts; i++)
                rte_mbuf_refcnt_update(bufs[i], num_of_slaves - 1);

        /* Transmit burst on each active slave */
        for (i = 0; i < num_of_slaves; i++) {
                slave_tx_total[i] = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id,
                                        bufs, nb_pkts);

                if (unlikely(slave_tx_total[i] < nb_pkts))
                        tx_failed_flag = 1;

                /* record the value and slave index for the slave which transmits the
                 * maximum number of packets */
                if (slave_tx_total[i] > max_nb_of_tx_pkts) {
                        max_nb_of_tx_pkts = slave_tx_total[i];
                        most_successful_tx_slave = i;
                }
        }

        /* if slaves fail to transmit packets from burst, the calling application
         * is not expected to know about multiple references to packets so we must
         * handle failures of all packets except those of the most successful slave
         */
        if (unlikely(tx_failed_flag))
                for (i = 0; i < num_of_slaves; i++)
                        if (i != most_successful_tx_slave)
                                while (slave_tx_total[i] < nb_pkts)
                                        rte_pktmbuf_free(bufs[slave_tx_total[i]++]);

        return max_nb_of_tx_pkts;
}
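/*
 * Note (added commentary): the refcount arithmetic above works because each
 * mbuf arrives with one reference; adding num_of_slaves - 1 yields exactly
 * one reference per slave, and every successful transmit (or the explicit
 * rte_pktmbuf_free() on the failure path) consumes exactly one of them.
 */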

void
link_properties_set(struct rte_eth_dev *bonded_eth_dev,
                struct rte_eth_link *slave_dev_link)
{
        struct rte_eth_link *bonded_dev_link = &bonded_eth_dev->data->dev_link;
        struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;

        if (slave_dev_link->link_status &&
                bonded_eth_dev->data->dev_started) {
                bonded_dev_link->link_duplex = slave_dev_link->link_duplex;
                bonded_dev_link->link_speed = slave_dev_link->link_speed;

                internals->link_props_set = 1;
        }
}

void
link_properties_reset(struct rte_eth_dev *bonded_eth_dev)
{
        struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;

        memset(&(bonded_eth_dev->data->dev_link), 0,
                        sizeof(bonded_eth_dev->data->dev_link));

        internals->link_props_set = 0;
}

int
link_properties_valid(struct rte_eth_link *bonded_dev_link,
                struct rte_eth_link *slave_dev_link)
{
        if (bonded_dev_link->link_duplex != slave_dev_link->link_duplex ||
                bonded_dev_link->link_speed != slave_dev_link->link_speed)
                return -1;

        return 0;
}

int
mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr)
{
        struct ether_addr *mac_addr;

        if (eth_dev == NULL) {
                RTE_LOG(ERR, PMD, "%s: NULL pointer eth_dev specified\n", __func__);
                return -1;
        }

        if (dst_mac_addr == NULL) {
                RTE_LOG(ERR, PMD, "%s: NULL pointer MAC specified\n", __func__);
                return -1;
        }

        mac_addr = eth_dev->data->mac_addrs;

        ether_addr_copy(mac_addr, dst_mac_addr);
        return 0;
}

int
mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr)
{
        struct ether_addr *mac_addr;

        if (eth_dev == NULL) {
                RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified");
                return -1;
        }

        if (new_mac_addr == NULL) {
                RTE_BOND_LOG(ERR, "NULL pointer MAC specified");
                return -1;
        }

        mac_addr = eth_dev->data->mac_addrs;

        /* If new MAC is different to current MAC then update */
        if (memcmp(mac_addr, new_mac_addr, sizeof(*mac_addr)) != 0)
                memcpy(mac_addr, new_mac_addr, sizeof(*mac_addr));

        return 0;
}

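/*
 * Propagate the bonded device's MAC to the slaves according to the mode:
 * round-robin/balance/broadcast give every slave the bond MAC, 802.3ad
 * delegates to the mode-4 state machine, and active-backup/TLB/ALB apply
 * the bond MAC only to the primary while restoring each other slave's
 * persisted address. (Descriptive comment added; behaviour unchanged.)
 */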
int
mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev)
{
        struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
        int i;

        /* Update slave devices MAC addresses */
        if (internals->slave_count < 1)
                return -1;

        switch (internals->mode) {
        case BONDING_MODE_ROUND_ROBIN:
        case BONDING_MODE_BALANCE:
        case BONDING_MODE_BROADCAST:
                for (i = 0; i < internals->slave_count; i++) {
                        if (mac_address_set(&rte_eth_devices[internals->slaves[i].port_id],
                                        bonded_eth_dev->data->mac_addrs)) {
                                RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
                                                internals->slaves[i].port_id);
                                return -1;
                        }
                }
                break;
        case BONDING_MODE_8023AD:
                bond_mode_8023ad_mac_address_update(bonded_eth_dev);
                break;
        case BONDING_MODE_ACTIVE_BACKUP:
        case BONDING_MODE_TLB:
        case BONDING_MODE_ALB:
        default:
                for (i = 0; i < internals->slave_count; i++) {
                        if (internals->slaves[i].port_id ==
                                        internals->current_primary_port) {
                                if (mac_address_set(&rte_eth_devices[internals->primary_port],
                                                bonded_eth_dev->data->mac_addrs)) {
                                        RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
                                                        internals->current_primary_port);
                                        return -1;
                                }
                        } else {
                                if (mac_address_set(
                                                &rte_eth_devices[internals->slaves[i].port_id],
                                                &internals->slaves[i].persisted_mac_addr)) {
                                        RTE_BOND_LOG(ERR, "Failed to update port Id %d MAC address",
                                                        internals->slaves[i].port_id);
                                        return -1;
                                }
                        }
                }
        }

        return 0;
}

int
bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode)
{
        struct bond_dev_private *internals;

        internals = eth_dev->data->dev_private;

        switch (mode) {
        case BONDING_MODE_ROUND_ROBIN:
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_round_robin;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
                break;
        case BONDING_MODE_ACTIVE_BACKUP:
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_active_backup;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
                break;
        case BONDING_MODE_BALANCE:
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_balance;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
                break;
        case BONDING_MODE_BROADCAST:
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_broadcast;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst;
                break;
        case BONDING_MODE_8023AD:
                if (bond_mode_8023ad_enable(eth_dev) != 0)
                        return -1;

                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_8023ad;
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_8023ad;
                RTE_LOG(WARNING, PMD,
                                "Using mode 4, it is necessary to do TX burst and RX burst "
                                "at least every 100ms.\n");
                break;
        case BONDING_MODE_TLB:
                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_tlb;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
                break;
        case BONDING_MODE_ALB:
                if (bond_mode_alb_enable(eth_dev) != 0)
                        return -1;

                eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_alb;
                eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_alb;
                break;
        default:
                return -1;
        }

        internals->mode = mode;

        return 0;
}
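/*
 * Illustrative note (added commentary, not in the original sources):
 * applications normally reach this function through the public API, e.g.
 *
 *	int port = rte_eth_bond_create("net_bonding0", BONDING_MODE_ALB, 0);
 *
 * or rte_eth_bond_mode_set() on an existing bonded port; per the commit
 * subject, create/free now route through the EAL vdev path before landing
 * in the PMD. "net_bonding0" is only an example device name.
 */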

int
slave_configure(struct rte_eth_dev *bonded_eth_dev,
                struct rte_eth_dev *slave_eth_dev)
{
        struct bond_rx_queue *bd_rx_q;
        struct bond_tx_queue *bd_tx_q;

        int errval;
        uint16_t q_id;

        /* Stop slave */
        rte_eth_dev_stop(slave_eth_dev->data->port_id);

        /* Enable interrupts on slave device if supported */
        if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
                slave_eth_dev->data->dev_conf.intr_conf.lsc = 1;

        /* If RSS is enabled for bonding, try to enable it for slaves */
        if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG) {
                if (bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len
                                != 0) {
                        slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len =
                                        bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len;
                        slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key =
                                        bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
                } else {
                        slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = NULL;
                }

                slave_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf =
                                bonded_eth_dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
                slave_eth_dev->data->dev_conf.rxmode.mq_mode =
                                bonded_eth_dev->data->dev_conf.rxmode.mq_mode;
        }

        slave_eth_dev->data->dev_conf.rxmode.hw_vlan_filter =
                        bonded_eth_dev->data->dev_conf.rxmode.hw_vlan_filter;

        /* Configure device */
        errval = rte_eth_dev_configure(slave_eth_dev->data->port_id,
                        bonded_eth_dev->data->nb_rx_queues,
                        bonded_eth_dev->data->nb_tx_queues,
                        &(slave_eth_dev->data->dev_conf));
        if (errval != 0) {
                RTE_BOND_LOG(ERR, "Cannot configure slave device: port %u, err (%d)",
                                slave_eth_dev->data->port_id, errval);
                return errval;
        }

        /* Setup Rx Queues */
        for (q_id = 0; q_id < bonded_eth_dev->data->nb_rx_queues; q_id++) {
                bd_rx_q = (struct bond_rx_queue *)bonded_eth_dev->data->rx_queues[q_id];

                errval = rte_eth_rx_queue_setup(slave_eth_dev->data->port_id, q_id,
                                bd_rx_q->nb_rx_desc,
                                rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
                                &(bd_rx_q->rx_conf), bd_rx_q->mb_pool);
                if (errval != 0) {
                        RTE_BOND_LOG(ERR,
                                        "rte_eth_rx_queue_setup: port=%d queue_id %d, err (%d)",
                                        slave_eth_dev->data->port_id, q_id, errval);
                        return errval;
                }
        }

        /* Setup Tx Queues */
        for (q_id = 0; q_id < bonded_eth_dev->data->nb_tx_queues; q_id++) {
                bd_tx_q = (struct bond_tx_queue *)bonded_eth_dev->data->tx_queues[q_id];

                errval = rte_eth_tx_queue_setup(slave_eth_dev->data->port_id, q_id,
                                bd_tx_q->nb_tx_desc,
                                rte_eth_dev_socket_id(slave_eth_dev->data->port_id),
                                &bd_tx_q->tx_conf);
                if (errval != 0) {
                        RTE_BOND_LOG(ERR,
                                        "rte_eth_tx_queue_setup: port=%d queue_id %d, err (%d)",
                                        slave_eth_dev->data->port_id, q_id, errval);
                        return errval;
                }
        }

        /* Start device */
        errval = rte_eth_dev_start(slave_eth_dev->data->port_id);
        if (errval != 0) {
                RTE_BOND_LOG(ERR, "rte_eth_dev_start: port=%u, err (%d)",
                                slave_eth_dev->data->port_id, errval);
                return -1;
        }

        /* If RSS is enabled for bonding, synchronize RETA */
        if (bonded_eth_dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
1414                 int i;
1415                 struct bond_dev_private *internals;
1416
1417                 internals = bonded_eth_dev->data->dev_private;
1418
1419                 for (i = 0; i < internals->slave_count; i++) {
1420                         if (internals->slaves[i].port_id == slave_eth_dev->data->port_id) {
1421                                 errval = rte_eth_dev_rss_reta_update(
1422                                                 slave_eth_dev->data->port_id,
1423                                                 &internals->reta_conf[0],
1424                                                 internals->slaves[i].reta_size);
1425                                 if (errval != 0) {
1426                                         RTE_LOG(WARNING, PMD,
1427                                                         "rte_eth_dev_rss_reta_update on slave port %d fails (err %d)."
1428                                                         " RSS Configuration for bonding may be inconsistent.\n",
1429                                                         slave_eth_dev->data->port_id, errval);
1430                                 }
1431                                 break;
1432                         }
1433                 }
1434         }
1435
1436         /* If lsc interrupt is set, check initial slave's link status */
1437         if (slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1438                 bond_ethdev_lsc_event_callback(slave_eth_dev->data->port_id,
1439                         RTE_ETH_EVENT_INTR_LSC, &bonded_eth_dev->data->port_id);
1440
1441         return 0;
1442 }
1443
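/*
 * Drop a slave from internals->slaves[] by compacting the array with
 * memmove(), then reset the slave ethdev so that it has to be fully
 * reconfigured before it can be used again.
 */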
1444 void
1445 slave_remove(struct bond_dev_private *internals,
1446                 struct rte_eth_dev *slave_eth_dev)
1447 {
1448         uint8_t i;
1449
1450         for (i = 0; i < internals->slave_count; i++)
1451                 if (internals->slaves[i].port_id ==
1452                                 slave_eth_dev->data->port_id)
1453                         break;
1454
1455         if (i < (internals->slave_count - 1))
1456                 memmove(&internals->slaves[i], &internals->slaves[i + 1],
1457                                 sizeof(internals->slaves[0]) *
1458                                 (internals->slave_count - i - 1));
1459
1460         internals->slave_count--;
1461
1462         /* force reconfiguration of slave interfaces */
1463         _rte_eth_dev_reset(slave_eth_dev);
1464 }
1465
1466 static void
1467 bond_ethdev_slave_link_status_change_monitor(void *cb_arg);
1468
1469 void
1470 slave_add(struct bond_dev_private *internals,
1471                 struct rte_eth_dev *slave_eth_dev)
1472 {
1473         struct bond_slave_details *slave_details =
1474                         &internals->slaves[internals->slave_count];
1475
1476         slave_details->port_id = slave_eth_dev->data->port_id;
1477         slave_details->last_link_status = 0;
1478
1479         /* Mark slave devices that don't support interrupts so we can
1480          * compensate when we start the bond
1481          */
1482         if (!(slave_eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) {
1483                 slave_details->link_status_poll_enabled = 1;
1484         }
1485
1486         slave_details->link_status_wait_to_complete = 0;
1487         /* save the slave's original MAC so it can be restored on removal */
1488         memcpy(&(slave_details->persisted_mac_addr), slave_eth_dev->data->mac_addrs,
1489                         sizeof(struct ether_addr));
1490 }
1491
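/*
 * Record slave_port_id as the current primary.  With no active slaves the
 * choice is stored directly; otherwise it only takes effect if the port
 * is found in the active slave list.
 */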
1492 void
1493 bond_ethdev_primary_set(struct bond_dev_private *internals,
1494                 uint8_t slave_port_id)
1495 {
1496         int i;
1497
1498         if (internals->active_slave_count < 1)
1499                 internals->current_primary_port = slave_port_id;
1500         else
1501                 /* Search bonded device slave ports for new proposed primary port */
1502                 for (i = 0; i < internals->active_slave_count; i++) {
1503                         if (internals->active_slaves[i] == slave_port_id)
1504                                 internals->current_primary_port = slave_port_id;
1505                 }
1506 }
1507
1508 static void
1509 bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev);
1510
1511 static int
1512 bond_ethdev_start(struct rte_eth_dev *eth_dev)
1513 {
1514         struct bond_dev_private *internals;
1515         int i;
1516
1517         /* slave eth dev will be started by bonded device */
1518         if (check_for_bonded_ethdev(eth_dev)) {
1519                 RTE_BOND_LOG(ERR, "User tried to explicitly start a slave eth_dev (%d)",
1520                                 eth_dev->data->port_id);
1521                 return -1;
1522         }
1523
1524         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1525         eth_dev->data->dev_started = 1;
1526
1527         internals = eth_dev->data->dev_private;
1528
1529         if (internals->slave_count == 0) {
1530                 RTE_BOND_LOG(ERR, "Cannot start port since there are no slave devices");
1531                 return -1;
1532         }
1533
1534         if (internals->user_defined_mac == 0) {
1535                 struct ether_addr *new_mac_addr = NULL;
1536
1537                 for (i = 0; i < internals->slave_count; i++)
1538                         if (internals->slaves[i].port_id == internals->primary_port)
1539                                 new_mac_addr = &internals->slaves[i].persisted_mac_addr;
1540
1541                 if (new_mac_addr == NULL)
1542                         return -1;
1543
1544                 if (mac_address_set(eth_dev, new_mac_addr) != 0) {
1545                         RTE_BOND_LOG(ERR, "bonded port (%d) failed to update MAC address",
1546                                         eth_dev->data->port_id);
1547                         return -1;
1548                 }
1549         }
1550
1551         /* Update all slave devices' MACs */
1552         if (mac_address_slaves_update(eth_dev) != 0)
1553                 return -1;
1554
1555         /* If bonded device is configured in promiscuous mode then re-apply config */
1556         if (internals->promiscuous_en)
1557                 bond_ethdev_promiscuous_enable(eth_dev);
1558
1559         /* Reconfigure each slave device if starting bonded device */
1560         for (i = 0; i < internals->slave_count; i++) {
1561                 if (slave_configure(eth_dev,
1562                                 &(rte_eth_devices[internals->slaves[i].port_id])) != 0) {
1563                         RTE_BOND_LOG(ERR,
1564                                         "bonded port (%d) failed to reconfigure slave device (%d)",
1565                                         eth_dev->data->port_id, internals->slaves[i].port_id);
1566                         return -1;
1567                 }
1568                 /* We will need to poll for link status if any slave doesn't
1569                  * support interrupts
1570                  */
1571                 if (internals->slaves[i].link_status_poll_enabled)
1572                         internals->link_status_polling_enabled = 1;
1573         }
1574         /* start polling if needed */
1575         if (internals->link_status_polling_enabled) {
1576                 rte_eal_alarm_set(
1577                         internals->link_status_polling_interval_ms * 1000,
1578                         bond_ethdev_slave_link_status_change_monitor,
1579                         (void *)&rte_eth_devices[internals->port_id]);
1580         }
1581
1582         if (internals->user_defined_primary_port)
1583                 bond_ethdev_primary_set(internals, internals->primary_port);
1584
1585         if (internals->mode == BONDING_MODE_8023AD)
1586                 bond_mode_8023ad_start(eth_dev);
1587
1588         if (internals->mode == BONDING_MODE_TLB ||
1589                         internals->mode == BONDING_MODE_ALB)
1590                 bond_tlb_enable(internals);
1591
1592         return 0;
1593 }
1594
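/* Release every RX/TX queue structure allocated by the queue setup ops. */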
1595 static void
1596 bond_ethdev_free_queues(struct rte_eth_dev *dev)
1597 {
1598         uint8_t i;
1599
1600         if (dev->data->rx_queues != NULL) {
1601                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1602                         rte_free(dev->data->rx_queues[i]);
1603                         dev->data->rx_queues[i] = NULL;
1604                 }
1605                 dev->data->nb_rx_queues = 0;
1606         }
1607
1608         if (dev->data->tx_queues != NULL) {
1609                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
1610                         rte_free(dev->data->tx_queues[i]);
1611                         dev->data->tx_queues[i] = NULL;
1612                 }
1613                 dev->data->nb_tx_queues = 0;
1614         }
1615 }
1616
1617 void
1618 bond_ethdev_stop(struct rte_eth_dev *eth_dev)
1619 {
1620         struct bond_dev_private *internals = eth_dev->data->dev_private;
1621         uint8_t i;
1622
1623         if (internals->mode == BONDING_MODE_8023AD) {
1624                 struct port *port;
1625                 void *pkt = NULL;
1626
1627                 bond_mode_8023ad_stop(eth_dev);
1628
1629                 /* Discard all messages to/from mode 4 state machines */
1630                 for (i = 0; i < internals->active_slave_count; i++) {
1631                         port = &mode_8023ad_ports[internals->active_slaves[i]];
1632
1633                         RTE_ASSERT(port->rx_ring != NULL);
1634                         while (rte_ring_dequeue(port->rx_ring, &pkt) != -ENOENT)
1635                                 rte_pktmbuf_free(pkt);
1636
1637                         RTE_ASSERT(port->tx_ring != NULL);
1638                         while (rte_ring_dequeue(port->tx_ring, &pkt) != -ENOENT)
1639                                 rte_pktmbuf_free(pkt);
1640                 }
1641         }
1642
1643         if (internals->mode == BONDING_MODE_TLB ||
1644                         internals->mode == BONDING_MODE_ALB) {
1645                 bond_tlb_disable(internals);
1646                 for (i = 0; i < internals->active_slave_count; i++)
1647                         tlb_last_obytets[internals->active_slaves[i]] = 0;
1648         }
1649
1650         internals->active_slave_count = 0;
1651         internals->link_status_polling_enabled = 0;
1652         for (i = 0; i < internals->slave_count; i++)
1653                 internals->slaves[i].last_link_status = 0;
1654
1655         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1656         eth_dev->data->dev_started = 0;
1657 }
1658
1659 void
1660 bond_ethdev_close(struct rte_eth_dev *dev)
1661 {
1662         struct bond_dev_private *internals = dev->data->dev_private;
1663
1664         bond_ethdev_free_queues(dev);
1665         rte_bitmap_reset(internals->vlan_filter_bmp);
1666 }
1667
1668 /* forward declaration */
1669 static int bond_ethdev_configure(struct rte_eth_dev *dev);
1670
1671 static void
1672 bond_ethdev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
1673 {
1674         struct bond_dev_private *internals = dev->data->dev_private;
1675
1676         dev_info->max_mac_addrs = 1;
1677
1678         dev_info->max_rx_pktlen = internals->candidate_max_rx_pktlen ?
1679                                   internals->candidate_max_rx_pktlen : 2048;
1680
1681         dev_info->max_rx_queues = (uint16_t)128;
1682         dev_info->max_tx_queues = (uint16_t)512;
1683
1684         dev_info->min_rx_bufsize = 0;
1685
1686         dev_info->rx_offload_capa = internals->rx_offload_capa;
1687         dev_info->tx_offload_capa = internals->tx_offload_capa;
1688         dev_info->flow_type_rss_offloads = internals->flow_type_rss_offloads;
1689
1690         dev_info->reta_size = internals->reta_size;
1691 }
1692
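/*
 * Record the VLAN filter in the bonded device's bitmap (kept so that it
 * can be replayed onto slaves attached later) and propagate it to every
 * current slave, warning when a slave lacks VLAN-filter support.
 */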
1693 static int
1694 bond_ethdev_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
1695 {
1696         int res;
1697         uint8_t i;
1698         struct bond_dev_private *internals = dev->data->dev_private;
1699
1700         /* don't do this while a slave is being added */
1701         rte_spinlock_lock(&internals->lock);
1702
1703         if (on)
1704                 rte_bitmap_set(internals->vlan_filter_bmp, vlan_id);
1705         else
1706                 rte_bitmap_clear(internals->vlan_filter_bmp, vlan_id);
1707
1708         for (i = 0; i < internals->slave_count; i++) {
1709                 uint8_t port_id = internals->slaves[i].port_id;
1710
1711                 res = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
1712                 if (res == ENOTSUP)
1713                         RTE_LOG(WARNING, PMD,
1714                                 "Setting VLAN filter on slave port %u not supported.\n",
1715                                 port_id);
1716         }
1717
1718         rte_spinlock_unlock(&internals->lock);
1719         return 0;
1720 }
1721
1722 static int
1723 bond_ethdev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1724                 uint16_t nb_rx_desc, unsigned int socket_id __rte_unused,
1725                 const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mb_pool)
1726 {
1727         struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)
1728                         rte_zmalloc_socket(NULL, sizeof(struct bond_rx_queue),
1729                                         0, dev->data->numa_node);
1730         if (bd_rx_q == NULL)
1731                 return -1;
1732
1733         bd_rx_q->queue_id = rx_queue_id;
1734         bd_rx_q->dev_private = dev->data->dev_private;
1735
1736         bd_rx_q->nb_rx_desc = nb_rx_desc;
1737
1738         memcpy(&(bd_rx_q->rx_conf), rx_conf, sizeof(struct rte_eth_rxconf));
1739         bd_rx_q->mb_pool = mb_pool;
1740
1741         dev->data->rx_queues[rx_queue_id] = bd_rx_q;
1742
1743         return 0;
1744 }
1745
1746 static int
1747 bond_ethdev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1748                 uint16_t nb_tx_desc, unsigned int socket_id __rte_unused,
1749                 const struct rte_eth_txconf *tx_conf)
1750 {
1751         struct bond_tx_queue *bd_tx_q  = (struct bond_tx_queue *)
1752                         rte_zmalloc_socket(NULL, sizeof(struct bond_tx_queue),
1753                                         0, dev->data->numa_node);
1754
1755         if (bd_tx_q == NULL)
1756                 return -1;
1757
1758         bd_tx_q->queue_id = tx_queue_id;
1759         bd_tx_q->dev_private = dev->data->dev_private;
1760
1761         bd_tx_q->nb_tx_desc = nb_tx_desc;
1762         memcpy(&(bd_tx_q->tx_conf), tx_conf, sizeof(bd_tx_q->tx_conf));
1763
1764         dev->data->tx_queues[tx_queue_id] = bd_tx_q;
1765
1766         return 0;
1767 }
1768
1769 static void
1770 bond_ethdev_rx_queue_release(void *queue)
1771 {
1772         if (queue == NULL)
1773                 return;
1774
1775         rte_free(queue);
1776 }
1777
1778 static void
1779 bond_ethdev_tx_queue_release(void *queue)
1780 {
1781         if (queue == NULL)
1782                 return;
1783
1784         rte_free(queue);
1785 }
1786
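/*
 * Alarm callback that polls the link status of slaves whose drivers do
 * not raise LSC interrupts.  It re-arms itself through rte_eal_alarm_set()
 * for as long as at least one slave still needs polling, or when the
 * device lock could not be taken this round.
 */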
1787 static void
1788 bond_ethdev_slave_link_status_change_monitor(void *cb_arg)
1789 {
1790         struct rte_eth_dev *bonded_ethdev, *slave_ethdev;
1791         struct bond_dev_private *internals;
1792
1793         /* Default "polling slave found" to true so that polling is not
1794          * stopped if we cannot take the lock this round */
1795         int i, polling_slave_found = 1;
1796
1797         if (cb_arg == NULL)
1798                 return;
1799
1800         bonded_ethdev = (struct rte_eth_dev *)cb_arg;
1801         internals = (struct bond_dev_private *)bonded_ethdev->data->dev_private;
1802
1803         if (!bonded_ethdev->data->dev_started ||
1804                 !internals->link_status_polling_enabled)
1805                 return;
1806
1807         /* If device is currently being configured then don't check the slaves'
1808          * link status; wait until the next period */
1809         if (rte_spinlock_trylock(&internals->lock)) {
1810                 if (internals->slave_count > 0)
1811                         polling_slave_found = 0;
1812
1813                 for (i = 0; i < internals->slave_count; i++) {
1814                         if (!internals->slaves[i].link_status_poll_enabled)
1815                                 continue;
1816
1817                         slave_ethdev = &rte_eth_devices[internals->slaves[i].port_id];
1818                         polling_slave_found = 1;
1819
1820                         /* Update slave link status */
1821                         (*slave_ethdev->dev_ops->link_update)(slave_ethdev,
1822                                         internals->slaves[i].link_status_wait_to_complete);
1823
1824                         /* if link status has changed since last checked then call lsc
1825                          * event callback */
1826                         if (slave_ethdev->data->dev_link.link_status !=
1827                                         internals->slaves[i].last_link_status) {
1828                                 internals->slaves[i].last_link_status =
1829                                                 slave_ethdev->data->dev_link.link_status;
1830
1831                                 bond_ethdev_lsc_event_callback(internals->slaves[i].port_id,
1832                                                 RTE_ETH_EVENT_INTR_LSC,
1833                                                 &bonded_ethdev->data->port_id);
1834                         }
1835                 }
1836                 rte_spinlock_unlock(&internals->lock);
1837         }
1838
1839         if (polling_slave_found)
1840                 /* Set alarm to continue monitoring link status of slave ethdevs */
1841                 rte_eal_alarm_set(internals->link_status_polling_interval_ms * 1000,
1842                                 bond_ethdev_slave_link_status_change_monitor, cb_arg);
1843 }
1844
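/*
 * The bonded link is reported up as soon as any active slave reports link
 * up; with no active slaves, or a stopped device, it is reported down.
 */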
1845 static int
1846 bond_ethdev_link_update(struct rte_eth_dev *bonded_eth_dev,
1847                 int wait_to_complete)
1848 {
1849         struct bond_dev_private *internals = bonded_eth_dev->data->dev_private;
1850
1851         if (!bonded_eth_dev->data->dev_started ||
1852                 internals->active_slave_count == 0) {
1853                 bonded_eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
1854                 return 0;
1855         } else {
1856                 struct rte_eth_dev *slave_eth_dev;
1857                 int i, link_up = 0;
1858
1859                 for (i = 0; i < internals->active_slave_count; i++) {
1860                         slave_eth_dev = &rte_eth_devices[internals->active_slaves[i]];
1861
1862                         (*slave_eth_dev->dev_ops->link_update)(slave_eth_dev,
1863                                         wait_to_complete);
1864                         if (slave_eth_dev->data->dev_link.link_status == ETH_LINK_UP) {
1865                                 link_up = 1;
1866                                 break;
1867                         }
1868                 }
1869
1870                 bonded_eth_dev->data->dev_link.link_status = link_up;
1871         }
1872
1873         return 0;
1874 }
1875
1876 static void
1877 bond_ethdev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1878 {
1879         struct bond_dev_private *internals = dev->data->dev_private;
1880         struct rte_eth_stats slave_stats;
1881         int i, j;
1882
1883         for (i = 0; i < internals->slave_count; i++) {
1884                 rte_eth_stats_get(internals->slaves[i].port_id, &slave_stats);
1885
1886                 stats->ipackets += slave_stats.ipackets;
1887                 stats->opackets += slave_stats.opackets;
1888                 stats->ibytes += slave_stats.ibytes;
1889                 stats->obytes += slave_stats.obytes;
1890                 stats->imissed += slave_stats.imissed;
1891                 stats->ierrors += slave_stats.ierrors;
1892                 stats->oerrors += slave_stats.oerrors;
1893                 stats->rx_nombuf += slave_stats.rx_nombuf;
1894
1895                 for (j = 0; j < RTE_ETHDEV_QUEUE_STAT_CNTRS; j++) {
1896                         stats->q_ipackets[j] += slave_stats.q_ipackets[j];
1897                         stats->q_opackets[j] += slave_stats.q_opackets[j];
1898                         stats->q_ibytes[j] += slave_stats.q_ibytes[j];
1899                         stats->q_obytes[j] += slave_stats.q_obytes[j];
1900                         stats->q_errors[j] += slave_stats.q_errors[j];
1901                 }
1902
1903         }
1904 }
1905
1906 static void
1907 bond_ethdev_stats_reset(struct rte_eth_dev *dev)
1908 {
1909         struct bond_dev_private *internals = dev->data->dev_private;
1910         int i;
1911
1912         for (i = 0; i < internals->slave_count; i++)
1913                 rte_eth_stats_reset(internals->slaves[i].port_id);
1914 }
1915
1916 static void
1917 bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev)
1918 {
1919         struct bond_dev_private *internals = eth_dev->data->dev_private;
1920         int i;
1921
1922         internals->promiscuous_en = 1;
1923
1924         switch (internals->mode) {
1925         /* Promiscuous mode is propagated to all slaves */
1926         case BONDING_MODE_ROUND_ROBIN:
1927         case BONDING_MODE_BALANCE:
1928         case BONDING_MODE_BROADCAST:
1929                 for (i = 0; i < internals->slave_count; i++)
1930                         rte_eth_promiscuous_enable(internals->slaves[i].port_id);
1931                 break;
1932         /* In mode 4, promiscuous mode is managed when a slave is added/removed */
1933         case BONDING_MODE_8023AD:
1934                 break;
1935         /* Promiscuous mode is propagated only to primary slave */
1936         case BONDING_MODE_ACTIVE_BACKUP:
1937         case BONDING_MODE_TLB:
1938         case BONDING_MODE_ALB:
1939         default:
1940                 rte_eth_promiscuous_enable(internals->current_primary_port);
1941         }
1942 }
1943
1944 static void
1945 bond_ethdev_promiscuous_disable(struct rte_eth_dev *dev)
1946 {
1947         struct bond_dev_private *internals = dev->data->dev_private;
1948         int i;
1949
1950         internals->promiscuous_en = 0;
1951
1952         switch (internals->mode) {
1953         /* Promiscuous mode is propagated to all slaves */
1954         case BONDING_MODE_ROUND_ROBIN:
1955         case BONDING_MODE_BALANCE:
1956         case BONDING_MODE_BROADCAST:
1957                 for (i = 0; i < internals->slave_count; i++)
1958                         rte_eth_promiscuous_disable(internals->slaves[i].port_id);
1959                 break;
1960         /* In mode 4, promiscuous mode is managed when a slave is added/removed */
1961         case BONDING_MODE_8023AD:
1962                 break;
1963         /* Promiscuous mode is propagated only to primary slave */
1964         case BONDING_MODE_ACTIVE_BACKUP:
1965         case BONDING_MODE_TLB:
1966         case BONDING_MODE_ALB:
1967         default:
1968                 rte_eth_promiscuous_disable(internals->current_primary_port);
1969         }
1970 }
1971
1972 static void
1973 bond_ethdev_delayed_lsc_propagation(void *arg)
1974 {
1975         if (arg == NULL)
1976                 return;
1977
1978         _rte_eth_dev_callback_process((struct rte_eth_dev *)arg,
1979                         RTE_ETH_EVENT_INTR_LSC, NULL);
1980 }
1981
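/*
 * Handle a slave link state change: on link up the slave is activated
 * (and, if it is the first active slave, elected primary and its link
 * properties inherited); on link down it is deactivated.  The resulting
 * bonded link state change is propagated to applications, optionally
 * delayed through the EAL alarm API.
 */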
1982 void
1983 bond_ethdev_lsc_event_callback(uint8_t port_id, enum rte_eth_event_type type,
1984                 void *param)
1985 {
1986         struct rte_eth_dev *bonded_eth_dev, *slave_eth_dev;
1987         struct bond_dev_private *internals;
1988         struct rte_eth_link link;
1989
1990         int i, valid_slave = 0;
1991         uint8_t active_pos;
1992         uint8_t lsc_flag = 0;
1993
1994         if (type != RTE_ETH_EVENT_INTR_LSC || param == NULL)
1995                 return;
1996
1997         bonded_eth_dev = &rte_eth_devices[*(uint8_t *)param];
1998         slave_eth_dev = &rte_eth_devices[port_id];
1999
2000         if (check_for_bonded_ethdev(bonded_eth_dev))
2001                 return;
2002
2003         internals = bonded_eth_dev->data->dev_private;
2004
2005         /* If the device isn't started don't handle interrupts */
2006         if (!bonded_eth_dev->data->dev_started)
2007                 return;
2008
2009         /* verify that port_id is a valid slave of bonded port */
2010         for (i = 0; i < internals->slave_count; i++) {
2011                 if (internals->slaves[i].port_id == port_id) {
2012                         valid_slave = 1;
2013                         break;
2014                 }
2015         }
2016
2017         if (!valid_slave)
2018                 return;
2019
2020         /* Search for port in active port list */
2021         active_pos = find_slave_by_id(internals->active_slaves,
2022                         internals->active_slave_count, port_id);
2023
2024         rte_eth_link_get_nowait(port_id, &link);
2025         if (link.link_status) {
2026                 if (active_pos < internals->active_slave_count)
2027                         return;
2028
2029                 /* if no active slave ports then set this port to be primary port */
2030                 if (internals->active_slave_count < 1) {
2031                         /* If first active slave, then change link status */
2032                         bonded_eth_dev->data->dev_link.link_status = ETH_LINK_UP;
2033                         internals->current_primary_port = port_id;
2034                         lsc_flag = 1;
2035
2036                         mac_address_slaves_update(bonded_eth_dev);
2037
2038                         /* Inherit eth dev link properties from first active slave */
2039                         link_properties_set(bonded_eth_dev,
2040                                         &(slave_eth_dev->data->dev_link));
2041                 } else {
2042                         if (link_properties_valid(
2043                                 &bonded_eth_dev->data->dev_link, &link) != 0) {
2044                                 slave_eth_dev->data->dev_flags &=
2045                                         (~RTE_ETH_DEV_BONDED_SLAVE);
2046                                 RTE_LOG(ERR, PMD,
2047                                         "port %u invalid speed/duplex\n",
2048                                         port_id);
2049                                 return;
2050                         }
2051                 }
2052
2053                 activate_slave(bonded_eth_dev, port_id);
2054
2055                 /* If user has defined the primary port then default to using it */
2056                 if (internals->user_defined_primary_port &&
2057                                 internals->primary_port == port_id)
2058                         bond_ethdev_primary_set(internals, port_id);
2059         } else {
2060                 if (active_pos == internals->active_slave_count)
2061                         return;
2062
2063                 /* Remove from active slave list */
2064                 deactivate_slave(bonded_eth_dev, port_id);
2065
2066                 /* No active slaves, change link status to down and reset other
2067                  * link properties */
2068                 if (internals->active_slave_count < 1) {
2069                         lsc_flag = 1;
2070                         bonded_eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
2071
2072                         link_properties_reset(bonded_eth_dev);
2073                 }
2074
2075         /* Update primary id: take the first active slave from the list or, if
2076          * none is available, fall back to the configured primary port */
2077                 if (port_id == internals->current_primary_port) {
2078                         if (internals->active_slave_count > 0)
2079                                 bond_ethdev_primary_set(internals,
2080                                                 internals->active_slaves[0]);
2081                         else
2082                                 internals->current_primary_port = internals->primary_port;
2083                 }
2084         }
2085
2086         if (lsc_flag) {
2087                 /* Cancel any possible outstanding interrupts if delays are enabled */
2088                 if (internals->link_up_delay_ms > 0 ||
2089                         internals->link_down_delay_ms > 0)
2090                         rte_eal_alarm_cancel(bond_ethdev_delayed_lsc_propagation,
2091                                         bonded_eth_dev);
2092
2093                 if (bonded_eth_dev->data->dev_link.link_status) {
2094                         if (internals->link_up_delay_ms > 0)
2095                                 rte_eal_alarm_set(internals->link_up_delay_ms * 1000,
2096                                                 bond_ethdev_delayed_lsc_propagation,
2097                                                 (void *)bonded_eth_dev);
2098                         else
2099                                 _rte_eth_dev_callback_process(bonded_eth_dev,
2100                                                 RTE_ETH_EVENT_INTR_LSC, NULL);
2101
2102                 } else {
2103                         if (internals->link_down_delay_ms > 0)
2104                                 rte_eal_alarm_set(internals->link_down_delay_ms * 1000,
2105                                                 bond_ethdev_delayed_lsc_propagation,
2106                                                 (void *)bonded_eth_dev);
2107                         else
2108                                 _rte_eth_dev_callback_process(bonded_eth_dev,
2109                                                 RTE_ETH_EVENT_INTR_LSC, NULL);
2110                 }
2111         }
2112 }
2113
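/*
 * Store the new RETA groups, replicate them across the rest of the bonded
 * table, then push the table to every slave using each slave's own RETA
 * size.
 */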
2114 static int
2115 bond_ethdev_rss_reta_update(struct rte_eth_dev *dev,
2116                 struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2117 {
2118         unsigned i, j;
2119         int result = 0;
2120         int slave_reta_size;
2121         unsigned reta_count;
2122         struct bond_dev_private *internals = dev->data->dev_private;
2123
2124         if (reta_size != internals->reta_size)
2125                 return -EINVAL;
2126
2127         /* Copy RETA table */
2128         reta_count = reta_size / RTE_RETA_GROUP_SIZE;
2129
2130         for (i = 0; i < reta_count; i++) {
2131                 internals->reta_conf[i].mask = reta_conf[i].mask;
2132                 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2133                         if ((reta_conf[i].mask >> j) & 0x01)
2134                                 internals->reta_conf[i].reta[j] = reta_conf[i].reta[j];
2135         }
2136
2137         /* Fill rest of array */
2138         for (; i < RTE_DIM(internals->reta_conf); i += reta_count)
2139                 memcpy(&internals->reta_conf[i], &internals->reta_conf[0],
2140                                 sizeof(internals->reta_conf[0]) * reta_count);
2141
2142         /* Propagate RETA over slaves */
2143         for (i = 0; i < internals->slave_count; i++) {
2144                 slave_reta_size = internals->slaves[i].reta_size;
2145                 result = rte_eth_dev_rss_reta_update(internals->slaves[i].port_id,
2146                                 &internals->reta_conf[0], slave_reta_size);
2147                 if (result < 0)
2148                         return result;
2149         }
2150
2151         return 0;
2152 }
2153
2154 static int
2155 bond_ethdev_rss_reta_query(struct rte_eth_dev *dev,
2156                 struct rte_eth_rss_reta_entry64 *reta_conf, uint16_t reta_size)
2157 {
2158         int i, j;
2159         struct bond_dev_private *internals = dev->data->dev_private;
2160
2161         if (reta_size != internals->reta_size)
2162                 return -EINVAL;
2163
2164         /* Copy RETA table */
2165         for (i = 0; i < reta_size / RTE_RETA_GROUP_SIZE; i++)
2166                 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2167                         if ((reta_conf[i].mask >> j) & 0x01)
2168                                 reta_conf[i].reta[j] = internals->reta_conf[i].reta[j];
2169
2170         return 0;
2171 }
2172
2173 static int
2174 bond_ethdev_rss_hash_update(struct rte_eth_dev *dev,
2175                 struct rte_eth_rss_conf *rss_conf)
2176 {
2177         int i, result = 0;
2178         struct bond_dev_private *internals = dev->data->dev_private;
2179         struct rte_eth_rss_conf bond_rss_conf;
2180
2181         memcpy(&bond_rss_conf, rss_conf, sizeof(struct rte_eth_rss_conf));
2182
2183         bond_rss_conf.rss_hf &= internals->flow_type_rss_offloads;
2184
2185         if (bond_rss_conf.rss_hf != 0)
2186                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf = bond_rss_conf.rss_hf;
2187
2188         if (bond_rss_conf.rss_key && bond_rss_conf.rss_key_len <
2189                         sizeof(internals->rss_key)) {
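                /* A zero length means "use the default"; assume the
                 * standard 40-byte Toeplitz key, which fits rss_key. */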
2190                 if (bond_rss_conf.rss_key_len == 0)
2191                         bond_rss_conf.rss_key_len = 40;
2192                 internals->rss_key_len = bond_rss_conf.rss_key_len;
2193                 memcpy(internals->rss_key, bond_rss_conf.rss_key,
2194                                 internals->rss_key_len);
2195         }
2196
2197         for (i = 0; i < internals->slave_count; i++) {
2198                 result = rte_eth_dev_rss_hash_update(internals->slaves[i].port_id,
2199                                 &bond_rss_conf);
2200                 if (result < 0)
2201                         return result;
2202         }
2203
2204         return 0;
2205 }
2206
2207 static int
2208 bond_ethdev_rss_hash_conf_get(struct rte_eth_dev *dev,
2209                 struct rte_eth_rss_conf *rss_conf)
2210 {
2211         struct bond_dev_private *internals = dev->data->dev_private;
2212
2213         rss_conf->rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
2214         rss_conf->rss_key_len = internals->rss_key_len;
2215         if (rss_conf->rss_key)
2216                 memcpy(rss_conf->rss_key, internals->rss_key, internals->rss_key_len);
2217
2218         return 0;
2219 }
2220
2221 const struct eth_dev_ops default_dev_ops = {
2222         .dev_start            = bond_ethdev_start,
2223         .dev_stop             = bond_ethdev_stop,
2224         .dev_close            = bond_ethdev_close,
2225         .dev_configure        = bond_ethdev_configure,
2226         .dev_infos_get        = bond_ethdev_info,
2227         .vlan_filter_set      = bond_ethdev_vlan_filter_set,
2228         .rx_queue_setup       = bond_ethdev_rx_queue_setup,
2229         .tx_queue_setup       = bond_ethdev_tx_queue_setup,
2230         .rx_queue_release     = bond_ethdev_rx_queue_release,
2231         .tx_queue_release     = bond_ethdev_tx_queue_release,
2232         .link_update          = bond_ethdev_link_update,
2233         .stats_get            = bond_ethdev_stats_get,
2234         .stats_reset          = bond_ethdev_stats_reset,
2235         .promiscuous_enable   = bond_ethdev_promiscuous_enable,
2236         .promiscuous_disable  = bond_ethdev_promiscuous_disable,
2237         .reta_update          = bond_ethdev_rss_reta_update,
2238         .reta_query           = bond_ethdev_rss_reta_query,
2239         .rss_hash_update      = bond_ethdev_rss_hash_update,
2240         .rss_hash_conf_get    = bond_ethdev_rss_hash_conf_get
2241 };
2242
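/*
 * Allocate and initialise the bonded ethdev and its private data.  This is
 * the backend reached both by the vdev probe below and, via the EAL vdev
 * machinery, by the public API.  A minimal sketch of the latter (the device
 * name and slave_port_id are placeholders):
 *
 *     int port = rte_eth_bond_create("net_bonding0",
 *                     BONDING_MODE_ACTIVE_BACKUP, rte_socket_id());
 *     if (port >= 0)
 *             rte_eth_bond_slave_add(port, slave_port_id);
 */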
2243 static int
2244 bond_alloc(struct rte_vdev_device *dev, uint8_t mode)
2245 {
2246         const char *name = rte_vdev_device_name(dev);
2247         uint8_t socket_id = dev->device.numa_node;
2248         struct bond_dev_private *internals = NULL;
2249         struct rte_eth_dev *eth_dev = NULL;
2250         uint32_t vlan_filter_bmp_size;
2251
2252         /* now do all data allocation - for eth_dev structure, dummy pci driver
2253          * and internal (private) data
2254          */
2255
2256         if (name == NULL) {
2257                 RTE_BOND_LOG(ERR, "Invalid name specified");
2258                 goto err;
2259         }
2260
2261         if (socket_id >= number_of_sockets()) {
2262                 RTE_BOND_LOG(ERR,
2263                                 "Invalid socket id specified to create bonded device on.");
2264                 goto err;
2265         }
2266
2267         internals = rte_zmalloc_socket(name, sizeof(*internals), 0, socket_id);
2268         if (internals == NULL) {
2269                 RTE_BOND_LOG(ERR, "Unable to malloc internals on socket");
2270                 goto err;
2271         }
2272
2273         /* reserve an ethdev entry */
2274         eth_dev = rte_eth_dev_allocate(name);
2275         if (eth_dev == NULL) {
2276                 RTE_BOND_LOG(ERR, "Unable to allocate rte_eth_dev");
2277                 goto err;
2278         }
2279
2280         eth_dev->data->dev_private = internals;
2281         eth_dev->data->nb_rx_queues = (uint16_t)1;
2282         eth_dev->data->nb_tx_queues = (uint16_t)1;
2283
2284         eth_dev->data->mac_addrs = rte_zmalloc_socket(name, ETHER_ADDR_LEN, 0,
2285                         socket_id);
2286         if (eth_dev->data->mac_addrs == NULL) {
2287                 RTE_BOND_LOG(ERR, "Unable to malloc mac_addrs");
2288                 goto err;
2289         }
2290
2291         eth_dev->dev_ops = &default_dev_ops;
2292         eth_dev->data->dev_flags = RTE_ETH_DEV_INTR_LSC |
2293                 RTE_ETH_DEV_DETACHABLE;
2294         eth_dev->driver = NULL;
2295         eth_dev->data->kdrv = RTE_KDRV_NONE;
2296         eth_dev->data->drv_name = pmd_bond_drv.driver.name;
2297         eth_dev->data->numa_node =  socket_id;
2298
2299         rte_spinlock_init(&internals->lock);
2300
2301         internals->port_id = eth_dev->data->port_id;
2302         internals->mode = BONDING_MODE_INVALID;
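        /* no primary selected yet: park the id outside the valid port range */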
2303         internals->current_primary_port = RTE_MAX_ETHPORTS + 1;
2304         internals->balance_xmit_policy = BALANCE_XMIT_POLICY_LAYER2;
2305         internals->xmit_hash = xmit_l2_hash;
2306         internals->user_defined_mac = 0;
2307         internals->link_props_set = 0;
2308
2309         internals->link_status_polling_enabled = 0;
2310
2311         internals->link_status_polling_interval_ms =
2312                 DEFAULT_POLLING_INTERVAL_10_MS;
2313         internals->link_down_delay_ms = 0;
2314         internals->link_up_delay_ms = 0;
2315
2316         internals->slave_count = 0;
2317         internals->active_slave_count = 0;
2318         internals->rx_offload_capa = 0;
2319         internals->tx_offload_capa = 0;
2320         internals->candidate_max_rx_pktlen = 0;
2321         internals->max_rx_pktlen = 0;
2322
2323         /* Initially allow any RSS offload type to be chosen */
2324         internals->flow_type_rss_offloads = ETH_RSS_PROTO_MASK;
2325
2326         memset(internals->active_slaves, 0, sizeof(internals->active_slaves));
2327         memset(internals->slaves, 0, sizeof(internals->slaves));
2328
2329         /* Set mode 4 default configuration */
2330         bond_mode_8023ad_setup(eth_dev, NULL);
2331         if (bond_ethdev_mode_set(eth_dev, mode)) {
2332                 RTE_BOND_LOG(ERR, "Failed to set bonded device %d mode to %d",
2333                                  eth_dev->data->port_id, mode);
2334                 goto err;
2335         }
2336
2337         vlan_filter_bmp_size =
2338                 rte_bitmap_get_memory_footprint(ETHER_MAX_VLAN_ID + 1);
2339         internals->vlan_filter_bmpmem = rte_malloc(name, vlan_filter_bmp_size,
2340                                                    RTE_CACHE_LINE_SIZE);
2341         if (internals->vlan_filter_bmpmem == NULL) {
2342                 RTE_BOND_LOG(ERR,
2343                              "Failed to allocate vlan bitmap for bonded device %u\n",
2344                              eth_dev->data->port_id);
2345                 goto err;
2346         }
2347
2348         internals->vlan_filter_bmp = rte_bitmap_init(ETHER_MAX_VLAN_ID + 1,
2349                         internals->vlan_filter_bmpmem, vlan_filter_bmp_size);
2350         if (internals->vlan_filter_bmp == NULL) {
2351                 RTE_BOND_LOG(ERR,
2352                              "Failed to init vlan bitmap for bonded device %u\n",
2353                              eth_dev->data->port_id);
2354                 rte_free(internals->vlan_filter_bmpmem);
2355                 goto err;
2356         }
2357
2358         return eth_dev->data->port_id;
2359
2360 err:
2361         rte_free(internals);
2362         if (eth_dev != NULL) {
2363                 rte_free(eth_dev->data->mac_addrs);
2364                 rte_eth_dev_release_port(eth_dev);
2365         }
2366         return -1;
2367 }
2368
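/*
 * Vdev probe entry point.  A bonded device is typically requested on the
 * EAL command line, e.g. (device name and PCI addresses are placeholders):
 *
 *     --vdev 'net_bonding0,mode=1,slave=0000:00:04.0,slave=0000:00:05.0'
 *
 * Only the mode and optional socket id are consumed here; the remaining
 * kvargs are saved and applied later in bond_ethdev_configure().
 */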
2369 static int
2370 bond_probe(struct rte_vdev_device *dev)
2371 {
2372         const char *name;
2373         struct bond_dev_private *internals;
2374         struct rte_kvargs *kvlist;
2375         uint8_t bonding_mode, socket_id;
2376         int  arg_count, port_id;
2377
2378         if (!dev)
2379                 return -EINVAL;
2380
2381         name = rte_vdev_device_name(dev);
2382         RTE_LOG(INFO, EAL, "Initializing pmd_bond for %s\n", name);
2383
2384         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev),
2385                 pmd_bond_init_valid_arguments);
2386         if (kvlist == NULL)
2387                 return -1;
2388
2389         /* Parse link bonding mode */
2390         if (rte_kvargs_count(kvlist, PMD_BOND_MODE_KVARG) == 1) {
2391                 if (rte_kvargs_process(kvlist, PMD_BOND_MODE_KVARG,
2392                                 &bond_ethdev_parse_slave_mode_kvarg,
2393                                 &bonding_mode) != 0) {
2394                         RTE_LOG(ERR, EAL, "Invalid mode for bonded device %s\n",
2395                                         name);
2396                         goto parse_error;
2397                 }
2398         } else {
2399                 RTE_LOG(ERR, EAL, "Mode must be specified exactly once for bonded "
2400                                 "device %s\n", name);
2401                 goto parse_error;
2402         }
2403
2404         /* Parse socket id to create bonding device on */
2405         arg_count = rte_kvargs_count(kvlist, PMD_BOND_SOCKET_ID_KVARG);
2406         if (arg_count == 1) {
2407                 if (rte_kvargs_process(kvlist, PMD_BOND_SOCKET_ID_KVARG,
2408                                 &bond_ethdev_parse_socket_id_kvarg, &socket_id)
2409                                 != 0) {
2410                         RTE_LOG(ERR, EAL, "Invalid socket id specified for "
2411                                         "bonded device %s\n", name);
2412                         goto parse_error;
2413                 }
2414         } else if (arg_count > 1) {
2415                 RTE_LOG(ERR, EAL, "Socket id can be specified only once for "
2416                                 "bonded device %s\n", name);
2417                 goto parse_error;
2418         } else {
2419                 socket_id = rte_socket_id();
2420         }
2421
2422         dev->device.numa_node = socket_id;
2423
2424         /* Create link bonding eth device */
2425         port_id = bond_alloc(dev, bonding_mode);
2426         if (port_id < 0) {
2427                 RTE_LOG(ERR, EAL, "Failed to create bonded device %s in mode %u on "
2428                                 "socket %u.\n", name, bonding_mode, socket_id);
2429                 goto parse_error;
2430         }
2431         internals = rte_eth_devices[port_id].data->dev_private;
2432         internals->kvlist = kvlist;
2433
2434         RTE_LOG(INFO, EAL, "Create bonded device %s on port %d in mode %u on "
2435                         "socket %u.\n", name, port_id, bonding_mode, socket_id);
2436         return 0;
2437
2438 parse_error:
2439         rte_kvargs_free(kvlist);
2440
2441         return -1;
2442 }
2443
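/*
 * Vdev remove entry point: refuses to tear the device down while slaves
 * are still attached (-EBUSY), otherwise stops and closes the port and
 * frees everything bond_alloc() set up.
 */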
2444 static int
2445 bond_remove(struct rte_vdev_device *dev)
2446 {
2447         struct rte_eth_dev *eth_dev;
2448         struct bond_dev_private *internals;
2449         const char *name;
2450
2451         if (!dev)
2452                 return -EINVAL;
2453
2454         name = rte_vdev_device_name(dev);
2455         RTE_LOG(INFO, EAL, "Uninitializing pmd_bond for %s\n", name);
2456
2457         /* now free all data allocation - for eth_dev structure,
2458          * dummy pci driver and internal (private) data
2459          */
2460
2461         /* find an ethdev entry */
2462         eth_dev = rte_eth_dev_allocated(name);
2463         if (eth_dev == NULL)
2464                 return -ENODEV;
2465
2466         RTE_ASSERT(eth_dev->device == &dev->device);
2467
2468         internals = eth_dev->data->dev_private;
2469         if (internals->slave_count != 0)
2470                 return -EBUSY;
2471
2472         if (eth_dev->data->dev_started == 1) {
2473                 bond_ethdev_stop(eth_dev);
2474                 bond_ethdev_close(eth_dev);
2475         }
2476
2477         eth_dev->dev_ops = NULL;
2478         eth_dev->rx_pkt_burst = NULL;
2479         eth_dev->tx_pkt_burst = NULL;
2480
2482         rte_bitmap_free(internals->vlan_filter_bmp);
2483         rte_free(internals->vlan_filter_bmpmem);
2484         rte_free(eth_dev->data->dev_private);
2485         rte_free(eth_dev->data->mac_addrs);
2486
2487         rte_eth_dev_release_port(eth_dev);
2488
2489         return 0;
2490 }
2491
2492 /* this part resolves the slave port ids after all the other pdevs and
2493  * vdevs have been allocated */
2494 static int
2495 bond_ethdev_configure(struct rte_eth_dev *dev)
2496 {
2497         char *name = dev->data->name;
2498         struct bond_dev_private *internals = dev->data->dev_private;
2499         struct rte_kvargs *kvlist = internals->kvlist;
2500         int arg_count;
2501         uint8_t port_id = dev - rte_eth_devices;
2502
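        /* Well-known default 40-byte Toeplitz hash key; it is copied into
         * the bonded device's key at configure time whenever RSS is
         * enabled. */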
2503         static const uint8_t default_rss_key[40] = {
2504                 0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2, 0x41, 0x67, 0x25, 0x3D,
2505                 0x43, 0xA3, 0x8F, 0xB0, 0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
2506                 0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C, 0x6A, 0x42, 0xB7, 0x3B,
2507                 0xBE, 0xAC, 0x01, 0xFA
2508         };
2509
2510         unsigned i, j;
2511
2512         /* If RSS is enabled, fill table and key with default values */
2513         if (dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS) {
2514                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key = internals->rss_key;
2515                 dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key_len = 0;
2516                 memcpy(internals->rss_key, default_rss_key, 40);
2517
2518                 for (i = 0; i < RTE_DIM(internals->reta_conf); i++) {
2519                         internals->reta_conf[i].mask = ~0LL;
2520                         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
2521                                 internals->reta_conf[i].reta[j] = j % dev->data->nb_rx_queues;
2522                 }
2523         }
2524
2525         /* set the max_rx_pktlen */
2526         internals->max_rx_pktlen = internals->candidate_max_rx_pktlen;
2527
2528         /*
2529          * if no kvlist, it means that this bonded device has been created
2530          * through the bonding API.
2531          */
2532         if (!kvlist)
2533                 return 0;
2534
2535         /* Parse MAC address for bonded device */
2536         arg_count = rte_kvargs_count(kvlist, PMD_BOND_MAC_ADDR_KVARG);
2537         if (arg_count == 1) {
2538                 struct ether_addr bond_mac;
2539
2540                 if (rte_kvargs_process(kvlist, PMD_BOND_MAC_ADDR_KVARG,
2541                                 &bond_ethdev_parse_bond_mac_addr_kvarg, &bond_mac) < 0) {
2542                         RTE_LOG(INFO, EAL, "Invalid mac address for bonded device %s\n",
2543                                         name);
2544                         return -1;
2545                 }
2546
2547                 /* Set MAC address */
2548                 if (rte_eth_bond_mac_address_set(port_id, &bond_mac) != 0) {
2549                         RTE_LOG(ERR, EAL,
2550                                         "Failed to set mac address on bonded device %s\n",
2551                                         name);
2552                         return -1;
2553                 }
2554         } else if (arg_count > 1) {
2555                 RTE_LOG(ERR, EAL,
2556                                 "MAC address can be specified only once for bonded device %s\n",
2557                                 name);
2558                 return -1;
2559         }
2560
2561         /* Parse/set balance mode transmit policy */
2562         arg_count = rte_kvargs_count(kvlist, PMD_BOND_XMIT_POLICY_KVARG);
2563         if (arg_count == 1) {
2564                 uint8_t xmit_policy;
2565
2566                 if (rte_kvargs_process(kvlist, PMD_BOND_XMIT_POLICY_KVARG,
2567                                 &bond_ethdev_parse_balance_xmit_policy_kvarg, &xmit_policy) !=
2568                                                 0) {
2569                         RTE_LOG(INFO, EAL,
2570                                         "Invalid xmit policy specified for bonded device %s\n",
2571                                         name);
2572                         return -1;
2573                 }
2574
2575                 /* Set balance mode transmit policy */
2576                 if (rte_eth_bond_xmit_policy_set(port_id, xmit_policy) != 0) {
2577                         RTE_LOG(ERR, EAL,
2578                                         "Failed to set balance xmit policy on bonded device %s\n",
2579                                         name);
2580                         return -1;
2581                 }
2582         } else if (arg_count > 1) {
2583                 RTE_LOG(ERR, EAL,
2584                                 "Transmit policy can be specified only once for bonded device"
2585                                 " %s\n", name);
2586                 return -1;
2587         }
2588
2589         /* Parse/add slave ports to bonded device */
2590         if (rte_kvargs_count(kvlist, PMD_BOND_SLAVE_PORT_KVARG) > 0) {
2591                 struct bond_ethdev_slave_ports slave_ports;
2592                 unsigned i;
2593
2594                 memset(&slave_ports, 0, sizeof(slave_ports));
2595
2596                 if (rte_kvargs_process(kvlist, PMD_BOND_SLAVE_PORT_KVARG,
2597                                 &bond_ethdev_parse_slave_port_kvarg, &slave_ports) != 0) {
2598                         RTE_LOG(ERR, EAL,
2599                                         "Failed to parse slave ports for bonded device %s\n",
2600                                         name);
2601                         return -1;
2602                 }
2603
2604                 for (i = 0; i < slave_ports.slave_count; i++) {
2605                         if (rte_eth_bond_slave_add(port_id, slave_ports.slaves[i]) != 0) {
2606                                 RTE_LOG(ERR, EAL,
2607                                                 "Failed to add port %d as slave to bonded device %s\n",
2608                                                 slave_ports.slaves[i], name);
2609                         }
2610                 }
2611
2612         } else {
2613                 RTE_LOG(INFO, EAL, "No slaves specified for bonded device %s\n", name);
2614                 return -1;
2615         }
2616
2617         /* Parse/set primary slave port id */
2618         arg_count = rte_kvargs_count(kvlist, PMD_BOND_PRIMARY_SLAVE_KVARG);
2619         if (arg_count == 1) {
2620                 uint8_t primary_slave_port_id;
2621
2622                 if (rte_kvargs_process(kvlist,
2623                                 PMD_BOND_PRIMARY_SLAVE_KVARG,
2624                                 &bond_ethdev_parse_primary_slave_port_id_kvarg,
2625                                 &primary_slave_port_id) < 0) {
2626                         RTE_LOG(INFO, EAL,
2627                                         "Invalid primary slave port id specified for bonded device"
2628                                         " %s\n", name);
2629                         return -1;
2630                 }
2631
2632                 /* Set primary slave port id */
2633                 if (rte_eth_bond_primary_set(port_id, (uint8_t)primary_slave_port_id)
2634                                 != 0) {
2635                         RTE_LOG(ERR, EAL,
2636                                         "Failed to set primary slave port %d on bonded device %s\n",
2637                                         primary_slave_port_id, name);
2638                         return -1;
2639                 }
2640         } else if (arg_count > 1) {
2641                 RTE_LOG(INFO, EAL,
2642                                 "Primary slave can be specified only once for bonded device"
2643                                 " %s\n", name);
2644                 return -1;
2645         }
2646
2647         /* Parse link status monitor polling interval */
2648         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LSC_POLL_PERIOD_KVARG);
2649         if (arg_count == 1) {
2650                 uint32_t lsc_poll_interval_ms;
2651
2652                 if (rte_kvargs_process(kvlist,
2653                                 PMD_BOND_LSC_POLL_PERIOD_KVARG,
2654                                 &bond_ethdev_parse_time_ms_kvarg,
2655                                 &lsc_poll_interval_ms) < 0) {
2656                         RTE_LOG(ERR, EAL,
2657                                         "Invalid lsc polling interval value specified for bonded"
2658                                         " device %s\n", name);
2659                         return -1;
2660                 }
2661
2662                 if (rte_eth_bond_link_monitoring_set(port_id, lsc_poll_interval_ms)
2663                                 != 0) {
2664                         RTE_LOG(ERR, EAL,
2665                                         "Failed to set lsc monitor polling interval (%u ms) on"
2666                                         " bonded device %s\n", lsc_poll_interval_ms, name);
2667                         return -1;
2668                 }
2669         } else if (arg_count > 1) {
2670                 RTE_LOG(ERR, EAL,
2671                                 "LSC polling interval can be specified only once for bonded"
2672                                 " device %s\n", name);
2673                 return -1;
2674         }
2675
2676         /* Parse link up interrupt propagation delay */
2677         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_UP_PROP_DELAY_KVARG);
2678         if (arg_count == 1) {
2679                 uint32_t link_up_delay_ms;
2680
2681                 if (rte_kvargs_process(kvlist,
2682                                 PMD_BOND_LINK_UP_PROP_DELAY_KVARG,
2683                                 &bond_ethdev_parse_time_ms_kvarg,
2684                                 &link_up_delay_ms) < 0) {
2685                         RTE_LOG(ERR, EAL,
2686                                         "Invalid link up propagation delay value specified for"
2687                                         " bonded device %s\n", name);
2688                         return -1;
2689                 }
2690
2691                 /* Set link up propagation delay */
2692                 if (rte_eth_bond_link_up_prop_delay_set(port_id, link_up_delay_ms)
2693                                 != 0) {
2694                         RTE_LOG(ERR, EAL,
2695                                         "Failed to set link up propagation delay (%u ms) on bonded"
2696                                         " device %s\n", link_up_delay_ms, name);
2697                         return -1;
2698                 }
2699         } else if (arg_count > 1) {
2700                 RTE_LOG(ERR, EAL,
2701                                 "Link up propagation delay can be specified only once for"
2702                                 " bonded device %s\n", name);
2703                 return -1;
2704         }
2705
2706         /* Parse link down interrupt propagation delay */
2707         arg_count = rte_kvargs_count(kvlist, PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG);
2708         if (arg_count == 1) {
2709                 uint32_t link_down_delay_ms;
2710
2711                 if (rte_kvargs_process(kvlist,
2712                                 PMD_BOND_LINK_DOWN_PROP_DELAY_KVARG,
2713                                 &bond_ethdev_parse_time_ms_kvarg,
2714                                 &link_down_delay_ms) < 0) {
2715                         RTE_LOG(ERR, EAL,
2716                                         "Invalid link down propagation delay value specified for"
2717                                         " bonded device %s\n", name);
2718                         return -1;
2719                 }
2720
2721                 /* Set link down propagation delay */
2722                 if (rte_eth_bond_link_down_prop_delay_set(port_id, link_down_delay_ms)
2723                                 != 0) {
2724                         RTE_LOG(ERR, EAL,
2725                                         "Failed to set link down propagation delay (%u ms) on"
2726                                         " bonded device %s\n", link_down_delay_ms, name);
2727                         return -1;
2728                 }
2729         } else if (arg_count > 1) {
2730                 RTE_LOG(ERR, EAL,
2731                                 "Link down propagation delay can be specified only once for"
2732                                 " bonded device %s\n", name);
2733                 return -1;
2734         }
2735
2736         return 0;
2737 }
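For context, each handler passed to rte_kvargs_process() above follows the
rte_kvargs callback shape: it receives the key, its string value, and an
opaque destination pointer, and returns 0 on success or a negative value to
make rte_kvargs_process() fail. The real handlers (e.g.
bond_ethdev_parse_time_ms_kvarg) live in rte_eth_bond_args.c; the sketch
below is only a minimal illustration of that contract, not the actual
implementation (the helper name and the strtoul/errno validation choices are
assumptions).

#include <errno.h>      /* in addition to the includes already in this file */

/* Hypothetical handler: parse a base-10 millisecond value into a uint32_t. */
static int
example_parse_time_ms_kvarg(const char *key __rte_unused,
                const char *value, void *extra_args)
{
        char *endptr = NULL;
        unsigned long time_ms;

        if (value == NULL || extra_args == NULL)
                return -1;

        errno = 0;
        time_ms = strtoul(value, &endptr, 10);
        /* Reject empty strings, trailing junk, and out-of-range values. */
        if (endptr == value || *endptr != '\0' || errno != 0 ||
                        time_ms > UINT32_MAX)
                return -1;

        *(uint32_t *)extra_args = (uint32_t)time_ms;
        return 0;
}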
2738
2739 struct rte_vdev_driver pmd_bond_drv = {
2740         .probe = bond_probe,
2741         .remove = bond_remove,
2742 };
2743
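/*
 * RTE_PMD_REGISTER_VDEV hooks the driver into EAL through a constructor that
 * runs at startup, so "net_bonding" devices can be instantiated from --vdev
 * arguments; the alias keeps the legacy "eth_bond" name working.
 */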
2744 RTE_PMD_REGISTER_VDEV(net_bonding, pmd_bond_drv);
2745 RTE_PMD_REGISTER_ALIAS(net_bonding, eth_bond);
2746
2747 RTE_PMD_REGISTER_PARAM_STRING(net_bonding,
2748         "slave=<ifc> "
2749         "primary=<ifc> "
2750         "mode=[0-6] "
2751         "xmit_policy=[l2 | l23 | l34] "
2752         "socket_id=<int> "
2753         "mac=<mac addr> "
2754         "lsc_poll_period_ms=<int> "
2755         "up_delay=<int> "
2756         "down_delay=<int>");
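Putting the parameter string to work: an illustrative (not prescriptive)
invocation, with placeholder PCI addresses, that creates a balance-mode
(mode=2) bonded device hashing on layer 3/4 headers:

testpmd -c 0xf -n 4 \
    --vdev 'net_bonding0,mode=2,xmit_policy=l34,slave=0000:0a:00.0,slave=0000:0a:00.1,up_delay=10,down_delay=50' \
    -- -i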