From 46fb436836790ebcfb9779773ec827682ad05a43 Mon Sep 17 00:00:00 2001 From: Pawel Wodkowski Date: Thu, 27 Nov 2014 18:01:10 +0000 Subject: [PATCH] bond: add mode 4 This patch set adds support for dynamic link aggregation (mode 4) to the librte_pmd_bond library. This mode provides auto negotiation/configuration of peers as well as link status change monitoring using out-of-band LACP (link aggregation control protocol) messages. For further details of the LACP specification see the IEEE 802.3ad/802.1AX standards. It is also described here https://www.kernel.org/doc/Documentation/networking/bonding.txt. In this implementation we have an array of mode 4 settings for each slave. There is also an assumption that every port has one aggregator (it might be unused if a better one is found). Difference in this implementation vs Linux implementation: - this implementation is not directly based on state machines; instead the current state is calculated from actor and partner states (and other things too). Some implementation details: - during rx burst every packet is checked to see whether it is an LACP or marker packet. If it is an LACP frame it is passed to the mode 4 logic using the slave's rx ring and removed from the rx buffer before the buffer is returned - in tx burst, packets from mode 4 (if any) are injected into each slave. - there is a timer running in the background to process mode 4 frames from the rx functions and produce frames for the tx functions. Some requirements for this mode: - for LACP mode to work rx and tx burst functions must be invoked at least once every 100ms - the buffer provided to rx burst should hold at least 2x the slave count. This is not required but might improve performance, especially during the initial handshake. 
Signed-off-by: Pawel Wodkowski Acked-by: Declan Doherty --- lib/librte_ether/rte_ether.h | 1 + lib/librte_pmd_bond/Makefile | 2 + lib/librte_pmd_bond/rte_eth_bond.h | 19 + lib/librte_pmd_bond/rte_eth_bond_8023ad.c | 1216 +++++++++++++++++ lib/librte_pmd_bond/rte_eth_bond_8023ad.h | 214 +++ .../rte_eth_bond_8023ad_private.h | 308 +++++ lib/librte_pmd_bond/rte_eth_bond_api.c | 91 +- lib/librte_pmd_bond/rte_eth_bond_args.c | 1 + lib/librte_pmd_bond/rte_eth_bond_pmd.c | 265 +++- lib/librte_pmd_bond/rte_eth_bond_private.h | 31 +- 10 files changed, 2100 insertions(+), 48 deletions(-) create mode 100644 lib/librte_pmd_bond/rte_eth_bond_8023ad.c create mode 100644 lib/librte_pmd_bond/rte_eth_bond_8023ad.h create mode 100644 lib/librte_pmd_bond/rte_eth_bond_8023ad_private.h diff --git a/lib/librte_ether/rte_ether.h b/lib/librte_ether/rte_ether.h index 187608d539..7e7d22cca7 100644 --- a/lib/librte_ether/rte_ether.h +++ b/lib/librte_ether/rte_ether.h @@ -328,6 +328,7 @@ struct vxlan_hdr { #define ETHER_TYPE_RARP 0x8035 /**< Reverse Arp Protocol. */ #define ETHER_TYPE_VLAN 0x8100 /**< IEEE 802.1Q VLAN tagging. */ #define ETHER_TYPE_1588 0x88F7 /**< IEEE 802.1AS 1588 Precise Time Protocol. */ +#define ETHER_TYPE_SLOW 0x8809 /**< Slow protocols (LACP and Marker). */ #define ETHER_VXLAN_HLEN (sizeof(struct udp_hdr) + sizeof(struct vxlan_hdr)) /**< VXLAN tunnel header length. */ diff --git a/lib/librte_pmd_bond/Makefile b/lib/librte_pmd_bond/Makefile index d4e10bf5a7..cdff126278 100644 --- a/lib/librte_pmd_bond/Makefile +++ b/lib/librte_pmd_bond/Makefile @@ -45,6 +45,7 @@ CFLAGS += $(WERROR_FLAGS) SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_api.c SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_pmd.c SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_args.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_8023ad.c ifeq ($(CONFIG_RTE_MBUF_REFCNT),n) $(info WARNING: Link Bonding Broadcast mode is disabled because it needs MBUF_REFCNT.) 
@@ -54,6 +55,7 @@ endif # Export include files # SYMLINK-y-include += rte_eth_bond.h +SYMLINK-y-include += rte_eth_bond_8023ad.h # this lib depends upon: DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += lib/librte_mbuf diff --git a/lib/librte_pmd_bond/rte_eth_bond.h b/lib/librte_pmd_bond/rte_eth_bond.h index 085500b425..9101f64049 100644 --- a/lib/librte_pmd_bond/rte_eth_bond.h +++ b/lib/librte_pmd_bond/rte_eth_bond.h @@ -77,6 +77,25 @@ extern "C" { * In this mode all transmitted packets will be transmitted on all available * active slaves of the bonded. */ #endif +#define BONDING_MODE_8023AD (4) +/**< 802.3AD (Mode 4). + * + * This mode provides auto negotiation/configuration + * of peers as well as link status changes monitoring using out of band + * LACP (link aggregation control protocol) messages. For further details of + * LACP specification see the IEEE 802.3ad/802.1AX standards. It is also + * described here + * https://www.kernel.org/doc/Documentation/networking/bonding.txt. + * + * Important Usage Notes: + * - for LACP mode to work the rx/tx burst functions must be invoked + * at least once every 100ms, otherwise the out-of-band LACP messages will not + * be handled with the expected latency and this may cause the link status to be + * incorrectly marked as down or failure to correctly negotiate with peers. + * - For optimal performance during initial handshaking the array of mbufs provided + * to rx_burst should be at least 2 times the slave count size. + * + */ /* Balance Mode Transmit Policies */ #define BALANCE_XMIT_POLICY_LAYER2 (0) /**< Layer 2 (Ethernet MAC) */ diff --git a/lib/librte_pmd_bond/rte_eth_bond_8023ad.c b/lib/librte_pmd_bond/rte_eth_bond_8023ad.c new file mode 100644 index 0000000000..f1cf81a63a --- /dev/null +++ b/lib/librte_pmd_bond/rte_eth_bond_8023ad.c @@ -0,0 +1,1216 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "rte_eth_bond_private.h" + +#ifdef RTE_LIBRTE_BOND_DEBUG_8023AD +#define MODE4_DEBUG(fmt, ...) 
RTE_LOG(DEBUG, PMD, "%6u [Port %u: %s] " fmt, \ + bond_dbg_get_time_diff_ms(), slave_id, \ + __func__, ##__VA_ARGS__) + +static uint64_t start_time; + +static unsigned +bond_dbg_get_time_diff_ms(void) +{ + uint64_t now; + + now = rte_rdtsc(); + if (start_time == 0) + start_time = now; + + return ((now - start_time) * 1000) / rte_get_tsc_hz(); +} + +static void +bond_print_lacp(struct lacpdu *l) +{ + char a_address[18]; + char p_address[18]; + char a_state[256] = { 0 }; + char p_state[256] = { 0 }; + + static const char * const state_labels[] = { + "ACT", "TIMEOUT", "AGG", "SYNC", "COL", "DIST", "DEF", "EXP" + }; + + int a_len = 0; + int p_len = 0; + uint8_t i; + uint8_t *addr; + + addr = l->actor.port_params.system.addr_bytes; + snprintf(a_address, sizeof(a_address), "%02X:%02X:%02X:%02X:%02X:%02X", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + + addr = l->partner.port_params.system.addr_bytes; + snprintf(p_address, sizeof(p_address), "%02X:%02X:%02X:%02X:%02X:%02X", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + + for (i = 0; i < 8; i++) { + if ((l->actor.state >> i) & 1) { + a_len += snprintf(&a_state[a_len], RTE_DIM(a_state) - a_len, "%s ", + state_labels[i]); + } + + if ((l->partner.state >> i) & 1) { + p_len += snprintf(&p_state[p_len], RTE_DIM(p_state) - p_len, "%s ", + state_labels[i]); + } + } + + if (a_len && a_state[a_len-1] == ' ') + a_state[a_len-1] = '\0'; + + if (p_len && p_state[p_len-1] == ' ') + p_state[p_len-1] = '\0'; + + RTE_LOG(DEBUG, PMD, "LACP: {\n"\ + " subtype= %02X\n"\ + " ver_num=%02X\n"\ + " actor={ tlv=%02X, len=%02X\n"\ + " pri=%04X, system=%s, key=%04X, p_pri=%04X p_num=%04X\n"\ + " state={ %s }\n"\ + " }\n"\ + " partner={ tlv=%02X, len=%02X\n"\ + " pri=%04X, system=%s, key=%04X, p_pri=%04X p_num=%04X\n"\ + " state={ %s }\n"\ + " }\n"\ + " collector={info=%02X, length=%02X, max_delay=%04X\n, " \ + "type_term=%02X, terminator_length = %02X}\n",\ + l->subtype,\ + l->version_number,\ + l->actor.tlv_type_info,\ + 
l->actor.info_length,\ + l->actor.port_params.system_priority,\ + a_address,\ + l->actor.port_params.key,\ + l->actor.port_params.port_priority,\ + l->actor.port_params.port_number,\ + a_state,\ + l->partner.tlv_type_info,\ + l->partner.info_length,\ + l->partner.port_params.system_priority,\ + p_address,\ + l->partner.port_params.key,\ + l->partner.port_params.port_priority,\ + l->partner.port_params.port_number,\ + p_state,\ + l->tlv_type_collector_info,\ + l->collector_info_length,\ + l->collector_max_delay,\ + l->tlv_type_terminator,\ + l->terminator_length); + +} +#define BOND_PRINT_LACP(lacpdu) bond_print_lacp(lacpdu) +#else +#define BOND_PRINT_LACP(lacpdu) do { } while (0) +#define MODE4_DEBUG(fmt, ...) do { } while (0) +#endif + +static const struct ether_addr lacp_mac_addr = { + .addr_bytes = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x02 } +}; + +struct port mode_8023ad_ports[RTE_MAX_ETHPORTS]; + +static void +timer_cancel(uint64_t *timer) +{ + *timer = 0; +} + +static void +timer_set(uint64_t *timer, uint64_t timeout) +{ + *timer = rte_rdtsc() + timeout; +} + +/* Forces given timer to be in expired state. 
*/ +static void +timer_force_expired(uint64_t *timer) +{ + *timer = rte_rdtsc(); +} + +static bool +timer_is_stopped(uint64_t *timer) +{ + return *timer == 0; +} + +static bool +timer_is_expired(uint64_t *timer) +{ + return *timer < rte_rdtsc(); +} + +/* Timer is in running state if it is not stopped nor expired */ +static bool +timer_is_running(uint64_t *timer) +{ + return !timer_is_stopped(timer) && !timer_is_expired(timer); +} + +static void +set_warning_flags(struct port *port, uint16_t flags) +{ + int retval; + uint16_t old; + uint16_t new_flag = 0; + + do { + old = port->warnings_to_show; + new_flag = old | flags; + retval = rte_atomic16_cmpset(&port->warnings_to_show, old, new_flag); + } while (unlikely(retval == 0)); +} + +static void +show_warnings(uint8_t slave_id) +{ + struct port *port = &mode_8023ad_ports[slave_id]; + uint8_t warnings; + + do { + warnings = port->warnings_to_show; + } while (rte_atomic16_cmpset(&port->warnings_to_show, warnings, 0) == 0); + + if (!warnings) + return; + + if (!timer_is_expired(&port->warning_timer)) + return; + + + timer_set(&port->warning_timer, BOND_8023AD_WARNINGS_PERIOD_MS * + rte_get_tsc_hz() / 1000); + + if (warnings & WRN_RX_QUEUE_FULL) { + RTE_LOG(DEBUG, PMD, + "Slave %u: failed to enqueue LACP packet into RX ring.\n" + "Receive and transmit functions must be invoked on bonded\n" + "interface at least 10 times per second or LACP will not\n" + "work correctly\n", slave_id); + } + + if (warnings & WRN_TX_QUEUE_FULL) { + RTE_LOG(DEBUG, PMD, + "Slave %u: failed to enqueue LACP packet into TX ring.\n" + "Receive and transmit functions must be invoked on bonded\n" + "interface at least 10 times per second or LACP will not\n" + "work correctly\n", slave_id); + } + + if (warnings & WRN_RX_MARKER_TO_FAST) + RTE_LOG(INFO, PMD, "Slave %u: marker to early - ignoring.\n", slave_id); + + if (warnings & WRN_UNKNOWN_SLOW_TYPE) { + RTE_LOG(INFO, PMD, + "Slave %u: ignoring unknown slow protocol frame type", slave_id); + } + + if 
(warnings & WRN_UNKNOWN_MARKER_TYPE) + RTE_LOG(INFO, PMD, "Slave %u: ignoring unknown marker type", slave_id); + + if (warnings & WRN_NOT_LACP_CAPABLE) + MODE4_DEBUG("Port %u is not LACP capable!\n", slave_id); +} + +static void +record_default(struct port *port) +{ + /* Record default parameters for partner. Partner admin parameters + * are not implemented so set them to arbitrary default (last known) and + * mark actor that parner is in defaulted state. */ + port->partner_state = STATE_LACP_ACTIVE; + ACTOR_STATE_SET(port, DEFAULTED); +} + +/** Function handles rx state machine. + * + * This function implements Receive State Machine from point 5.4.12 in + * 802.1AX documentation. It should be called periodically. + * + * @param lacpdu LACPDU received. + * @param port Port on which LACPDU was received. + */ +static void +rx_machine(struct bond_dev_private *internals, uint8_t slave_id, + struct lacpdu *lacp) +{ + struct port *agg, *port = &mode_8023ad_ports[slave_id]; + uint64_t timeout; + + if (SM_FLAG(port, BEGIN)) { + /* Initialize stuff */ + MODE4_DEBUG("-> INITIALIZE\n"); + SM_FLAG_CLR(port, MOVED); + port->selected = UNSELECTED; + + record_default(port); + + ACTOR_STATE_CLR(port, EXPIRED); + timer_cancel(&port->current_while_timer); + + /* DISABLED: On initialization partner is out of sync */ + PARTNER_STATE_CLR(port, SYNCHRONIZATION); + + /* LACP DISABLED stuff if LACP not enabled on this port */ + if (!SM_FLAG(port, LACP_ENABLED)) + PARTNER_STATE_CLR(port, AGGREGATION); + else + PARTNER_STATE_SET(port, AGGREGATION); + } + + if (!SM_FLAG(port, LACP_ENABLED)) { + /* Update parameters only if state changed */ + if (!timer_is_stopped(&port->current_while_timer)) { + port->selected = UNSELECTED; + record_default(port); + PARTNER_STATE_CLR(port, AGGREGATION); + ACTOR_STATE_CLR(port, EXPIRED); + timer_cancel(&port->current_while_timer); + } + return; + } + + if (lacp) { + MODE4_DEBUG("LACP -> CURRENT\n"); + BOND_PRINT_LACP(lacp); + /* Update selected flag. 
If partner parameters are defaulted assume they + * are match. If not defaulted compare LACP actor with ports parner + * params. */ + if (!ACTOR_STATE(port, DEFAULTED) && + (ACTOR_STATE(port, AGGREGATION) != PARTNER_STATE(port, AGGREGATION) + || memcmp(&port->partner, &lacp->actor.port_params, + sizeof(port->partner)) != 0)) { + MODE4_DEBUG("selected <- UNSELECTED\n"); + port->selected = UNSELECTED; + } + + /* Record this PDU actor params as partner params */ + memcpy(&port->partner, &lacp->actor.port_params, + sizeof(struct port_params)); + port->partner_state = lacp->actor.state; + + /* Partner parameters are not defaulted any more */ + ACTOR_STATE_CLR(port, DEFAULTED); + + /* If LACP partner params match this port actor params */ + agg = &mode_8023ad_ports[port->aggregator_port_id]; + bool match = port->actor.system_priority == + lacp->partner.port_params.system_priority && + is_same_ether_addr(&agg->actor.system, + &lacp->partner.port_params.system) && + port->actor.port_priority == + lacp->partner.port_params.port_priority && + port->actor.port_number == + lacp->partner.port_params.port_number; + + /* Update NTT if partners information are outdated (xored and masked + * bits are set)*/ + uint8_t state_mask = STATE_LACP_ACTIVE | STATE_LACP_SHORT_TIMEOUT | + STATE_SYNCHRONIZATION | STATE_AGGREGATION; + + if (((port->actor_state ^ lacp->partner.state) & state_mask) || + match == false) { + SM_FLAG_SET(port, NTT); + } + + /* If LACP partner params match this port actor params */ + if (match == true && ACTOR_STATE(port, AGGREGATION) == + PARTNER_STATE(port, AGGREGATION)) + PARTNER_STATE_SET(port, SYNCHRONIZATION); + else if (!PARTNER_STATE(port, AGGREGATION) && ACTOR_STATE(port, + AGGREGATION)) + PARTNER_STATE_SET(port, SYNCHRONIZATION); + else + PARTNER_STATE_CLR(port, SYNCHRONIZATION); + + if (ACTOR_STATE(port, LACP_SHORT_TIMEOUT)) + timeout = internals->mode4.short_timeout; + else + timeout = internals->mode4.long_timeout; + + 
timer_set(&port->current_while_timer, timeout); + ACTOR_STATE_CLR(port, EXPIRED); + return; /* No state change */ + } + + /* If CURRENT state timer is not running (stopped or expired) + * transit to EXPIRED state from DISABLED or CURRENT */ + if (!timer_is_running(&port->current_while_timer)) { + ACTOR_STATE_SET(port, EXPIRED); + PARTNER_STATE_CLR(port, SYNCHRONIZATION); + PARTNER_STATE_SET(port, LACP_SHORT_TIMEOUT); + timer_set(&port->current_while_timer, internals->mode4.short_timeout); + } +} + +/** + * Function handles periodic tx state machine. + * + * Function implements Periodic Transmission state machine from point 5.4.13 + * in 802.1AX documentation. It should be called periodically. + * + * @param port Port to handle state machine. + */ +static void +periodic_machine(struct bond_dev_private *internals, uint8_t slave_id) +{ + struct port *port = &mode_8023ad_ports[slave_id]; + /* Calculate if either site is LACP enabled */ + uint64_t timeout; + uint8_t active = ACTOR_STATE(port, LACP_ACTIVE) || + PARTNER_STATE(port, LACP_ACTIVE); + + uint8_t is_partner_fast, was_partner_fast; + /* No periodic is on BEGIN, LACP DISABLE or when both sides are pasive */ + if (SM_FLAG(port, BEGIN) || !SM_FLAG(port, LACP_ENABLED) || !active) { + timer_cancel(&port->periodic_timer); + timer_force_expired(&port->tx_machine_timer); + SM_FLAG_CLR(port, PARTNER_SHORT_TIMEOUT); + + MODE4_DEBUG("-> NO_PERIODIC ( %s%s%s)\n", + SM_FLAG(port, BEGIN) ? "begind " : "", + SM_FLAG(port, LACP_ENABLED) ? "" : "LACP disabled ", + active ? "LACP active " : "LACP pasive "); + return; + } + + is_partner_fast = PARTNER_STATE(port, LACP_SHORT_TIMEOUT); + was_partner_fast = SM_FLAG(port, PARTNER_SHORT_TIMEOUT); + + /* If periodic timer is not started, transit from NO PERIODIC to FAST/SLOW. + * Other case: check if timer expire or partners settings changed. 
*/ + if (!timer_is_stopped(&port->periodic_timer)) { + if (timer_is_expired(&port->periodic_timer)) { + SM_FLAG_SET(port, NTT); + } else if (is_partner_fast != was_partner_fast) { + /* Partners timeout was slow and now it is fast -> send LACP. + * In other case (was fast and now it is slow) just switch + * timeout to slow without forcing send of LACP (because standard + * say so)*/ + if (!is_partner_fast) + SM_FLAG_SET(port, NTT); + } else + return; /* Nothing changed */ + } + + /* Handle state transition to FAST/SLOW LACP timeout */ + if (is_partner_fast) { + timeout = internals->mode4.fast_periodic_timeout; + SM_FLAG_SET(port, PARTNER_SHORT_TIMEOUT); + } else { + timeout = internals->mode4.slow_periodic_timeout; + SM_FLAG_CLR(port, PARTNER_SHORT_TIMEOUT); + } + + timer_set(&port->periodic_timer, timeout); +} + +/** + * Function handles mux state machine. + * + * Function implements Mux Machine from point 5.4.15 in 802.1AX documentation. + * It should be called periodically. + * + * @param port Port to handle state machine. 
+ */ +static void +mux_machine(struct bond_dev_private *internals, uint8_t slave_id) +{ + struct port *port = &mode_8023ad_ports[slave_id]; + + /* Save current state for later use */ + const uint8_t state_mask = STATE_SYNCHRONIZATION | STATE_DISTRIBUTING | + STATE_COLLECTING; + + /* Enter DETACHED state on BEGIN condition or from any other state if + * port was unselected */ + if (SM_FLAG(port, BEGIN) || + port->selected == UNSELECTED || (port->selected == STANDBY && + (port->actor_state & state_mask) != 0)) { + /* detach mux from aggregator */ + port->actor_state &= ~state_mask; + /* Set ntt to true if BEGIN condition or transition from any other state + * which is indicated that wait_while_timer was started */ + if (SM_FLAG(port, BEGIN) || + !timer_is_stopped(&port->wait_while_timer)) { + SM_FLAG_SET(port, NTT); + MODE4_DEBUG("-> DETACHED\n"); + } + timer_cancel(&port->wait_while_timer); + } + + if (timer_is_stopped(&port->wait_while_timer)) { + if (port->selected == SELECTED || port->selected == STANDBY) { + timer_set(&port->wait_while_timer, + internals->mode4.aggregate_wait_timeout); + + MODE4_DEBUG("DETACHED -> WAITING\n"); + } + /* Waiting state entered */ + return; + } + + /* Transit next state if port is ready */ + if (!timer_is_expired(&port->wait_while_timer)) + return; + + if ((ACTOR_STATE(port, DISTRIBUTING) || ACTOR_STATE(port, COLLECTING)) && + !PARTNER_STATE(port, SYNCHRONIZATION)) { + /* If in COLLECTING or DISTRIBUTING state and partner becomes out of + * sync transit to ATACHED state. 
*/ + ACTOR_STATE_CLR(port, DISTRIBUTING); + ACTOR_STATE_CLR(port, COLLECTING); + /* Clear actor sync to activate transit ATACHED in condition bellow */ + ACTOR_STATE_CLR(port, SYNCHRONIZATION); + MODE4_DEBUG("Out of sync -> ATTACHED\n"); + } + + if (!ACTOR_STATE(port, SYNCHRONIZATION)) { + /* attach mux to aggregator */ + RTE_VERIFY((port->actor_state & (STATE_COLLECTING | + STATE_DISTRIBUTING)) == 0); + + ACTOR_STATE_SET(port, SYNCHRONIZATION); + SM_FLAG_SET(port, NTT); + MODE4_DEBUG("ATTACHED Entered\n"); + } else if (!ACTOR_STATE(port, COLLECTING)) { + /* Start collecting if in sync */ + if (PARTNER_STATE(port, SYNCHRONIZATION)) { + MODE4_DEBUG("ATTACHED -> COLLECTING\n"); + ACTOR_STATE_SET(port, COLLECTING); + SM_FLAG_SET(port, NTT); + } + } else if (ACTOR_STATE(port, COLLECTING)) { + /* Check if partner is in COLLECTING state. If so this port can + * distribute frames to it */ + if (!ACTOR_STATE(port, DISTRIBUTING)) { + if (PARTNER_STATE(port, COLLECTING)) { + /* Enable DISTRIBUTING if partner is collecting */ + ACTOR_STATE_SET(port, DISTRIBUTING); + SM_FLAG_SET(port, NTT); + MODE4_DEBUG("COLLECTING -> DISTRIBUTING\n"); + RTE_LOG(INFO, PMD, + "Bond %u: slave id %u distributing started.\n", + internals->port_id, slave_id); + } + } else { + if (!PARTNER_STATE(port, COLLECTING)) { + /* Disable DISTRIBUTING (enter COLLECTING state) if partner + * is not collecting */ + ACTOR_STATE_CLR(port, DISTRIBUTING); + SM_FLAG_SET(port, NTT); + MODE4_DEBUG("DISTRIBUTING -> COLLECTING\n"); + RTE_LOG(INFO, PMD, + "Bond %u: slave id %u distributing stopped.\n", + internals->port_id, slave_id); + } + } + } +} + +/** + * Function handles transmit state machine. + * + * Function implements Transmit Machine from point 5.4.16 in 802.1AX + * documentation. 
+ * + * @param port + */ +static void +tx_machine(struct bond_dev_private *internals, uint8_t slave_id) +{ + struct port *agg, *port = &mode_8023ad_ports[slave_id]; + + struct rte_mbuf *lacp_pkt = NULL; + struct lacpdu_header *hdr; + struct lacpdu *lacpdu; + + /* If periodic timer is not running periodic machine is in NO PERIODIC and + * according to 802.3ax standard tx machine should not transmit any frames + * and set ntt to false. */ + if (timer_is_stopped(&port->periodic_timer)) + SM_FLAG_CLR(port, NTT); + + if (!SM_FLAG(port, NTT)) + return; + + if (!timer_is_expired(&port->tx_machine_timer)) + return; + + lacp_pkt = rte_pktmbuf_alloc(port->mbuf_pool); + if (lacp_pkt == NULL) { + RTE_LOG(ERR, PMD, "Failed to allocate LACP packet from pool\n"); + return; + } + + lacp_pkt->data_len = sizeof(*hdr); + lacp_pkt->pkt_len = sizeof(*hdr); + + hdr = rte_pktmbuf_mtod(lacp_pkt, struct lacpdu_header *); + + /* Source and destination MAC */ + ether_addr_copy(&lacp_mac_addr, &hdr->eth_hdr.d_addr); + rte_eth_macaddr_get(slave_id, &hdr->eth_hdr.s_addr); + hdr->eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_SLOW); + + lacpdu = &hdr->lacpdu; + memset(lacpdu, 0, sizeof(*lacpdu)); + + /* Initialize LACP part */ + lacpdu->subtype = SLOW_SUBTYPE_LACP; + lacpdu->version_number = 1; + + /* ACTOR */ + lacpdu->actor.tlv_type_info = TLV_TYPE_ACTOR_INFORMATION; + lacpdu->actor.info_length = sizeof(struct lacpdu_actor_partner_params); + memcpy(&hdr->lacpdu.actor.port_params, &port->actor, + sizeof(port->actor)); + agg = &mode_8023ad_ports[port->aggregator_port_id]; + ether_addr_copy(&agg->actor.system, &hdr->lacpdu.actor.port_params.system); + lacpdu->actor.state = port->actor_state; + + /* PARTNER */ + lacpdu->partner.tlv_type_info = TLV_TYPE_PARTNER_INFORMATION; + lacpdu->partner.info_length = sizeof(struct lacpdu_actor_partner_params); + memcpy(&lacpdu->partner.port_params, &port->partner, + sizeof(struct port_params)); + lacpdu->partner.state = port->partner_state; + + /* Other 
fields */ + lacpdu->tlv_type_collector_info = TLV_TYPE_COLLECTOR_INFORMATION; + lacpdu->collector_info_length = 0x10; + lacpdu->collector_max_delay = 0; + + lacpdu->tlv_type_terminator = TLV_TYPE_TERMINATOR_INFORMATION; + lacpdu->terminator_length = 0; + + if (rte_ring_enqueue(port->tx_ring, lacp_pkt) == -ENOBUFS) { + /* If TX ring full, drop packet and free message. Retransmission + * will happen in next function call. */ + rte_pktmbuf_free(lacp_pkt); + set_warning_flags(port, WRN_TX_QUEUE_FULL); + return; + } + + MODE4_DEBUG("sending LACP frame\n"); + BOND_PRINT_LACP(lacpdu); + + timer_set(&port->tx_machine_timer, internals->mode4.tx_period_timeout); + SM_FLAG_CLR(port, NTT); +} + +/** + * Function assigns port to aggregator. + * + * @param bond_dev_private Pointer to bond_dev_private structure. + * @param port_pos Port to assign. + */ +static void +selection_logic(struct bond_dev_private *internals, uint8_t slave_id) +{ + struct port *agg, *port; + uint8_t slaves_count, new_agg_id, i; + uint8_t *slaves; + + slaves = internals->active_slaves; + slaves_count = internals->active_slave_count; + port = &mode_8023ad_ports[slave_id]; + + /* Search for aggregator suitable for this port */ + for (i = 0; i < slaves_count; ++i) { + agg = &mode_8023ad_ports[slaves[i]]; + /* Skip ports that are not aggreagators */ + if (agg->aggregator_port_id != slaves[i]) + continue; + + /* Actors system ID is not checked since all slave device have the same + * ID (MAC address). 
*/ + if ((agg->actor.key == port->actor.key && + agg->partner.system_priority == port->partner.system_priority && + is_same_ether_addr(&agg->partner.system, &port->partner.system) == 1 + && (agg->partner.key == port->partner.key)) && + is_zero_ether_addr(&port->partner.system) != 1 && + (agg->actor.key & + rte_cpu_to_be_16(BOND_LINK_FULL_DUPLEX_KEY)) != 0) { + + break; + } + } + + /* By default, port uses it self as agregator */ + if (i == slaves_count) + new_agg_id = slave_id; + else + new_agg_id = slaves[i]; + + if (new_agg_id != port->aggregator_port_id) { + port->aggregator_port_id = new_agg_id; + + MODE4_DEBUG("-> SELECTED: ID=%3u\n" + "\t%s aggregator ID=%3u\n", + port->aggregator_port_id, + port->aggregator_port_id == slave_id ? + "aggregator not found, using default" : "aggregator found", + port->aggregator_port_id); + } + + port->selected = SELECTED; +} + +/* Function maps DPDK speed to bonding speed stored in key field */ +static uint16_t +link_speed_key(uint16_t speed) { + uint16_t key_speed; + + switch (speed) { + case ETH_LINK_SPEED_AUTONEG: + key_speed = 0x00; + break; + case ETH_LINK_SPEED_10: + key_speed = BOND_LINK_SPEED_KEY_10M; + break; + case ETH_LINK_SPEED_100: + key_speed = BOND_LINK_SPEED_KEY_100M; + break; + case ETH_LINK_SPEED_1000: + key_speed = BOND_LINK_SPEED_KEY_1000M; + break; + case ETH_LINK_SPEED_10G: + key_speed = BOND_LINK_SPEED_KEY_10G; + break; + case ETH_LINK_SPEED_20G: + key_speed = BOND_LINK_SPEED_KEY_20G; + break; + case ETH_LINK_SPEED_40G: + key_speed = BOND_LINK_SPEED_KEY_40G; + break; + default: + /* Unknown speed*/ + key_speed = 0xFFFF; + } + + return key_speed; +} + +static void +bond_mode_8023ad_periodic_cb(void *arg) +{ + struct rte_eth_dev *bond_dev = arg; + struct bond_dev_private *internals = bond_dev->data->dev_private; + struct port *port; + struct rte_eth_link link_info; + struct ether_addr slave_addr; + + void *pkt = NULL; + uint16_t i, slave_id; + + + /* Update link status on each port */ + for (i = 0; i < 
internals->active_slave_count; i++) { + uint16_t key; + + slave_id = internals->active_slaves[i]; + rte_eth_link_get(slave_id, &link_info); + rte_eth_macaddr_get(slave_id, &slave_addr); + + if (link_info.link_status != 0) { + key = link_speed_key(link_info.link_speed) << 1; + if (link_info.link_duplex == ETH_LINK_FULL_DUPLEX) + key |= BOND_LINK_FULL_DUPLEX_KEY; + } else + key = 0; + + port = &mode_8023ad_ports[slave_id]; + + key = rte_cpu_to_be_16(key); + if (key != port->actor.key) { + if (!(key & rte_cpu_to_be_16(BOND_LINK_FULL_DUPLEX_KEY))) + set_warning_flags(port, WRN_NOT_LACP_CAPABLE); + + port->actor.key = key; + SM_FLAG_SET(port, NTT); + } + + if (!is_same_ether_addr(&port->actor.system, &slave_addr)) { + ether_addr_copy(&slave_addr, &port->actor.system); + if (port->aggregator_port_id == slave_id) + SM_FLAG_SET(port, NTT); + } + } + + for (i = 0; i < internals->active_slave_count; i++) { + slave_id = internals->active_slaves[i]; + port = &mode_8023ad_ports[slave_id]; + + if ((port->actor.key & + rte_cpu_to_be_16(BOND_LINK_FULL_DUPLEX_KEY)) == 0) { + + SM_FLAG_SET(port, BEGIN); + + /* LACP is disabled on half duples or link is down */ + if (SM_FLAG(port, LACP_ENABLED)) { + /* If port was enabled set it to BEGIN state */ + SM_FLAG_CLR(port, LACP_ENABLED); + ACTOR_STATE_CLR(port, DISTRIBUTING); + ACTOR_STATE_CLR(port, COLLECTING); + } + + /* Skip this port processing */ + continue; + } + + SM_FLAG_SET(port, LACP_ENABLED); + + /* Find LACP packet to this port. 
Do not check subtype, it is done in
+		 * function that queued packet */
+		if (rte_ring_dequeue(port->rx_ring, &pkt) == 0) {
+			struct rte_mbuf *lacp_pkt = pkt;
+			struct lacpdu_header *lacp;
+
+			lacp = rte_pktmbuf_mtod(lacp_pkt, struct lacpdu_header *);
+			RTE_VERIFY(lacp->lacpdu.subtype == SLOW_SUBTYPE_LACP);
+
+			/* This is LACP frame so pass it to rx_machine */
+			rx_machine(internals, slave_id, &lacp->lacpdu);
+			rte_pktmbuf_free(lacp_pkt);
+		} else
+			rx_machine(internals, slave_id, NULL);
+
+		periodic_machine(internals, slave_id);
+		mux_machine(internals, slave_id);
+		tx_machine(internals, slave_id);
+		selection_logic(internals, slave_id);
+
+		SM_FLAG_CLR(port, BEGIN);
+		show_warnings(slave_id);
+	}
+
+	rte_eal_alarm_set(internals->mode4.update_timeout_us,
+			bond_mode_8023ad_periodic_cb, arg);
+}
+
+/**
+ * Prepare the mode 4 (802.3ad) per-port state of a slave that is about to
+ * become active.
+ *
+ * Resets the actor and partner parameters of mode_8023ad_ports[slave_id] to
+ * the defaults in 'initial', makes the port its own aggregator with the BEGIN
+ * flag set, enables promiscuous mode on the slave and cancels its warning
+ * timer. When the port has no mbuf pool yet (first activation) it also
+ * creates the mbuf pool and the rx/tx rings; later activations reuse them.
+ * Any failure to create the pool or a ring ends in rte_panic().
+ *
+ * The slave must not yet be in internals->active_slaves (RTE_VERIFY).
+ *
+ * @param bond_dev	Bonded device the slave belongs to.
+ * @param slave_id	Port id of the slave being activated.
+ */
+void
+bond_mode_8023ad_activate_slave(struct rte_eth_dev *bond_dev, uint8_t slave_id)
+{
+	struct bond_dev_private *internals = bond_dev->data->dev_private;
+
+	struct port *port = &mode_8023ad_ports[slave_id];
+	/* Default actor/partner parameters; the 16-bit fields are stored
+	 * big endian. */
+	struct port_params initial = {
+			.system = { { 0 } },
+			.system_priority = rte_cpu_to_be_16(0xFFFF),
+			.key = rte_cpu_to_be_16(BOND_LINK_FULL_DUPLEX_KEY),
+			.port_priority = rte_cpu_to_be_16(0x00FF),
+			.port_number = 0,
+	};
+
+	char mem_name[RTE_ETH_NAME_MAX_LEN];
+	uint8_t socket_id;
+	unsigned element_size;
+
+	/* Given slave must not be in active list */
+	RTE_VERIFY(find_slave_by_id(internals->active_slaves,
+	internals->active_slave_count, slave_id) == internals->active_slave_count);
+
+	memcpy(&port->actor, &initial, sizeof(struct port_params));
+	/* Standard requires that port ID must be greater than 0.
+	 * Add 1 to get corresponding port_number */
+	port->actor.port_number = rte_cpu_to_be_16((uint16_t)slave_id + 1);
+
+	memcpy(&port->partner, &initial, sizeof(struct port_params));
+
+	/* default states */
+	port->actor_state = STATE_AGGREGATION | STATE_LACP_ACTIVE | STATE_DEFAULTED;
+	port->partner_state = STATE_LACP_ACTIVE;
+	port->sm_flags = SM_FLAGS_BEGIN;
+
+	/* use this port as aggregator */
+	port->aggregator_port_id = slave_id;
+	rte_eth_promiscuous_enable(slave_id);
+
+	timer_cancel(&port->warning_timer);
+
+	/* Pool and rings already exist from an earlier activation: reuse them. */
+	if (port->mbuf_pool != NULL)
+		return;
+
+	RTE_VERIFY(port->rx_ring == NULL);
+	RTE_VERIFY(port->tx_ring == NULL);
+	/* NOTE(review): socket_id is a uint8_t and pci_dev is dereferenced
+	 * unconditionally - presumably numa_node is never negative and every
+	 * slave is PCI backed here; confirm for virtual devices. */
+	socket_id = rte_eth_devices[slave_id].pci_dev->numa_node;
+
+	element_size = sizeof(struct slow_protocol_frame) + sizeof(struct rte_mbuf)
+				+ RTE_PKTMBUF_HEADROOM;
+
+	/* How big memory pool should be? If driver will not
+	 * free packets quick enough there will be ENOMEM in tx_machine.
+	 * For now give 511 pkts * max number of queued TX packets per slave.
+	 * Hope it will be enough. */
+	snprintf(mem_name, RTE_DIM(mem_name), "slave_port%u_pool", slave_id);
+	port->mbuf_pool = rte_mempool_create(mem_name,
+		BOND_MODE_8023AX_SLAVE_TX_PKTS * 512 - 1,
+		element_size,
+		RTE_MEMPOOL_CACHE_MAX_SIZE >= 32 ? 32 : RTE_MEMPOOL_CACHE_MAX_SIZE,
+		sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init,
+		NULL, rte_pktmbuf_init, NULL, socket_id, MEMPOOL_F_NO_SPREAD);
+
+	/* Any memory allocation failure in initialization is critical because
+	 * resources can't be freed, so reinitialization is impossible. */
+	if (port->mbuf_pool == NULL) {
+		rte_panic("Slave %u: Failed to create memory pool '%s': %s\n",
+			slave_id, mem_name, rte_strerror(rte_errno));
+	}
+
+	snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_rx", slave_id);
+	port->rx_ring = rte_ring_create(mem_name,
+			rte_align32pow2(BOND_MODE_8023AX_SLAVE_RX_PKTS), socket_id, 0);
+
+	if (port->rx_ring == NULL) {
+		rte_panic("Slave %u: Failed to create rx ring '%s': %s\n", slave_id,
+			mem_name, rte_strerror(rte_errno));
+	}
+
+	/* TX ring is at least one pkt longer to make room for marker packet. */
+	snprintf(mem_name, RTE_DIM(mem_name), "slave_%u_tx", slave_id);
+	port->tx_ring = rte_ring_create(mem_name,
+			rte_align32pow2(BOND_MODE_8023AX_SLAVE_TX_PKTS + 1), socket_id, 0);
+
+	if (port->tx_ring == NULL) {
+		rte_panic("Slave %u: Failed to create tx ring '%s': %s\n", slave_id,
+			mem_name, rte_strerror(rte_errno));
+	}
+}
+
+/**
+ * Remove a slave from mode 4 processing.
+ *
+ * Every active port that used this slave as its aggregator is marked
+ * UNSELECTED and falls back to being its own aggregator. The slave itself is
+ * marked UNSELECTED, its SYNCHRONIZATION / DISTRIBUTING / COLLECTING actor
+ * state bits are cleared, and all packets still queued on its rx and tx
+ * rings are freed.
+ *
+ * The slave must currently be in internals->active_slaves (RTE_VERIFY).
+ *
+ * @param bond_dev	Bonded device the slave belongs to.
+ * @param slave_id	Port id of the slave being deactivated.
+ *
+ * @return
+ *   Always 0.
+ */
+int
+bond_mode_8023ad_deactivate_slave(struct rte_eth_dev *bond_dev,
+		uint8_t slave_id)
+{
+	struct bond_dev_private *internals = bond_dev->data->dev_private;
+	void *pkt = NULL;
+	struct port *port;
+	uint8_t i;
+
+	/* Given slave must be in active list */
+	RTE_VERIFY(find_slave_by_id(internals->active_slaves,
+	internals->active_slave_count, slave_id) < internals->active_slave_count);
+
+	/* Exclude slave from transmit policy. If this slave is an aggregator
+	 * make all aggregated slaves unselected to force selection logic
+	 * to select suitable aggregator for this port.
*/
+	for (i = 0; i < internals->active_slave_count; i++) {
+		port = &mode_8023ad_ports[internals->active_slaves[i]];
+		if (port->aggregator_port_id != slave_id)
+			continue;
+
+		port->selected = UNSELECTED;
+
+		/* Use default aggregator */
+		port->aggregator_port_id = internals->active_slaves[i];
+	}
+
+	port = &mode_8023ad_ports[slave_id];
+	port->selected = UNSELECTED;
+	port->actor_state &= ~(STATE_SYNCHRONIZATION | STATE_DISTRIBUTING |
+			STATE_COLLECTING);
+
+	/* Drain both rings so no queued slow-protocol packet is leaked. */
+	while (rte_ring_dequeue(port->rx_ring, &pkt) == 0)
+		rte_pktmbuf_free((struct rte_mbuf *)pkt);
+
+	while (rte_ring_dequeue(port->tx_ring, &pkt) == 0)
+		rte_pktmbuf_free((struct rte_mbuf *)pkt);
+	return 0;
+}
+
+/**
+ * Refresh the actor system address of every active slave after a MAC change.
+ *
+ * The periodic alarm is stopped while the ports are updated. For each active
+ * slave whose current MAC differs from actor.system the new MAC is copied in
+ * and, if that slave is its own aggregator, NTT is set on every active port
+ * that uses it as aggregator. The alarm is restarted only when the bonded
+ * device is started.
+ *
+ * @param bond_dev	Bonded device.
+ */
+void
+bond_mode_8023ad_mac_address_update(struct rte_eth_dev *bond_dev)
+{
+	struct bond_dev_private *internals = bond_dev->data->dev_private;
+	struct ether_addr slave_addr;
+	struct port *slave, *agg_slave;
+	uint8_t slave_id, i, j;
+
+	bond_mode_8023ad_stop(bond_dev);
+
+	for (i = 0; i < internals->active_slave_count; i++) {
+		slave_id = internals->active_slaves[i];
+		slave = &mode_8023ad_ports[slave_id];
+		rte_eth_macaddr_get(slave_id, &slave_addr);
+
+		if (is_same_ether_addr(&slave_addr, &slave->actor.system))
+			continue;
+
+		ether_addr_copy(&slave_addr, &slave->actor.system);
+		/* Do nothing if this port is not an aggregator. In other case
+		 * Set NTT flag on every port that use this aggregator. */
+		if (slave->aggregator_port_id != slave_id)
+			continue;
+
+		for (j = 0; j < internals->active_slave_count; j++) {
+			agg_slave = &mode_8023ad_ports[internals->active_slaves[j]];
+			if (agg_slave->aggregator_port_id == slave_id)
+				SM_FLAG_SET(agg_slave, NTT);
+		}
+	}
+
+	if (bond_dev->data->dev_started)
+		bond_mode_8023ad_start(bond_dev);
+}
+
+/**
+ * Report the current mode 4 timing configuration in milliseconds.
+ *
+ * The stored timeouts are kept in TSC ticks (update period in microseconds)
+ * and are converted back here. Every field of *conf is written, so the
+ * result can be handed back unchanged to bond_mode_8023ad_setup().
+ *
+ * @param dev	Bonded device.
+ * @param conf	Output structure; must not be NULL.
+ */
+void
+bond_mode_8023ad_conf_get(struct rte_eth_dev *dev,
+		struct rte_eth_bond_8023ad_conf *conf)
+{
+	struct bond_dev_private *internals = dev->data->dev_private;
+	struct mode8023ad_private *mode4 = &internals->mode4;
+	uint64_t ms_ticks = rte_get_tsc_hz() / 1000;
+
+	conf->fast_periodic_ms = mode4->fast_periodic_timeout / ms_ticks;
+	conf->slow_periodic_ms = mode4->slow_periodic_timeout / ms_ticks;
+	conf->short_timeout_ms = mode4->short_timeout / ms_ticks;
+	conf->long_timeout_ms = mode4->long_timeout / ms_ticks;
+	conf->aggregate_wait_timeout_ms = mode4->aggregate_wait_timeout / ms_ticks;
+	conf->tx_period_ms = mode4->tx_period_timeout / ms_ticks;
+	/* Was missing: without it rx_marker_period_ms is left uninitialized and
+	 * a get/modify/setup round trip can fail the sanity check. */
+	conf->rx_marker_period_ms = mode4->rx_marker_timeout / ms_ticks;
+	conf->update_timeout_ms = mode4->update_timeout_us / 1000;
+}
+
+/**
+ * Store a mode 4 timing configuration.
+ *
+ * Millisecond values are converted to TSC ticks, the update period to
+ * microseconds. No validation is done here.
+ *
+ * @param dev	Bonded device.
+ * @param conf	New configuration, or NULL to load the compile-time defaults.
+ */
+void
+bond_mode_8023ad_setup(struct rte_eth_dev *dev,
+		struct rte_eth_bond_8023ad_conf *conf)
+{
+	struct rte_eth_bond_8023ad_conf def_conf;
+	struct bond_dev_private *internals = dev->data->dev_private;
+	struct mode8023ad_private *mode4 = &internals->mode4;
+	uint64_t ms_ticks = rte_get_tsc_hz() / 1000;
+
+	if (conf == NULL) {
+		conf = &def_conf;
+		conf->fast_periodic_ms = BOND_8023AD_FAST_PERIODIC_MS;
+		conf->slow_periodic_ms = BOND_8023AD_SLOW_PERIODIC_MS;
+		conf->short_timeout_ms = BOND_8023AD_SHORT_TIMEOUT_MS;
+		conf->long_timeout_ms = BOND_8023AD_LONG_TIMEOUT_MS;
+		conf->aggregate_wait_timeout_ms = BOND_8023AD_AGGREGATE_WAIT_TIMEOUT_MS;
+		conf->tx_period_ms = BOND_8023AD_TX_MACHINE_PERIOD_MS;
+		conf->rx_marker_period_ms = BOND_8023AD_RX_MARKER_PERIOD_MS;
+		conf->update_timeout_ms = BOND_MODE_8023AX_UPDATE_TIMEOUT_MS;
+	}
+
+	mode4->fast_periodic_timeout = conf->fast_periodic_ms * ms_ticks;
+	mode4->slow_periodic_timeout = conf->slow_periodic_ms * ms_ticks;
+	mode4->short_timeout = conf->short_timeout_ms * ms_ticks;
+	mode4->long_timeout = conf->long_timeout_ms * ms_ticks;
+	mode4->aggregate_wait_timeout = conf->aggregate_wait_timeout_ms * ms_ticks;
+	mode4->tx_period_timeout = conf->tx_period_ms * ms_ticks;
+	mode4->rx_marker_timeout = conf->rx_marker_period_ms * ms_ticks;
+	mode4->update_timeout_us = conf->update_timeout_ms * 1000;
+}
+
+/**
+ * Run mode 4 activation once per entry of the active slave list.
+ *
+ * @param bond_dev	Bonded device.
+ * @return
+ *   Always 0.
+ */
+int
+bond_mode_8023ad_enable(struct rte_eth_dev *bond_dev)
+{
+	struct bond_dev_private *internals = bond_dev->data->dev_private;
+	uint16_t i;
+
+	/* NOTE(review): the loop index i is passed as the slave port id, not
+	 * internals->active_slaves[i]. bond_mode_8023ad_activate_slave() also
+	 * verifies that its argument is not an active slave, so this looks
+	 * safe only while no slave is active - confirm with the caller. */
+	for (i = 0; i < internals->active_slave_count; i++)
+		bond_mode_8023ad_activate_slave(bond_dev, i);
+
+	return 0;
+}
+
+/**
+ * Arm the alarm that drives the mode 4 state machines.
+ *
+ * The first expiry uses the compile-time default period.
+ *
+ * @param bond_dev	Bonded device, passed to the callback as its argument.
+ * @return
+ *   Return value of rte_eal_alarm_set().
+ */
+int
+bond_mode_8023ad_start(struct rte_eth_dev *bond_dev)
+{
+	return rte_eal_alarm_set(BOND_MODE_8023AX_UPDATE_TIMEOUT_MS * 1000,
+			&bond_mode_8023ad_periodic_cb, bond_dev);
+}
+
+/**
+ * Cancel the alarm armed for this bonded device.
+ *
+ * @param bond_dev	Bonded device whose periodic callback is cancelled.
+ */
+void
+bond_mode_8023ad_stop(struct rte_eth_dev *bond_dev)
+{
+	rte_eal_alarm_cancel(&bond_mode_8023ad_periodic_cb, bond_dev);
+}
+
+/**
+ * Take ownership of one slow-protocol frame received on a slave.
+ *
+ * Marker information frames are rewritten in place into marker responses
+ * and queued on the port's tx_ring, rate limited by rx_marker_timer. LACP
+ * frames are queued on the port's rx_ring for the state machines. Anything
+ * else, or any frame that cannot be queued, is freed and a warning flag is
+ * recorded on the port.
+ *
+ * @param internals	Bonded device private data.
+ * @param slave_id	Port id of the slave the frame arrived on.
+ * @param pkt		The slow-protocol frame.
+ */
+void
+bond_mode_8023ad_handle_slow_pkt(struct bond_dev_private *internals,
+	uint8_t slave_id, struct rte_mbuf *pkt)
+{
+	struct mode8023ad_private *mode4 = &internals->mode4;
+	struct port *port = &mode_8023ad_ports[slave_id];
+	struct marker_header *m_hdr;
+	uint64_t marker_timer, old_marker_timer;
+	int retval;
+	uint8_t wrn, subtype;
+	/* If packet is a marker, we send response now by reusing given packet
+	 * and update only source MAC, destination MAC is multicast so don't
+	 * update it. Other frames will be handled later by state machines */
+	subtype = rte_pktmbuf_mtod(pkt,
+			struct slow_protocol_frame *)->slow_protocol.subtype;
+
+	if (subtype == SLOW_SUBTYPE_MARKER) {
+		m_hdr = rte_pktmbuf_mtod(pkt, struct marker_header *);
+
+		/* Only marker information requests are answered; this is the
+		 * error path, hence unlikely(). */
+		if (unlikely(m_hdr->marker.tlv_type_marker != MARKER_TLV_TYPE_INFO)) {
+			wrn = WRN_UNKNOWN_MARKER_TYPE;
+			goto free_out;
+		}
+
+		/* Setup marker timer. Do it in loop in case of concurrent access.
*/
+		do {
+			old_marker_timer = port->rx_marker_timer;
+			/* Previous marker period still running: drop this one. */
+			if (!timer_is_expired(&old_marker_timer)) {
+				wrn = WRN_RX_MARKER_TO_FAST;
+				goto free_out;
+			}
+
+			timer_set(&marker_timer, mode4->rx_marker_timeout);
+			/* Retry when another writer changed the timer in between. */
+			retval = rte_atomic64_cmpset(&port->rx_marker_timer,
+				old_marker_timer, marker_timer);
+		} while (unlikely(retval == 0));
+
+		/* Turn the request into a response sent from this slave's MAC. */
+		m_hdr->marker.tlv_type_marker = MARKER_TLV_TYPE_RESP;
+		rte_eth_macaddr_get(slave_id, &m_hdr->eth_hdr.s_addr);
+
+		if (unlikely(rte_ring_enqueue(port->tx_ring, pkt) == -ENOBUFS)) {
+			/* reset timer */
+			port->rx_marker_timer = 0;
+			wrn = WRN_TX_QUEUE_FULL;
+			goto free_out;
+		}
+	} else if (likely(subtype == SLOW_SUBTYPE_LACP)) {
+		if (unlikely(rte_ring_enqueue(port->rx_ring, pkt) == -ENOBUFS)) {
+			/* If RX ring is full, free the LACPDU and drop the packet */
+			wrn = WRN_RX_QUEUE_FULL;
+			goto free_out;
+		}
+	} else {
+		wrn = WRN_UNKNOWN_SLOW_TYPE;
+		goto free_out;
+	}
+
+	/* Packet was queued; the consumer of the ring owns it now. */
+	return;
+
+free_out:
+	set_warning_flags(port, wrn);
+	rte_pktmbuf_free(pkt);
+}
+
+/**
+ * Read the mode 4 configuration of a bonded device.
+ *
+ * @param port_id	Bonded device port id.
+ * @param conf		Output structure.
+ * @return
+ *   0 on success, -EINVAL if port_id is not a valid bonded port or conf is
+ *   NULL.
+ */
+int
+rte_eth_bond_8023ad_conf_get(uint8_t port_id,
+		struct rte_eth_bond_8023ad_conf *conf)
+{
+	struct rte_eth_dev *bond_dev;
+
+	if (valid_bonded_port_id(port_id) != 0)
+		return -EINVAL;
+
+	if (conf == NULL)
+		return -EINVAL;
+
+	bond_dev = &rte_eth_devices[port_id];
+	bond_mode_8023ad_conf_get(bond_dev, conf);
+	return 0;
+}
+
+/**
+ * Validate and store a mode 4 configuration for a bonded device.
+ *
+ * @param port_id	Bonded device port id.
+ * @param conf		New configuration, or NULL to load the defaults.
+ * @return
+ *   0 on success, -EINVAL if port_id is not a valid bonded port or the
+ *   configuration fails the sanity check below.
+ */
+int
+rte_eth_bond_8023ad_setup(uint8_t port_id,
+		struct rte_eth_bond_8023ad_conf *conf)
+{
+	struct rte_eth_dev *bond_dev;
+
+	if (valid_bonded_port_id(port_id) != 0)
+		return -EINVAL;
+
+	if (conf != NULL) {
+		/* Basic sanity check */
+		if (conf->slow_periodic_ms == 0 ||
+				conf->fast_periodic_ms >= conf->slow_periodic_ms ||
+				conf->long_timeout_ms == 0 ||
+				conf->short_timeout_ms >= conf->long_timeout_ms ||
+				conf->aggregate_wait_timeout_ms == 0 ||
+				conf->tx_period_ms == 0 ||
+				conf->rx_marker_period_ms == 0 ||
+				conf->update_timeout_ms == 0) {
+			RTE_LOG(ERR, PMD, "given mode 4 configuration is invalid\n");
+			return -EINVAL;
+		}
+	}
+
+	bond_dev = &rte_eth_devices[port_id];
+	bond_mode_8023ad_setup(bond_dev, conf);
+
+	return 0;
+}
+
+/**
+ * Copy the mode 4 state of one active slave into *info.
+ *
+ * @param port_id	Bonded device port id.
+ * @param slave_id	Port id of an active slave of that device.
+ * @param info		Output structure.
+ * @return
+ *   0 on success, -EINVAL if info is NULL, port_id is not a valid bonded
+ *   port in mode 4, or slave_id is not in the active slave list.
+ */
+int
+rte_eth_bond_8023ad_slave_info(uint8_t port_id, uint8_t slave_id,
+		struct rte_eth_bond_8023ad_slave_info *info)
+{
+	struct rte_eth_dev *bond_dev;
+	struct bond_dev_private *internals;
+	struct port *port;
+
+	if (info == NULL || valid_bonded_port_id(port_id) != 0 ||
+			rte_eth_bond_mode_get(port_id) != BONDING_MODE_8023AD)
+		return -EINVAL;
+
+	bond_dev = &rte_eth_devices[port_id];
+
+	internals = bond_dev->data->dev_private;
+	/* find_slave_by_id() yielding the list length means "not found". */
+	if (find_slave_by_id(internals->active_slaves,
+			internals->active_slave_count, slave_id) ==
+				internals->active_slave_count)
+		return -EINVAL;
+
+	port = &mode_8023ad_ports[slave_id];
+	info->selected = port->selected;
+
+	info->actor_state = port->actor_state;
+	rte_memcpy(&info->actor, &port->actor, sizeof(port->actor));
+
+	info->partner_state = port->partner_state;
+	rte_memcpy(&info->partner, &port->partner, sizeof(port->partner));
+
+	info->agg_port_id = port->aggregator_port_id;
+	return 0;
+}
diff --git a/lib/librte_pmd_bond/rte_eth_bond_8023ad.h b/lib/librte_pmd_bond/rte_eth_bond_8023ad.h
new file mode 100644
index 0000000000..9adc6aa160
--- /dev/null
+++ b/lib/librte_pmd_bond/rte_eth_bond_8023ad.h
@@ -0,0 +1,214 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef RTE_ETH_BOND_8023AD_H_
+#define RTE_ETH_BOND_8023AD_H_
+
+/* NOTE(review): the header name after #include is missing in this copy of
+ * the patch; presumably <rte_ether.h>, since struct ether_hdr and struct
+ * ether_addr are used below - restore before applying. */
+#include
+
+/**
+ * Actor/partner states
+ */
+#define STATE_LACP_ACTIVE                   0x01
+#define STATE_LACP_SHORT_TIMEOUT            0x02
+#define STATE_AGGREGATION                   0x04
+#define STATE_SYNCHRONIZATION               0x08
+#define STATE_COLLECTING                    0x10
+#define STATE_DISTRIBUTING                  0x20
+/** Partners parameters are defaulted */
+#define STATE_DEFAULTED                     0x40
+#define STATE_EXPIRED                       0x80
+
+/* TLV type values used in LACPDUs */
+#define TLV_TYPE_ACTOR_INFORMATION          0x01
+#define TLV_TYPE_PARTNER_INFORMATION        0x02
+#define TLV_TYPE_COLLECTOR_INFORMATION      0x03
+#define TLV_TYPE_TERMINATOR_INFORMATION     0x00
+
+/* Slow protocol subtypes (first byte after the ethernet header) */
+#define SLOW_SUBTYPE_LACP                   0x01
+#define SLOW_SUBTYPE_MARKER                 0x02
+
+/* Marker TLV types: information (request) and response */
+#define MARKER_TLV_TYPE_INFO                0x01
+#define MARKER_TLV_TYPE_RESP                0x02
+
+/** Selection state of a port with respect to its aggregator */
+enum rte_bond_8023ad_selection {
+	UNSELECTED,
+	STANDBY,
+	SELECTED
+};
+
+/** Generic slow protocol structure */
+struct slow_protocol {
+	uint8_t subtype;
+	uint8_t reserved_119[119];
+} __attribute__((__packed__));
+
+/** Generic slow protocol frame type structure */
+struct slow_protocol_frame {
+	struct ether_hdr eth_hdr;
+	struct slow_protocol slow_protocol;
+} __attribute__((__packed__));
+
+/** Actor or partner port parameters as carried in an LACPDU */
+struct port_params {
+	uint16_t system_priority;
+	/**< System priority (unused in current implementation) */
+	struct ether_addr system;
+	/**< System ID - Slave MAC address, same as bonding MAC address */
+	uint16_t key;
+	/**< Speed information (implementation dependent) and duplex. */
+	uint16_t port_priority;
+	/**< Priority of this (unused in current implementation) */
+	uint16_t port_number;
+	/**< Port number. It corresponds to slave port id. */
+} __attribute__((__packed__));
+
+/** Actor/partner information TLV of an LACPDU */
+struct lacpdu_actor_partner_params {
+	uint8_t tlv_type_info;
+	uint8_t info_length;
+	struct port_params port_params;
+	uint8_t state;
+	uint8_t reserved_3[3];
+} __attribute__((__packed__));
+
+/** LACPDU structure (5.4.2 in 802.1AX documentation). */
+struct lacpdu {
+	uint8_t subtype;
+	uint8_t version_number;
+
+	struct lacpdu_actor_partner_params actor;
+	struct lacpdu_actor_partner_params partner;
+
+	uint8_t tlv_type_collector_info;
+	uint8_t collector_info_length;
+	uint16_t collector_max_delay;
+	uint8_t reserved_12[12];
+
+	uint8_t tlv_type_terminator;
+	uint8_t terminator_length;
+	uint8_t reserved_50[50];
+} __attribute__((__packed__));
+
+/** LACPDU frame: Contains ethernet header and LACPDU. */
+struct lacpdu_header {
+	struct ether_hdr eth_hdr;
+	struct lacpdu lacpdu;
+} __attribute__((__packed__));
+
+/** Marker PDU (information request or response) */
+struct marker {
+	uint8_t subtype;
+	uint8_t version_number;
+
+	uint8_t tlv_type_marker;
+	uint8_t info_length;
+	uint16_t requester_port;
+	struct ether_addr requester_system;
+	uint32_t requester_transaction_id;
+	uint8_t reserved_2[2];
+
+	uint8_t tlv_type_terminator;
+	uint8_t terminator_length;
+	uint8_t reserved_90[90];
+} __attribute__((__packed__));
+
+/** Marker frame: Contains ethernet header and marker PDU. */
+struct marker_header {
+	struct ether_hdr eth_hdr;
+	struct marker marker;
+} __attribute__((__packed__));
+
+/** Mode 4 timing configuration; all values are in milliseconds. */
+struct rte_eth_bond_8023ad_conf {
+	uint32_t fast_periodic_ms;
+	uint32_t slow_periodic_ms;
+	uint32_t short_timeout_ms;
+	uint32_t long_timeout_ms;
+	uint32_t aggregate_wait_timeout_ms;
+	uint32_t tx_period_ms;
+	uint32_t rx_marker_period_ms;
+	uint32_t update_timeout_ms;
+};
+
+/** Snapshot of the mode 4 state of one slave */
+struct rte_eth_bond_8023ad_slave_info {
+	enum rte_bond_8023ad_selection selected;
+	uint8_t actor_state;
+	struct port_params actor;
+	uint8_t partner_state;
+	struct port_params partner;
+	uint8_t agg_port_id;
+};
+
+/**
+ * @internal
+ *
+ * Function returns current configuration of 802.3AX mode.
+ *
+ * @param port_id   Bonding device id
+ * @param conf		Pointer to timeout structure.
+ *
+ * @return
+ *   0 - if ok
+ *   -EINVAL if port_id is not a valid bonded device or conf is NULL
+ */
+int
+rte_eth_bond_8023ad_conf_get(uint8_t port_id,
+		struct rte_eth_bond_8023ad_conf *conf);
+
+/**
+ * @internal
+ *
+ * Function set new configuration of 802.3AX mode.
+ *
+ * @param port_id   Bonding device id
+ * @param conf		Configuration, if NULL set default configuration.
+ * @return
+ *   0 - if ok
+ *   -EINVAL if port_id is not a valid bonded device or configuration is
+ *       invalid.
+ */
+int
+rte_eth_bond_8023ad_setup(uint8_t port_id,
+		struct rte_eth_bond_8023ad_conf *conf);
+
+/**
+ * @internal
+ *
+ * Function returns current state of given slave device.
+ *
+ * @param port_id   Bonding device id
+ * @param slave_id  Port id of valid slave.
+ * @param conf		buffer for the slave state
+ * @return
+ *   0 - if ok
+ *   -EINVAL if conf is NULL, port_id is not a bonded device in mode 4, or
+ *       slave id is invalid (not an active slave of given bonded device).
+ */
+int
+rte_eth_bond_8023ad_slave_info(uint8_t port_id, uint8_t slave_id,
+		struct rte_eth_bond_8023ad_slave_info *conf);
+
+#endif /* RTE_ETH_BOND_8023AD_H_ */
diff --git a/lib/librte_pmd_bond/rte_eth_bond_8023ad_private.h b/lib/librte_pmd_bond/rte_eth_bond_8023ad_private.h
new file mode 100644
index 0000000000..8adee70b4c
--- /dev/null
+++ b/lib/librte_pmd_bond/rte_eth_bond_8023ad_private.h
@@ -0,0 +1,308 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_ETH_BOND_8023AD_PRIVATE_H_
+#define RTE_ETH_BOND_8023AD_PRIVATE_H_
+
+#include
+
+#include
+#include
+#include
+
+#include "rte_eth_bond_8023ad.h"
+
+/** Default period of the mode 4 update callback, in milliseconds. */
+#define BOND_MODE_8023AX_UPDATE_TIMEOUT_MS  100
+/** Maximum number of LACP packets from one slave queued in RX ring. */
+#define BOND_MODE_8023AX_SLAVE_RX_PKTS        3
+/** Maximum number of LACP packets to one slave queued in TX ring. */
+#define BOND_MODE_8023AX_SLAVE_TX_PKTS        1
+/**
+ * Timeouts definitions (5.4.4 in 802.1AX documentation).
+ */
+#define BOND_8023AD_FAST_PERIODIC_MS                900
+#define BOND_8023AD_SLOW_PERIODIC_MS              29000
+#define BOND_8023AD_SHORT_TIMEOUT_MS               3000
+#define BOND_8023AD_LONG_TIMEOUT_MS               90000
+#define BOND_8023AD_CHURN_DETECTION_TIMEOUT_MS    60000
+#define BOND_8023AD_AGGREGATE_WAIT_TIMEOUT_MS      2000
+#define BOND_8023AD_TX_MACHINE_PERIOD_MS            500
+#define BOND_8023AD_RX_MARKER_PERIOD_MS            2000
+
+/**
+ * Interval of showing warning message from state machines. All messages will
+ * be held (and gathered together) to prevent flooding.
+ * This is not part of 802.1AX standard.
+ */
+#define BOND_8023AD_WARNINGS_PERIOD_MS             1000
+
+
+
+/**
+ * State machine flags
+ */
+#define SM_FLAGS_BEGIN                      0x0001
+#define SM_FLAGS_LACP_ENABLED               0x0002
+#define SM_FLAGS_ACTOR_CHURN                0x0004
+#define SM_FLAGS_PARTNER_CHURN              0x0008
+#define SM_FLAGS_MOVED                      0x0100
+#define SM_FLAGS_PARTNER_SHORT_TIMEOUT      0x0200
+#define SM_FLAGS_NTT                        0x0400
+
+/* Values used to build the port key (duplex and link speed) */
+#define BOND_LINK_FULL_DUPLEX_KEY           0x01
+#define BOND_LINK_SPEED_KEY_10M             0x02
+#define BOND_LINK_SPEED_KEY_100M            0x04
+#define BOND_LINK_SPEED_KEY_1000M           0x08
+#define BOND_LINK_SPEED_KEY_10G             0x10
+#define BOND_LINK_SPEED_KEY_20G             0x11
+#define BOND_LINK_SPEED_KEY_40G             0x12
+
+/* Warning flags gathered per port */
+#define WRN_RX_MARKER_TO_FAST      0x01
+#define WRN_UNKNOWN_SLOW_TYPE      0x02
+#define WRN_UNKNOWN_MARKER_TYPE    0x04
+#define WRN_NOT_LACP_CAPABLE       0x08
+#define WRN_RX_QUEUE_FULL       0x10
+#define WRN_TX_QUEUE_FULL       0x20
+
+/* Generic bitmask helpers */
+#define CHECK_FLAGS(_variable, _f) ((_variable) & (_f))
+#define SET_FLAGS(_variable, _f) ((_variable) |= (_f))
+#define CLEAR_FLAGS(_variable, _f) ((_variable) &= ~(_f))
+
+/* Test/set/clear SM_FLAGS_<_f> in port _p; the test form yields 0 or 1 */
+#define SM_FLAG(_p, _f) (!!CHECK_FLAGS((_p)->sm_flags, SM_FLAGS_ ## _f))
+#define SM_FLAG_SET(_p, _f) SET_FLAGS((_p)->sm_flags, SM_FLAGS_ ## _f)
+#define SM_FLAG_CLR(_p, _f) CLEAR_FLAGS((_p)->sm_flags, SM_FLAGS_ ## _f)
+
+/* Test/set/clear STATE_<_f> in the actor state of port _p */
+#define ACTOR_STATE(_p, _f) (!!CHECK_FLAGS((_p)->actor_state, STATE_ ## _f))
+#define ACTOR_STATE_SET(_p, _f) SET_FLAGS((_p)->actor_state, STATE_ ## _f)
+#define ACTOR_STATE_CLR(_p, _f) CLEAR_FLAGS((_p)->actor_state, STATE_ ## _f)
+
+/* Test/set/clear STATE_<_f> in the partner state of port _p */
+#define PARTNER_STATE(_p, _f) (!!CHECK_FLAGS((_p)->partner_state, STATE_ ## _f))
+#define PARTNER_STATE_SET(_p, _f) SET_FLAGS((_p)->partner_state, STATE_ ## _f)
+#define PARTNER_STATE_CLR(_p, _f) CLEAR_FLAGS((_p)->partner_state, STATE_ ## _f)
+
+/** Variables associated with each port (5.4.7 in 802.1AX documentation). */
+struct port {
+	/**
+	 * The operational values of the Actor's state parameters. Bitmask
+	 * of port states.
+	 */
+	uint8_t actor_state;
+
+	/** The operational Actor's port parameters */
+	struct port_params actor;
+
+	/**
+	 * The operational value of the Actor's view of the current values of
+	 * the Partner's state parameters. The Actor sets this variable either
+	 * to the value received from the Partner in an LACPDU, or to the value
+	 * of Partner_Admin_Port_State. Bitmask of port states.
+	 */
+	uint8_t partner_state;
+
+	/** The operational Partner's port parameters */
+	struct port_params partner;
+
+	/* Additional port parameters not listed in documentation */
+	/** State machine flags */
+	uint16_t sm_flags;
+	enum rte_bond_8023ad_selection selected;
+
+	uint64_t current_while_timer;
+	uint64_t periodic_timer;
+	uint64_t wait_while_timer;
+	uint64_t tx_machine_timer;
+	uint64_t tx_marker_timer;
+	/* Aggregator parameters */
+	/** Used aggregator port ID */
+	uint16_t aggregator_port_id;
+
+	/** Mbuf pool of this port, created when the slave is first activated.
+	 * Presumably the source of the slow protocol frames built by the TX
+	 * machine - confirm against tx_machine(). */
+	struct rte_mempool *mbuf_pool;
+
+	/** Ring of LACP packets from RX burst function */
+	struct rte_ring *rx_ring;
+
+	/** Ring of slow protocol packets (LACP and MARKERS) to TX burst function */
+	struct rte_ring *tx_ring;
+
+	/** Timer which is also used as mutex. If is 0 (not running) RX marker
+	 * packet might be responded. Otherwise shall be dropped. It is zeroed in
+	 * mode 4 callback function after expire. */
+	volatile uint64_t rx_marker_timer;
+
+	uint64_t warning_timer;
+	volatile uint16_t warnings_to_show;
+};
+
+/** Mode 4 timeouts of one bonded device: TSC ticks, except the update
+ * period which is kept in microseconds. */
+struct mode8023ad_private {
+	uint64_t fast_periodic_timeout;
+	uint64_t slow_periodic_timeout;
+	uint64_t short_timeout;
+	uint64_t long_timeout;
+	uint64_t aggregate_wait_timeout;
+	uint64_t tx_period_timeout;
+	uint64_t rx_marker_timeout;
+	uint64_t update_timeout_us;
+};
+
+/**
+ * @internal
+ * The pool of *port* structures. The size of the pool
+ * is configured at compile-time in the file.
+ */
+extern struct port mode_8023ad_ports[];
+
+/* Forward declaration */
+struct bond_dev_private;
+
+/**
+ * @internal
+ *
+ * Get configuration of bonded interface.
+ *
+ *
+ * @param dev Bonded interface
+ * @param conf returned configuration
+ */
+void
+bond_mode_8023ad_conf_get(struct rte_eth_dev *dev,
+		struct rte_eth_bond_8023ad_conf *conf);
+
+/**
+ * @internal
+ *
+ * Set mode 4 configuration of bonded interface.
+ *
+ * @pre Bonded interface must be stopped.
+ *
+ * @param dev Bonded interface
+ * @param conf new configuration. If NULL set default configuration.
+ */
+void
+bond_mode_8023ad_setup(struct rte_eth_dev *dev,
+		struct rte_eth_bond_8023ad_conf *conf);
+
+/**
+ * @internal
+ *
+ * Enables 802.1AX mode and all active slaves on bonded interface.
+ *
+ * @param dev Bonded interface
+ * @return
+ *  0 on success, negative value otherwise.
+ */
+int
+bond_mode_8023ad_enable(struct rte_eth_dev *dev);
+
+/**
+ * @internal
+ *
+ * Disables 802.1AX mode of the bonded interface and slaves.
+ *
+ * @param dev Bonded interface
+ * @return
+ *   0 on success, negative value otherwise.
+ */
+int bond_mode_8023ad_disable(struct rte_eth_dev *dev);
+
+/**
+ * @internal
+ *
+ * Starts 802.3AX state machines management logic.
+ * @param dev Bonded interface
+ * @return
+ *   Return value of rte_eal_alarm_set(): 0 if the periodic alarm was set,
+ *   negative value otherwise.
+ */
+int
+bond_mode_8023ad_start(struct rte_eth_dev *dev);
+
+/**
+ * @internal
+ *
+ * Stops 802.3AX state machines management logic by cancelling the periodic
+ * alarm. Nothing is returned.
+ * @param dev Bonded interface
+ */
+void
+bond_mode_8023ad_stop(struct rte_eth_dev *dev);
+
+/**
+ * @internal
+ *
+ * Passes given slow packet to state machines management logic.
+ * @param internals Bonded device private data.
+ * @param slave_id Slave port id.
+ * @param pkt Slow packet.
+ */
+void
+bond_mode_8023ad_handle_slow_pkt(struct bond_dev_private *internals,
+	uint8_t slave_id, struct rte_mbuf *pkt);
+
+/**
+ * @internal
+ *
+ * Initializes the mode 4 state of a slave that is being added to the
+ * active slaves. Nothing is returned; allocation failures are fatal.
+ *
+ * @param dev       Bonded interface.
+ * @param port_id   Slave port ID to be added
+ */
+void
+bond_mode_8023ad_activate_slave(struct rte_eth_dev *dev, uint8_t port_id);
+
+/**
+ * @internal
+ *
+ * Deinitializes and removes given slave from 802.1AX mode.
+ *
+ * @param dev       Bonded interface.
+ * @param slave_pos Port id of the slave. Despite the parameter name, the
+ *                  definition uses this value as a slave port id, not as a
+ *                  position in the active_slaves array.
+ *
+ * @return
+ *  0 on success, negative value otherwise.
+ */
+int
+bond_mode_8023ad_deactivate_slave(struct rte_eth_dev *dev, uint8_t slave_pos);
+
+/**
+ * Updates state when MAC was changed on bonded device or one of its slaves.
+ * @param bond_dev Bonded device
+ */
+void
+bond_mode_8023ad_mac_address_update(struct rte_eth_dev *bond_dev);
+
+#endif /* RTE_ETH_BOND_8023AD_PRIVATE_H_ */
diff --git a/lib/librte_pmd_bond/rte_eth_bond_api.c b/lib/librte_pmd_bond/rte_eth_bond_api.c
index f146bda25b..c8fb42c464 100644
--- a/lib/librte_pmd_bond/rte_eth_bond_api.c
+++ b/lib/librte_pmd_bond/rte_eth_bond_api.c
@@ -31,6 +31,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
+#include
+
 #include
 #include
 #include
@@ -38,6 +40,7 @@
 
 #include "rte_eth_bond.h"
 #include "rte_eth_bond_private.h"
+#include "rte_eth_bond_8023ad_private.h"
 
 #define DEFAULT_POLLING_INTERVAL_10_MS (10)
 
@@ -104,6 +107,49 @@ valid_slave_port_id(uint8_t port_id)
 	return 0;
 }
 
+/* Append a slave to the active slave list of a bonded device. In mode 4 the
+ * per-port 802.3ad state is prepared first, while the slave is still absent
+ * from the list. */
+void
+activate_slave(struct rte_eth_dev *eth_dev, uint8_t port_id)
+{
+	struct bond_dev_private *internals = eth_dev->data->dev_private;
+
+	if (internals->mode == BONDING_MODE_8023AD)
+		bond_mode_8023ad_activate_slave(eth_dev, port_id);
+
+	internals->active_slaves[internals->active_slave_count] = port_id;
+	internals->active_slave_count++;
+}
+
+/* Remove a slave from the active slave list of a bonded device. In mode 4
+ * the periodic alarm is stopped and the per-port state torn down before the
+ * list changes; the alarm is started again afterwards if the device is
+ * started. */
+void
+deactivate_slave(struct rte_eth_dev *eth_dev, uint8_t port_id)
+{
+	uint8_t slave_pos;
+	struct bond_dev_private *internals = eth_dev->data->dev_private;
+	uint8_t active_count = internals->active_slave_count;
+
+	if (internals->mode == BONDING_MODE_8023AD) {
+		bond_mode_8023ad_stop(eth_dev);
+		bond_mode_8023ad_deactivate_slave(eth_dev, port_id);
+	}
+
+	slave_pos = find_slave_by_id(internals->active_slaves, active_count,
+			port_id);
+
+	/* If the slave was found in the list, drop its entry by shifting the
+	 * following active slaves up the array */
+	if (slave_pos < active_count) {
+		active_count--;
+		memmove(internals->active_slaves + slave_pos,
+				internals->active_slaves + slave_pos + 1,
+				(active_count - slave_pos) *
+					sizeof(internals->active_slaves[0]));
+	}
+
+	internals->active_slave_count = active_count;
+
+	if (eth_dev->data->dev_started && internals->mode == BONDING_MODE_8023AD)
+		bond_mode_8023ad_start(eth_dev);
+}
+
 uint8_t
 number_of_sockets(void)
 {
@@ -216,15 +262,10 @@ rte_eth_bond_create(const char *name, uint8_t mode, uint8_t socket_id)
 	eth_dev->dev_ops = &default_dev_ops;
 	eth_dev->pci_dev = pci_dev;
 
-	if (bond_ethdev_mode_set(eth_dev, mode)) {
-		RTE_BOND_LOG(ERR, "Failed to set bonded device %d mode too %d",
-				eth_dev->data->port_id, mode);
-		goto err;
-	}
-
 	rte_spinlock_init(&internals->lock);
 
 	internals->port_id = eth_dev->data->port_id;
+
internals->mode = BONDING_MODE_INVALID; internals->current_primary_port = 0; internals->balance_xmit_policy = BALANCE_XMIT_POLICY_LAYER2; internals->user_defined_mac = 0; @@ -242,6 +283,14 @@ rte_eth_bond_create(const char *name, uint8_t mode, uint8_t socket_id) memset(internals->active_slaves, 0, sizeof(internals->active_slaves)); memset(internals->slaves, 0, sizeof(internals->slaves)); + /* Set mode 4 default configuration */ + bond_mode_8023ad_setup(eth_dev, NULL); + if (bond_ethdev_mode_set(eth_dev, mode)) { + RTE_BOND_LOG(ERR, "Failed to set bonded device %d mode too %d", + eth_dev->data->port_id, mode); + goto err; + } + return eth_dev->data->port_id; err: @@ -349,14 +398,12 @@ __eth_bond_slave_add_lock_free(uint8_t bonded_port_id, uint8_t slave_port_id) rte_eth_link_get_nowait(slave_port_id, &link_props); if (link_props.link_status == 1) - internals->active_slaves[internals->active_slave_count++] = - slave_port_id; + activate_slave(bonded_eth_dev, slave_port_id); } return 0; } - int rte_eth_bond_slave_add(uint8_t bonded_port_id, uint8_t slave_port_id) { @@ -381,31 +428,26 @@ rte_eth_bond_slave_add(uint8_t bonded_port_id, uint8_t slave_port_id) return retval; } - static int __eth_bond_slave_remove_lock_free(uint8_t bonded_port_id, uint8_t slave_port_id) { + struct rte_eth_dev *bonded_eth_dev; struct bond_dev_private *internals; - int i, slave_idx = -1; + int i, slave_idx; if (valid_slave_port_id(slave_port_id) != 0) return -1; - internals = rte_eth_devices[bonded_port_id].data->dev_private; + bonded_eth_dev = &rte_eth_devices[bonded_port_id]; + internals = bonded_eth_dev->data->dev_private; /* first remove from active slave list */ - for (i = 0; i < internals->active_slave_count; i++) { - if (internals->active_slaves[i] == slave_port_id) - slave_idx = i; - - /* shift active slaves up active array list */ - if (slave_idx >= 0 && i < (internals->active_slave_count - 1)) - internals->active_slaves[i] = internals->active_slaves[i+1]; - } + slave_idx = 
find_slave_by_id(internals->active_slaves, + internals->active_slave_count, slave_port_id); - if (slave_idx >= 0) - internals->active_slave_count--; + if (slave_idx < internals->active_slave_count) + deactivate_slave(bonded_eth_dev, slave_port_id); slave_idx = -1; /* now find in slave list */ @@ -539,11 +581,12 @@ rte_eth_bond_primary_get(uint8_t bonded_port_id) return internals->current_primary_port; } + int rte_eth_bond_slaves_get(uint8_t bonded_port_id, uint8_t slaves[], uint8_t len) { struct bond_dev_private *internals; - int i; + uint8_t i; if (valid_bonded_port_id(bonded_port_id) != 0) return -1; @@ -675,7 +718,6 @@ rte_eth_bond_xmit_policy_get(uint8_t bonded_port_id) return internals->balance_xmit_policy; } - int rte_eth_bond_link_monitoring_set(uint8_t bonded_port_id, uint32_t internal_ms) { @@ -731,7 +773,6 @@ rte_eth_bond_link_down_prop_delay_get(uint8_t bonded_port_id) return internals->link_down_delay_ms; } - int rte_eth_bond_link_up_prop_delay_set(uint8_t bonded_port_id, uint32_t delay_ms) diff --git a/lib/librte_pmd_bond/rte_eth_bond_args.c b/lib/librte_pmd_bond/rte_eth_bond_args.c index d8ce681e27..bf7c1bc3c7 100644 --- a/lib/librte_pmd_bond/rte_eth_bond_args.c +++ b/lib/librte_pmd_bond/rte_eth_bond_args.c @@ -173,6 +173,7 @@ bond_ethdev_parse_slave_mode_kvarg(const char *key __rte_unused, #ifdef RTE_MBUF_REFCNT case BONDING_MODE_BROADCAST: #endif + case BONDING_MODE_8023AD: return 0; default: RTE_BOND_LOG(ERR, "Invalid slave mode value (%s) specified", value); diff --git a/lib/librte_pmd_bond/rte_eth_bond_pmd.c b/lib/librte_pmd_bond/rte_eth_bond_pmd.c index cf2fbab53f..aa52813609 100644 --- a/lib/librte_pmd_bond/rte_eth_bond_pmd.c +++ b/lib/librte_pmd_bond/rte_eth_bond_pmd.c @@ -44,6 +44,7 @@ #include "rte_eth_bond.h" #include "rte_eth_bond_private.h" +#include "rte_eth_bond_8023ad_private.h" static uint16_t bond_ethdev_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) @@ -90,6 +91,77 @@ bond_ethdev_rx_burst_active_backup(void *queue, 
struct rte_mbuf **bufs, bd_rx_q->queue_id, bufs, nb_pkts); } +static uint16_t +bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs, + uint16_t nb_pkts) +{ + /* Cast to structure, containing bonded device's port id and queue id */ + struct bond_rx_queue *bd_rx_q = (struct bond_rx_queue *)queue; + struct bond_dev_private *internals = bd_rx_q->dev_private; + struct ether_addr bond_mac; + + struct ether_hdr *hdr; + + const uint16_t ether_type_slow_be = rte_cpu_to_be_16(ETHER_TYPE_SLOW); + uint16_t num_rx_total = 0; /* Total number of received packets */ + uint8_t slaves[RTE_MAX_ETHPORTS]; + uint8_t slave_count; + + uint8_t collecting; /* current slave collecting status */ + const uint8_t promisc = internals->promiscuous_en; + uint16_t i, j, k; /* 16-bit: j, k index bufs[] up to nb_pkts */ + + rte_eth_macaddr_get(internals->port_id, &bond_mac); + /* Copy slave list to protect against slave up/down changes during rx + * bursting */ + slave_count = internals->active_slave_count; + memcpy(slaves, internals->active_slaves, + sizeof(internals->active_slaves[0]) * slave_count); + + for (i = 0; i < slave_count && num_rx_total < nb_pkts; i++) { + j = num_rx_total; + collecting = ACTOR_STATE(&mode_8023ad_ports[slaves[i]], COLLECTING); + + /* Read packets from this slave */ + num_rx_total += rte_eth_rx_burst(slaves[i], bd_rx_q->queue_id, + &bufs[num_rx_total], nb_pkts - num_rx_total); + + for (k = j; k < 2 && k < num_rx_total; k++) + rte_prefetch0(rte_pktmbuf_mtod(bufs[k], void *)); + + /* Handle slow protocol packets. */ + while (j < num_rx_total) { + if (j + 3 < num_rx_total) + rte_prefetch0(rte_pktmbuf_mtod(bufs[j + 3], void *)); + + hdr = rte_pktmbuf_mtod(bufs[j], struct ether_hdr *); + /* Remove packet from array if it is slow packet or slave is not + * in collecting state or bonding interface is not in promiscuous + * mode and packet address does not match. 
*/ + if (unlikely(hdr->ether_type == ether_type_slow_be || + !collecting || (!promisc && + !is_same_ether_addr(&bond_mac, &hdr->d_addr)))) { + + if (hdr->ether_type == ether_type_slow_be) { + bond_mode_8023ad_handle_slow_pkt(internals, slaves[i], + bufs[j]); + } else + rte_pktmbuf_free(bufs[j]); + + /* Packet is managed by mode 4 or dropped, shift the array */ + num_rx_total--; + if (j < num_rx_total) { + memmove(&bufs[j], &bufs[j + 1], sizeof(bufs[0]) * + (num_rx_total - j)); + } + } else + j++; + } + } + + return num_rx_total; +} + static uint16_t bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) @@ -143,7 +215,8 @@ bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs, tx_fail_total += tx_fail_slave; memcpy(&bufs[nb_pkts - tx_fail_total], - &slave_bufs[i][num_tx_slave], tx_fail_slave * sizeof(bufs[0])); + &slave_bufs[i][num_tx_slave], + tx_fail_slave * sizeof(bufs[0])); } num_tx_total += num_tx_slave; } @@ -338,14 +411,107 @@ bond_ethdev_tx_burst_balance(void *queue, struct rte_mbuf **bufs, int slave_tx_fail_count = slave_nb_pkts[i] - num_tx_slave; tx_fail_total += slave_tx_fail_count; - memcpy(bufs[nb_pkts - tx_fail_total], - slave_bufs[i][num_tx_slave], slave_tx_fail_count); + memcpy(&bufs[nb_pkts - tx_fail_total], + &slave_bufs[i][num_tx_slave], + slave_tx_fail_count * sizeof(bufs[0])); } num_tx_total += num_tx_slave; } } + return num_tx_total; +} + +static uint16_t +bond_ethdev_tx_burst_8023ad(void *queue, struct rte_mbuf **bufs, + uint16_t nb_pkts) +{ + struct bond_dev_private *internals; + struct bond_tx_queue *bd_tx_q; + + uint8_t num_of_slaves; + uint8_t slaves[RTE_MAX_ETHPORTS]; + /* positions in the slaves array, not port IDs */ + uint8_t distributing_offsets[RTE_MAX_ETHPORTS]; + uint8_t distributing_count; + + uint16_t num_tx_slave, num_tx_total = 0, num_tx_fail_total = 0; + uint16_t i, j, op_slave_idx; + const uint16_t buffs_size = nb_pkts + BOND_MODE_8023AX_SLAVE_TX_PKTS + 1; + + /* Allocate additional packets 
in case 8023AD mode. */ + struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS][buffs_size]; + void *slow_pkts[BOND_MODE_8023AX_SLAVE_TX_PKTS]; + + /* Total amount of packets in slave_bufs */ + uint16_t slave_nb_pkts[RTE_MAX_ETHPORTS] = { 0 }; + /* Slow packets placed in each slave */ + uint8_t slave_slow_nb_pkts[RTE_MAX_ETHPORTS] = { 0 }; + + bd_tx_q = (struct bond_tx_queue *)queue; + internals = bd_tx_q->dev_private; + + /* Copy slave list to protect against slave up/down changes during tx + * bursting */ + num_of_slaves = internals->active_slave_count; + if (num_of_slaves < 1) + return num_tx_total; + + memcpy(slaves, internals->active_slaves, sizeof(slaves[0]) * num_of_slaves); + + distributing_count = 0; + for (i = 0; i < num_of_slaves; i++) { + struct port *port = &mode_8023ad_ports[slaves[i]]; + + slave_slow_nb_pkts[i] = rte_ring_dequeue_burst(port->tx_ring, + slow_pkts, BOND_MODE_8023AX_SLAVE_TX_PKTS); + slave_nb_pkts[i] = slave_slow_nb_pkts[i]; + + for (j = 0; j < slave_slow_nb_pkts[i]; j++) + slave_bufs[i][j] = slow_pkts[j]; + + if (ACTOR_STATE(port, DISTRIBUTING)) + distributing_offsets[distributing_count++] = i; + } + + if (likely(distributing_count > 0)) { + /* Populate slaves mbuf with the packets which are to be sent on it */ + for (i = 0; i < nb_pkts; i++) { + /* Select output slave using hash based on xmit policy */ + op_slave_idx = xmit_slave_hash(bufs[i], distributing_count, + internals->balance_xmit_policy); + + /* Populate slave mbuf arrays with mbufs for that slave. Use only + * slaves that are currently distributing. 
*/ + uint8_t slave_offset = distributing_offsets[op_slave_idx]; + slave_bufs[slave_offset][slave_nb_pkts[slave_offset]] = bufs[i]; + slave_nb_pkts[slave_offset]++; + } + } + + /* Send packet burst on each slave device */ + for (i = 0; i < num_of_slaves; i++) { + if (slave_nb_pkts[i] == 0) + continue; + + num_tx_slave = rte_eth_tx_burst(slaves[i], bd_tx_q->queue_id, + slave_bufs[i], slave_nb_pkts[i]); + + /* If tx burst fails drop slow packets */ + for ( ; num_tx_slave < slave_slow_nb_pkts[i]; num_tx_slave++) + rte_pktmbuf_free(slave_bufs[i][num_tx_slave]); + + num_tx_total += num_tx_slave - slave_slow_nb_pkts[i]; + num_tx_fail_total += slave_nb_pkts[i] - num_tx_slave; + + /* If tx burst fails move packets to end of bufs */ + if (unlikely(num_tx_slave < slave_nb_pkts[i])) { + uint16_t j = nb_pkts - num_tx_fail_total; + for ( ; num_tx_slave < slave_nb_pkts[i]; j++, num_tx_slave++) + bufs[j] = slave_bufs[i][num_tx_slave]; + } + } return num_tx_total; } @@ -450,6 +616,27 @@ link_properties_valid(struct rte_eth_link *bonded_dev_link, return 0; } +int +mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr) +{ + struct ether_addr *mac_addr; + + if (eth_dev == NULL) { + RTE_LOG(ERR, PMD, "%s: NULL pointer eth_dev specified\n", __func__); + return -1; + } + + if (dst_mac_addr == NULL) { + RTE_LOG(ERR, PMD, "%s: NULL pointer MAC specified\n", __func__); + return -1; + } + + mac_addr = eth_dev->data->mac_addrs; + + ether_addr_copy(mac_addr, dst_mac_addr); + return 0; +} + int mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr) { @@ -458,7 +645,7 @@ mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr) mac_addr = eth_dev->data->mac_addrs; if (eth_dev == NULL) { - RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified"); + RTE_BOND_LOG(ERR, "NULL pointer eth_dev specified"); return -1; } @@ -499,6 +686,9 @@ mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev) } } break; + case BONDING_MODE_8023AD: + 
bond_mode_8023ad_mac_address_update(bonded_eth_dev); + break; case BONDING_MODE_ACTIVE_BACKUP: default: for (i = 0; i < internals->slave_count; i++) { @@ -551,6 +741,16 @@ bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode) eth_dev->rx_pkt_burst = bond_ethdev_rx_burst; break; #endif + case BONDING_MODE_8023AD: + if (bond_mode_8023ad_enable(eth_dev) != 0) + return -1; + + eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_8023ad; + eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_8023ad; + RTE_BOND_LOG(WARNING, + "Using mode 4, it is necessary to do TX burst and RX burst " + "at least every 100ms."); + break; default: return -1; } @@ -762,6 +962,9 @@ bond_ethdev_start(struct rte_eth_dev *eth_dev) if (internals->user_defined_primary_port) bond_ethdev_primary_set(internals, internals->primary_port); + if (internals->mode == BONDING_MODE_8023AD) + bond_mode_8023ad_start(eth_dev); + return 0; } @@ -769,6 +972,27 @@ static void bond_ethdev_stop(struct rte_eth_dev *eth_dev) { struct bond_dev_private *internals = eth_dev->data->dev_private; + uint8_t i; + + if (internals->mode == BONDING_MODE_8023AD) { + struct port *port; + void *pkt = NULL; + + bond_mode_8023ad_stop(eth_dev); + + /* Discard all messages to/from mode 4 state machines */ + for (i = 0; i < internals->slave_count; i++) { + port = &mode_8023ad_ports[internals->slaves[i].port_id]; + + RTE_VERIFY(port->rx_ring != NULL); + while (rte_ring_dequeue(port->rx_ring, &pkt) != -ENOENT) + rte_pktmbuf_free(pkt); + + RTE_VERIFY(port->tx_ring != NULL); + while (rte_ring_dequeue(port->tx_ring, &pkt) != -ENOENT) + rte_pktmbuf_free(pkt); + } + } internals->active_slave_count = 0; internals->link_status_polling_enabled = 0; @@ -834,7 +1058,7 @@ bond_ethdev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id, 0, dev->pci_dev->numa_node); if (bd_tx_q == NULL) - return -1; + return -1; bd_tx_q->queue_id = tx_queue_id; bd_tx_q->dev_private = dev->data->dev_private; @@ -865,7 +1089,6 @@ bond_ethdev_tx_queue_release(void 
*queue) rte_free(queue); } - static void bond_ethdev_slave_link_status_change_monitor(void *cb_arg) { @@ -1014,11 +1237,13 @@ bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev) for (i = 0; i < internals->slave_count; i++) rte_eth_promiscuous_enable(internals->slaves[i].port_id); break; + /* In mode 4 promiscuous mode is managed when a slave is added/removed */ + case BONDING_MODE_8023AD: + break; /* Promiscuous mode is propagated only to primary slave */ case BONDING_MODE_ACTIVE_BACKUP: default: rte_eth_promiscuous_enable(internals->current_primary_port); - } } @@ -1040,6 +1265,9 @@ bond_ethdev_promiscuous_disable(struct rte_eth_dev *dev) for (i = 0; i < internals->slave_count; i++) rte_eth_promiscuous_disable(internals->slaves[i].port_id); break; + /* In mode 4 promiscuous mode is managed when a slave is added/removed */ + case BONDING_MODE_8023AD: + break; /* Promiscuous mode is propagated only to primary slave */ case BONDING_MODE_ACTIVE_BACKUP: default: @@ -1065,7 +1293,8 @@ bond_ethdev_lsc_event_callback(uint8_t port_id, enum rte_eth_event_type type, struct bond_dev_private *internals; struct rte_eth_link link; - int i, valid_slave = 0, active_pos = -1; + int i, valid_slave = 0; + uint8_t active_pos; uint8_t lsc_flag = 0; if (type != RTE_ETH_EVENT_INTR_LSC || param == NULL) @@ -1095,16 +1324,12 @@ bond_ethdev_lsc_event_callback(uint8_t port_id, enum rte_eth_event_type type, return; /* Search for port in active port list */ - for (i = 0; i < internals->active_slave_count; i++) { - if (port_id == internals->active_slaves[i]) { - active_pos = i; - break; - } - } + active_pos = find_slave_by_id(internals->active_slaves, + internals->active_slave_count, port_id); rte_eth_link_get_nowait(port_id, &link); if (link.link_status) { - if (active_pos >= 0) + if (active_pos < internals->active_slave_count) return; /* if no active slave ports then set this port to be primary port */ @@ -1118,21 +1343,19 @@ bond_ethdev_lsc_event_callback(uint8_t port_id, enum 
rte_eth_event_type type, link_properties_set(bonded_eth_dev, &(slave_eth_dev->data->dev_link)); } - internals->active_slaves[internals->active_slave_count++] = port_id; + + activate_slave(bonded_eth_dev, port_id); /* If user has defined the primary port then default to using it */ if (internals->user_defined_primary_port && internals->primary_port == port_id) bond_ethdev_primary_set(internals, port_id); } else { - if (active_pos < 0) + if (active_pos == internals->active_slave_count) return; /* Remove from active slave list */ - for (i = active_pos; i < (internals->active_slave_count - 1); i++) - internals->active_slaves[i] = internals->active_slaves[i+1]; - - internals->active_slave_count--; + deactivate_slave(bonded_eth_dev, port_id); /* No active slaves, change link status to down and reset other * link properties */ diff --git a/lib/librte_pmd_bond/rte_eth_bond_private.h b/lib/librte_pmd_bond/rte_eth_bond_private.h index 6254c84420..600fc08abb 100644 --- a/lib/librte_pmd_bond/rte_eth_bond_private.h +++ b/lib/librte_pmd_bond/rte_eth_bond_private.h @@ -42,6 +42,7 @@ extern "C" { #include #include "rte_eth_bond.h" +#include "rte_eth_bond_8023ad_private.h" #define PMD_BOND_SLAVE_PORT_KVARG ("slave") #define PMD_BOND_PRIMARY_SLAVE_KVARG ("primary") @@ -60,6 +61,8 @@ extern "C" { #define RTE_BOND_LOG(lvl, msg, ...) 
\ RTE_LOG(lvl, PMD, "%s(%d) - " msg "\n", __func__, __LINE__, ##__VA_ARGS__) +#define BONDING_MODE_INVALID 0xFF + extern const char *pmd_bond_init_valid_arguments[]; extern const char *driver_name; @@ -89,7 +92,6 @@ struct bond_tx_queue { /**< Copy of TX configuration structure for queue */ }; - /** Bonded slave devices structure */ struct bond_ethdev_slave_ports { uint8_t slaves[RTE_MAX_ETHPORTS]; /**< Slave port id array */ @@ -124,7 +126,7 @@ struct bond_dev_private { uint8_t user_defined_mac; /**< Flag for whether MAC address is user defined or not */ uint8_t promiscuous_en; - /**< Enabled/disable promiscuous mode on slave devices */ + /**< Enable/disable promiscuous mode on bonding device */ uint8_t link_props_set; /**< flag to denote if the link properties are set */ @@ -144,6 +146,8 @@ struct bond_dev_private { struct bond_slave_details slaves[RTE_MAX_ETHPORTS]; /**< Arary of bonded slaves details */ + struct mode8023ad_private mode4; + struct rte_kvargs *kvlist; }; @@ -152,6 +156,20 @@ extern struct eth_dev_ops default_dev_ops; int valid_bonded_ethdev(struct rte_eth_dev *eth_dev); +/* Search the given slave array for the position of the given port id. + * Return slave pos or slaves_count if not found. 
*/ +static inline uint8_t +find_slave_by_id(uint8_t *slaves, uint8_t slaves_count, uint8_t slave_id) { + + uint8_t pos; + for (pos = 0; pos < slaves_count; pos++) { + if (slave_id == slaves[pos]) + break; + } + + return pos; +} + int valid_port_id(uint8_t port_id); @@ -161,6 +179,12 @@ valid_bonded_port_id(uint8_t port_id); int valid_slave_port_id(uint8_t port_id); +void +deactivate_slave(struct rte_eth_dev *eth_dev, uint8_t port_id); + +void +activate_slave(struct rte_eth_dev *eth_dev, uint8_t port_id); + void link_properties_set(struct rte_eth_dev *bonded_eth_dev, struct rte_eth_link *slave_dev_link); @@ -174,6 +198,9 @@ link_properties_valid(struct rte_eth_link *bonded_dev_link, int mac_address_set(struct rte_eth_dev *eth_dev, struct ether_addr *new_mac_addr); +int +mac_address_get(struct rte_eth_dev *eth_dev, struct ether_addr *dst_mac_addr); + int mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev); -- 2.20.1