From: Adrien Mazarguil
Date: Fri, 1 Sep 2017 08:06:57 +0000 (+0200)
Subject: net/mlx4: separate Rx/Tx functions
X-Git-Tag: spdx-start~2014
X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=7f45cb82da4d5d509c720d67c257ccfe27aa9ae4;p=dpdk.git

net/mlx4: separate Rx/Tx functions

This commit groups all data plane functions (Rx/Tx) into a separate file
and adjusts header files accordingly.

Private functions are now prefixed with "mlx4_" to prevent them from
conflicting with their mlx5 PMD counterparts at link time.

No impact on functionality.

Signed-off-by: Adrien Mazarguil
---

diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index f6e3001e7d..8def32a0fa 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -38,6 +38,7 @@ LIB = librte_pmd_mlx4.a
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_utils.c
 
 # Basic CFLAGS.
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index ba060753c3..a409ec2299 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -56,13 +56,11 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
 #include
 #include
-#include
 #include
 
 /* Generated configuration header. */
@@ -505,9 +503,6 @@ mlx4_dev_configure(struct rte_eth_dev *dev)
 	return 0;
 }
 
-static uint16_t mlx4_tx_burst(void *, struct rte_mbuf **, uint16_t);
-static uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
-
 /* TX queues handling. */
 
 /**
@@ -630,53 +625,6 @@ txq_cleanup(struct txq *txq)
 	memset(txq, 0, sizeof(*txq));
 }
 
-/**
- * Manage TX completions.
- *
- * When sending a burst, mlx4_tx_burst() posts several WRs.
- * To improve performance, a completion event is only required once every
- * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
- * for other WRs, but this information would not be used anyway.
- *
- * @param txq
- *   Pointer to TX queue structure.
- *
- * @return
- *   0 on success, -1 on failure.
- */
-static int
-txq_complete(struct txq *txq)
-{
-	unsigned int elts_comp = txq->elts_comp;
-	unsigned int elts_tail = txq->elts_tail;
-	const unsigned int elts_n = txq->elts_n;
-	struct ibv_wc wcs[elts_comp];
-	int wcs_n;
-
-	if (unlikely(elts_comp == 0))
-		return 0;
-	wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
-	if (unlikely(wcs_n == 0))
-		return 0;
-	if (unlikely(wcs_n < 0)) {
-		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
-		      (void *)txq, wcs_n);
-		return -1;
-	}
-	elts_comp -= wcs_n;
-	assert(elts_comp <= txq->elts_comp);
-	/*
-	 * Assume WC status is successful as nothing can be done about it
-	 * anyway.
-	 */
-	elts_tail += wcs_n * txq->elts_comp_cd_init;
-	if (elts_tail >= elts_n)
-		elts_tail -= elts_n;
-	txq->elts_tail = elts_tail;
-	txq->elts_comp = elts_comp;
-	return 0;
-}
-
 struct mlx4_check_mempool_data {
 	int ret;
 	char *start;
@@ -738,10 +686,6 @@ static int mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start,
 	return data.ret;
 }
 
-/* For best performance, this function should not be inlined. */
-static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
-	__rte_noinline;
-
 /**
  * Register mempool as a memory region.
  *
@@ -753,7 +697,7 @@ static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
  * @return
  *   Memory region pointer, NULL in case of error and rte_errno is set.
  */
-static struct ibv_mr *
+struct ibv_mr *
 mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
 {
 	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
@@ -794,81 +738,6 @@ mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
 	return mr;
 }
 
-/**
- * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
- *
- * @param buf
- *   Pointer to mbuf.
- *
- * @return
- *   Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-txq_mb2mp(struct rte_mbuf *buf)
-{
-	if (unlikely(RTE_MBUF_INDIRECT(buf)))
-		return rte_mbuf_from_indirect(buf)->pool;
-	return buf->pool;
-}
-
-/**
- * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param[in] mp
- *   Memory Pool for which a Memory Region lkey must be returned.
- *
- * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
- */
-static uint32_t
-txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-{
-	unsigned int i;
-	struct ibv_mr *mr;
-
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
-		}
-	}
-	/* Add a new entry, register MR first. */
-	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
-	      (void *)txq, mp->name, (void *)mp);
-	mr = mlx4_mp2mr(txq->priv->pd, mp);
-	if (unlikely(mr == NULL)) {
-		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
-		      (void *)txq);
-		return (uint32_t)-1;
-	}
-	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
-		/* Table is full, remove oldest entry. */
-		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
-		      (void *)txq);
-		--i;
-		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
-		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
-			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
-	}
-	/* Store the new entry. */
-	txq->mp2mr[i].mp = mp;
-	txq->mp2mr[i].mr = mr;
-	txq->mp2mr[i].lkey = mr->lkey;
-	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
-	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
-	return txq->mp2mr[i].lkey;
-}
-
 struct txq_mp2mr_mbuf_check_data {
 	int ret;
 };
@@ -923,172 +792,7 @@ txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
 	if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
 	    data.ret == -1)
 		return;
-	txq_mp2mr(txq, mp);
-}
-
-/**
- * DPDK callback for TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct txq *txq = (struct txq *)dpdk_txq;
-	struct ibv_send_wr *wr_head = NULL;
-	struct ibv_send_wr **wr_next = &wr_head;
-	struct ibv_send_wr *wr_bad = NULL;
-	unsigned int elts_head = txq->elts_head;
-	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp_cd = txq->elts_comp_cd;
-	unsigned int elts_comp = 0;
-	unsigned int i;
-	unsigned int max;
-	int err;
-
-	assert(elts_comp_cd != 0);
-	txq_complete(txq);
-	max = (elts_n - (elts_head - txq->elts_tail));
-	if (max > elts_n)
-		max -= elts_n;
-	assert(max >= 1);
-	assert(max <= elts_n);
-	/* Always leave one free entry in the ring. */
-	--max;
-	if (max == 0)
-		return 0;
-	if (max > pkts_n)
-		max = pkts_n;
-	for (i = 0; (i != max); ++i) {
-		struct rte_mbuf *buf = pkts[i];
-		unsigned int elts_head_next =
-			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
-		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
-		struct txq_elt *elt = &(*txq->elts)[elts_head];
-		struct ibv_send_wr *wr = &elt->wr;
-		unsigned int segs = buf->nb_segs;
-		unsigned int sent_size = 0;
-		uint32_t send_flags = 0;
-
-		/* Clean up old buffer. */
-		if (likely(elt->buf != NULL)) {
-			struct rte_mbuf *tmp = elt->buf;
-
-#ifndef NDEBUG
-			/* Poisoning. */
-			memset(elt, 0x66, sizeof(*elt));
-#endif
-			/* Faster than rte_pktmbuf_free(). */
-			do {
-				struct rte_mbuf *next = tmp->next;
-
-				rte_pktmbuf_free_seg(tmp);
-				tmp = next;
-			} while (tmp != NULL);
-		}
-		/* Request TX completion. */
-		if (unlikely(--elts_comp_cd == 0)) {
-			elts_comp_cd = txq->elts_comp_cd_init;
-			++elts_comp;
-			send_flags |= IBV_SEND_SIGNALED;
-		}
-		if (likely(segs == 1)) {
-			struct ibv_sge *sge = &elt->sge;
-			uintptr_t addr;
-			uint32_t length;
-			uint32_t lkey;
-
-			/* Retrieve buffer information. */
-			addr = rte_pktmbuf_mtod(buf, uintptr_t);
-			length = buf->data_len;
-			/* Retrieve Memory Region key for this memory pool. */
-			lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-			if (unlikely(lkey == (uint32_t)-1)) {
-				/* MR does not exist. */
-				DEBUG("%p: unable to get MP <-> MR"
-				      " association", (void *)txq);
-				/* Clean up TX element. */
-				elt->buf = NULL;
-				goto stop;
-			}
-			/* Update element. */
-			elt->buf = buf;
-			if (txq->priv->vf)
-				rte_prefetch0((volatile void *)
-					      (uintptr_t)addr);
-			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-			sge->addr = addr;
-			sge->length = length;
-			sge->lkey = lkey;
-			sent_size += length;
-		} else {
-			err = -1;
-			goto stop;
-		}
-		if (sent_size <= txq->max_inline)
-			send_flags |= IBV_SEND_INLINE;
-		elts_head = elts_head_next;
-		/* Increment sent bytes counter. */
-		txq->stats.obytes += sent_size;
-		/* Set up WR. */
-		wr->sg_list = &elt->sge;
-		wr->num_sge = segs;
-		wr->opcode = IBV_WR_SEND;
-		wr->send_flags = send_flags;
-		*wr_next = wr;
-		wr_next = &wr->next;
-	}
-stop:
-	/* Take a shortcut if nothing must be sent. */
-	if (unlikely(i == 0))
-		return 0;
-	/* Increment sent packets counter. */
-	txq->stats.opackets += i;
-	/* Ring QP doorbell. */
-	*wr_next = NULL;
-	assert(wr_head);
-	err = ibv_post_send(txq->qp, wr_head, &wr_bad);
-	if (unlikely(err)) {
-		uint64_t obytes = 0;
-		uint64_t opackets = 0;
-
-		/* Rewind bad WRs. */
-		while (wr_bad != NULL) {
-			int j;
-
-			/* Force completion request if one was lost. */
-			if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
-				elts_comp_cd = 1;
-				--elts_comp;
-			}
-			++opackets;
-			for (j = 0; j < wr_bad->num_sge; ++j)
-				obytes += wr_bad->sg_list[j].length;
-			elts_head = (elts_head ? elts_head : elts_n) - 1;
-			wr_bad = wr_bad->next;
-		}
-		txq->stats.opackets -= opackets;
-		txq->stats.obytes -= obytes;
-		i -= opackets;
-		DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
-		      " (%" PRIu64 " bytes) rejected: %s",
-		      (void *)txq,
-		      opackets,
-		      obytes,
-		      (err <= -1) ? "Internal error" : strerror(err));
-	}
-	txq->elts_head = elts_head;
-	txq->elts_comp += elts_comp;
-	txq->elts_comp_cd = elts_comp_cd;
-	return i;
+	mlx4_txq_mp2mr(txq, mp);
 }
 
 /**
@@ -1545,132 +1249,6 @@ rxq_cleanup(struct rxq *rxq)
 	memset(rxq, 0, sizeof(*rxq));
 }
 
-/**
- * DPDK callback for RX.
- *
- * The following function doesn't manage scattered packets.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
-	const unsigned int elts_n = rxq->elts_n;
-	unsigned int elts_head = rxq->elts_head;
-	struct ibv_wc wcs[pkts_n];
-	struct ibv_recv_wr *wr_head = NULL;
-	struct ibv_recv_wr **wr_next = &wr_head;
-	struct ibv_recv_wr *wr_bad = NULL;
-	unsigned int i;
-	unsigned int pkts_ret = 0;
-	int ret;
-
-	ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
-	if (unlikely(ret == 0))
-		return 0;
-	if (unlikely(ret < 0)) {
-		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
-		      (void *)rxq, ret);
-		return 0;
-	}
-	assert(ret <= (int)pkts_n);
-	/* For each work completion. */
-	for (i = 0; i != (unsigned int)ret; ++i) {
-		struct ibv_wc *wc = &wcs[i];
-		struct rxq_elt *elt = &(*elts)[elts_head];
-		struct ibv_recv_wr *wr = &elt->wr;
-		uint32_t len = wc->byte_len;
-		struct rte_mbuf *seg = elt->buf;
-		struct rte_mbuf *rep;
-
-		/* Sanity checks. */
-		assert(wr->sg_list == &elt->sge);
-		assert(wr->num_sge == 1);
-		assert(elts_head < rxq->elts_n);
-		assert(rxq->elts_head < rxq->elts_n);
-		/*
-		 * Fetch initial bytes of packet descriptor into a
-		 * cacheline while allocating rep.
-		 */
-		rte_mbuf_prefetch_part1(seg);
-		rte_mbuf_prefetch_part2(seg);
-		/* Link completed WRs together for repost. */
-		*wr_next = wr;
-		wr_next = &wr->next;
-		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
-			/* Whatever, just repost the offending WR. */
-			DEBUG("rxq=%p: bad work completion status (%d): %s",
-			      (void *)rxq, wc->status,
-			      ibv_wc_status_str(wc->status));
-			/* Increment dropped packets counter. */
-			++rxq->stats.idropped;
-			goto repost;
-		}
-		rep = rte_mbuf_raw_alloc(rxq->mp);
-		if (unlikely(rep == NULL)) {
-			/*
-			 * Unable to allocate a replacement mbuf,
-			 * repost WR.
-			 */
-			DEBUG("rxq=%p: can't allocate a new mbuf",
-			      (void *)rxq);
-			/* Increase out of memory counters. */
-			++rxq->stats.rx_nombuf;
-			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-			goto repost;
-		}
-		/* Reconfigure sge to use rep instead of seg. */
-		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-		assert(elt->sge.lkey == rxq->mr->lkey);
-		elt->buf = rep;
-		/* Update seg information. */
-		seg->data_off = RTE_PKTMBUF_HEADROOM;
-		seg->nb_segs = 1;
-		seg->port = rxq->port_id;
-		seg->next = NULL;
-		seg->pkt_len = len;
-		seg->data_len = len;
-		seg->packet_type = 0;
-		seg->ol_flags = 0;
-		/* Return packet. */
-		*(pkts++) = seg;
-		++pkts_ret;
-		/* Increase bytes counter. */
-		rxq->stats.ibytes += len;
-repost:
-		if (++elts_head >= elts_n)
-			elts_head = 0;
-		continue;
-	}
-	if (unlikely(i == 0))
-		return 0;
-	/* Repost WRs. */
-	*wr_next = NULL;
-	assert(wr_head);
-	ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
-	if (unlikely(ret)) {
-		/* Inability to repost WRs is fatal. */
-		DEBUG("%p: recv_burst(): failed (ret=%d)",
-		      (void *)rxq->priv,
-		      ret);
-		abort();
-	}
-	rxq->elts_head = elts_head;
-	/* Increase packets counter. */
-	rxq->stats.ipackets += pkts_ret;
-	return pkts_ret;
-}
-
 /**
  * Allocate a Queue Pair.
  * Optionally setup inline receive if supported.
@@ -2031,56 +1609,6 @@ mlx4_dev_stop(struct rte_eth_dev *dev)
 	priv_mac_addr_del(priv);
 }
 
-/**
- * Dummy DPDK callback for TX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	(void)dpdk_txq;
-	(void)pkts;
-	(void)pkts_n;
-	return 0;
-}
-
-/**
- * Dummy DPDK callback for RX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-	(void)dpdk_rxq;
-	(void)pkts;
-	(void)pkts_n;
-	return 0;
-}
-
 /**
  * DPDK callback to close the device.
  *
@@ -2107,8 +1635,8 @@ mlx4_dev_close(struct rte_eth_dev *dev)
 	 * still required for DPDK 1.3 because some programs (such as testpmd)
 	 * never release them before closing the device.
 	 */
-	dev->rx_pkt_burst = removed_rx_burst;
-	dev->tx_pkt_burst = removed_tx_burst;
+	dev->rx_pkt_burst = mlx4_rx_burst_removed;
+	dev->tx_pkt_burst = mlx4_tx_burst_removed;
 	if (priv->rxqs != NULL) {
 		/* XXX race condition if mlx4_rx_burst() is still running. */
 		usleep(1000);
@@ -2173,8 +1701,8 @@ priv_set_link(struct priv *priv, int up)
 		err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
 		if (err)
 			return err;
-		dev->rx_pkt_burst = removed_rx_burst;
-		dev->tx_pkt_burst = removed_tx_burst;
+		dev->rx_pkt_burst = mlx4_rx_burst_removed;
+		dev->tx_pkt_burst = mlx4_tx_burst_removed;
 	}
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index edbece613a..efccf1a88d 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -49,6 +49,7 @@
 #include
 #include
 #include
+#include
 
 /* Request send completion once in every 64 sends, might be less. */
 #define MLX4_PMD_TX_PER_COMP_REQ 64
@@ -115,6 +116,7 @@ struct priv {
 
 /* mlx4.c */
 
+struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
 int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete);
 
 /* mlx4_intr.c */
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
new file mode 100644
index 0000000000..b5e77771c8
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -0,0 +1,524 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Data plane functions for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include "mlx4.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Manage Tx completions.
+ *
+ * When sending a burst, mlx4_tx_burst() posts several WRs.
+ * To improve performance, a completion event is only required once every
+ * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
+ * for other WRs, but this information would not be used anyway.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ *
+ * @return
+ *   0 on success, -1 on failure.
+ */
+static int
+mlx4_txq_complete(struct txq *txq)
+{
+	unsigned int elts_comp = txq->elts_comp;
+	unsigned int elts_tail = txq->elts_tail;
+	const unsigned int elts_n = txq->elts_n;
+	struct ibv_wc wcs[elts_comp];
+	int wcs_n;
+
+	if (unlikely(elts_comp == 0))
+		return 0;
+	wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
+	if (unlikely(wcs_n == 0))
+		return 0;
+	if (unlikely(wcs_n < 0)) {
+		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
+		      (void *)txq, wcs_n);
+		return -1;
+	}
+	elts_comp -= wcs_n;
+	assert(elts_comp <= txq->elts_comp);
+	/*
+	 * Assume WC status is successful as nothing can be done about it
+	 * anyway.
+	 */
+	elts_tail += wcs_n * txq->elts_comp_cd_init;
+	if (elts_tail >= elts_n)
+		elts_tail -= elts_n;
+	txq->elts_tail = elts_tail;
+	txq->elts_comp = elts_comp;
+	return 0;
+}
+
+/**
+ * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
+ * the cloned mbuf is allocated is returned instead.
+ *
+ * @param buf
+ *   Pointer to mbuf.
+ *
+ * @return
+ *   Memory pool where data is located for given mbuf.
+ */
+static struct rte_mempool *
+mlx4_txq_mb2mp(struct rte_mbuf *buf)
+{
+	if (unlikely(RTE_MBUF_INDIRECT(buf)))
+		return rte_mbuf_from_indirect(buf)->pool;
+	return buf->pool;
+}
+
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
+ * remove an entry first.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+	unsigned int i;
+	struct ibv_mr *mr;
+
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (unlikely(txq->mp2mr[i].mp == NULL)) {
+			/* Unknown MP, add a new MR for it. */
+			break;
+		}
+		if (txq->mp2mr[i].mp == mp) {
+			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
+			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+			return txq->mp2mr[i].lkey;
+		}
+	}
+	/* Add a new entry, register MR first. */
+	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+	      (void *)txq, mp->name, (void *)mp);
+	mr = mlx4_mp2mr(txq->priv->pd, mp);
+	if (unlikely(mr == NULL)) {
+		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+		      (void *)txq);
+		return (uint32_t)-1;
+	}
+	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
+		/* Table is full, remove oldest entry. */
+		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+		      (void *)txq);
+		--i;
+		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+	}
+	/* Store the new entry. */
+	txq->mp2mr[i].mp = mp;
+	txq->mp2mr[i].mr = mr;
+	txq->mp2mr[i].lkey = mr->lkey;
+	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
+	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
+	return txq->mp2mr[i].lkey;
+}
+
+/**
+ * DPDK callback for Tx.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct txq *txq = (struct txq *)dpdk_txq;
+	struct ibv_send_wr *wr_head = NULL;
+	struct ibv_send_wr **wr_next = &wr_head;
+	struct ibv_send_wr *wr_bad = NULL;
+	unsigned int elts_head = txq->elts_head;
+	const unsigned int elts_n = txq->elts_n;
+	unsigned int elts_comp_cd = txq->elts_comp_cd;
+	unsigned int elts_comp = 0;
+	unsigned int i;
+	unsigned int max;
+	int err;
+
+	assert(elts_comp_cd != 0);
+	mlx4_txq_complete(txq);
+	max = (elts_n - (elts_head - txq->elts_tail));
+	if (max > elts_n)
+		max -= elts_n;
+	assert(max >= 1);
+	assert(max <= elts_n);
+	/* Always leave one free entry in the ring. */
+	--max;
+	if (max == 0)
+		return 0;
+	if (max > pkts_n)
+		max = pkts_n;
+	for (i = 0; (i != max); ++i) {
+		struct rte_mbuf *buf = pkts[i];
+		unsigned int elts_head_next =
+			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
+		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
+		struct txq_elt *elt = &(*txq->elts)[elts_head];
+		struct ibv_send_wr *wr = &elt->wr;
+		unsigned int segs = buf->nb_segs;
+		unsigned int sent_size = 0;
+		uint32_t send_flags = 0;
+
+		/* Clean up old buffer. */
+		if (likely(elt->buf != NULL)) {
+			struct rte_mbuf *tmp = elt->buf;
+
+#ifndef NDEBUG
+			/* Poisoning. */
+			memset(elt, 0x66, sizeof(*elt));
+#endif
+			/* Faster than rte_pktmbuf_free(). */
+			do {
+				struct rte_mbuf *next = tmp->next;
+
+				rte_pktmbuf_free_seg(tmp);
+				tmp = next;
+			} while (tmp != NULL);
+		}
+		/* Request Tx completion. */
+		if (unlikely(--elts_comp_cd == 0)) {
+			elts_comp_cd = txq->elts_comp_cd_init;
+			++elts_comp;
+			send_flags |= IBV_SEND_SIGNALED;
+		}
+		if (likely(segs == 1)) {
+			struct ibv_sge *sge = &elt->sge;
+			uintptr_t addr;
+			uint32_t length;
+			uint32_t lkey;
+
+			/* Retrieve buffer information. */
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
+			length = buf->data_len;
+			/* Retrieve memory region key for this memory pool. */
+			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+			if (unlikely(lkey == (uint32_t)-1)) {
+				/* MR does not exist. */
+				DEBUG("%p: unable to get MP <-> MR"
+				      " association", (void *)txq);
+				/* Clean up Tx element. */
+				elt->buf = NULL;
+				goto stop;
+			}
+			/* Update element. */
+			elt->buf = buf;
+			if (txq->priv->vf)
+				rte_prefetch0((volatile void *)
+					      (uintptr_t)addr);
+			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+			sge->addr = addr;
+			sge->length = length;
+			sge->lkey = lkey;
+			sent_size += length;
+		} else {
+			err = -1;
+			goto stop;
+		}
+		if (sent_size <= txq->max_inline)
+			send_flags |= IBV_SEND_INLINE;
+		elts_head = elts_head_next;
+		/* Increment sent bytes counter. */
+		txq->stats.obytes += sent_size;
+		/* Set up WR. */
+		wr->sg_list = &elt->sge;
+		wr->num_sge = segs;
+		wr->opcode = IBV_WR_SEND;
+		wr->send_flags = send_flags;
+		*wr_next = wr;
+		wr_next = &wr->next;
+	}
+stop:
+	/* Take a shortcut if nothing must be sent. */
+	if (unlikely(i == 0))
+		return 0;
+	/* Increment sent packets counter. */
+	txq->stats.opackets += i;
+	/* Ring QP doorbell. */
+	*wr_next = NULL;
+	assert(wr_head);
+	err = ibv_post_send(txq->qp, wr_head, &wr_bad);
+	if (unlikely(err)) {
+		uint64_t obytes = 0;
+		uint64_t opackets = 0;
+
+		/* Rewind bad WRs. */
+		while (wr_bad != NULL) {
+			int j;
+
+			/* Force completion request if one was lost. */
+			if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
+				elts_comp_cd = 1;
+				--elts_comp;
+			}
+			++opackets;
+			for (j = 0; j < wr_bad->num_sge; ++j)
+				obytes += wr_bad->sg_list[j].length;
+			elts_head = (elts_head ? elts_head : elts_n) - 1;
+			wr_bad = wr_bad->next;
+		}
+		txq->stats.opackets -= opackets;
+		txq->stats.obytes -= obytes;
+		i -= opackets;
+		DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
+		      " (%" PRIu64 " bytes) rejected: %s",
+		      (void *)txq,
+		      opackets,
+		      obytes,
+		      (err <= -1) ? "Internal error" : strerror(err));
+	}
+	txq->elts_head = elts_head;
+	txq->elts_comp += elts_comp;
+	txq->elts_comp_cd = elts_comp_cd;
+	return i;
+}
+
+/**
+ * DPDK callback for Rx.
+ *
+ * The following function doesn't manage scattered packets.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct rxq *rxq = (struct rxq *)dpdk_rxq;
+	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
+	const unsigned int elts_n = rxq->elts_n;
+	unsigned int elts_head = rxq->elts_head;
+	struct ibv_wc wcs[pkts_n];
+	struct ibv_recv_wr *wr_head = NULL;
+	struct ibv_recv_wr **wr_next = &wr_head;
+	struct ibv_recv_wr *wr_bad = NULL;
+	unsigned int i;
+	unsigned int pkts_ret = 0;
+	int ret;
+
+	ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
+	if (unlikely(ret == 0))
+		return 0;
+	if (unlikely(ret < 0)) {
+		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
+		      (void *)rxq, ret);
+		return 0;
+	}
+	assert(ret <= (int)pkts_n);
+	/* For each work completion. */
+	for (i = 0; i != (unsigned int)ret; ++i) {
+		struct ibv_wc *wc = &wcs[i];
+		struct rxq_elt *elt = &(*elts)[elts_head];
+		struct ibv_recv_wr *wr = &elt->wr;
+		uint32_t len = wc->byte_len;
+		struct rte_mbuf *seg = elt->buf;
+		struct rte_mbuf *rep;
+
+		/* Sanity checks. */
+		assert(wr->sg_list == &elt->sge);
+		assert(wr->num_sge == 1);
+		assert(elts_head < rxq->elts_n);
+		assert(rxq->elts_head < rxq->elts_n);
+		/*
+		 * Fetch initial bytes of packet descriptor into a
+		 * cacheline while allocating rep.
+		 */
+		rte_mbuf_prefetch_part1(seg);
+		rte_mbuf_prefetch_part2(seg);
+		/* Link completed WRs together for repost. */
+		*wr_next = wr;
+		wr_next = &wr->next;
+		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
+			/* Whatever, just repost the offending WR. */
+			DEBUG("rxq=%p: bad work completion status (%d): %s",
+			      (void *)rxq, wc->status,
+			      ibv_wc_status_str(wc->status));
+			/* Increment dropped packets counter. */
+			++rxq->stats.idropped;
+			goto repost;
+		}
+		rep = rte_mbuf_raw_alloc(rxq->mp);
+		if (unlikely(rep == NULL)) {
+			/*
+			 * Unable to allocate a replacement mbuf,
+			 * repost WR.
+			 */
+			DEBUG("rxq=%p: can't allocate a new mbuf",
+			      (void *)rxq);
+			/* Increase out of memory counters. */
+			++rxq->stats.rx_nombuf;
+			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
+			goto repost;
+		}
+		/* Reconfigure sge to use rep instead of seg. */
+		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
+		assert(elt->sge.lkey == rxq->mr->lkey);
+		elt->buf = rep;
+		/* Update seg information. */
+		seg->data_off = RTE_PKTMBUF_HEADROOM;
+		seg->nb_segs = 1;
+		seg->port = rxq->port_id;
+		seg->next = NULL;
+		seg->pkt_len = len;
+		seg->data_len = len;
+		seg->packet_type = 0;
+		seg->ol_flags = 0;
+		/* Return packet. */
+		*(pkts++) = seg;
+		++pkts_ret;
+		/* Increase bytes counter. */
+		rxq->stats.ibytes += len;
+repost:
+		if (++elts_head >= elts_n)
+			elts_head = 0;
+		continue;
+	}
+	if (unlikely(i == 0))
+		return 0;
+	/* Repost WRs. */
+	*wr_next = NULL;
+	assert(wr_head);
+	ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
+	if (unlikely(ret)) {
+		/* Inability to repost WRs is fatal. */
+		DEBUG("%p: recv_burst(): failed (ret=%d)",
+		      (void *)rxq->priv,
+		      ret);
+		abort();
+	}
+	rxq->elts_head = elts_head;
+	/* Increase packets counter. */
+	rxq->stats.ipackets += pkts_ret;
+	return pkts_ret;
+}
+
+/**
+ * Dummy DPDK callback for Tx.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	(void)dpdk_txq;
+	(void)pkts;
+	(void)pkts_n;
+	return 0;
+}
+
+/**
+ * Dummy DPDK callback for Rx.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	(void)dpdk_rxq;
+	(void)pkts;
+	(void)pkts_n;
+	return 0;
+}
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index ea55aed787..669c8a4540 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -119,4 +119,16 @@ struct txq {
 	unsigned int socket; /**< CPU socket ID for allocations. */
 };
 
+/* mlx4_rxtx.c */
+
+uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
+uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
+		       uint16_t pkts_n);
+uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
+		       uint16_t pkts_n);
+uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+
 #endif /* MLX4_RXTX_H_ */
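
For context on how the callbacks exported above get exercised: once the PMD
assigns them to dev->rx_pkt_burst/dev->tx_pkt_burst (see mlx4_dev_close() and
priv_set_link() in the mlx4.c hunks), applications reach them through the
generic ethdev burst API. Below is a minimal, illustrative sketch of that
fast path; it is not part of this commit, and the forward_loop() helper, port
0, queue 0 and the burst size of 32 are arbitrary assumptions for the example.

#include <rte_ethdev.h>
#include <rte_mbuf.h>

/*
 * Illustrative polling loop. rte_eth_rx_burst()/rte_eth_tx_burst()
 * dispatch through dev->rx_pkt_burst/dev->tx_pkt_burst, which for an
 * mlx4 port are the mlx4_rx_burst()/mlx4_tx_burst() callbacks moved by
 * this commit (or the *_removed stubs while the port is being torn down).
 */
static void
forward_loop(void)
{
	struct rte_mbuf *pkts[32];
	uint16_t n, sent;

	for (;;) {
		/* Ends up in mlx4_rx_burst() for an mlx4 device. */
		n = rte_eth_rx_burst(0, 0, pkts, 32);
		if (n == 0)
			continue;
		/* Ends up in mlx4_tx_burst(). */
		sent = rte_eth_tx_burst(0, 0, pkts, n);
		/* Free any packets the Tx burst did not accept. */
		while (sent < n)
			rte_pktmbuf_free(pkts[sent++]);
	}
}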