SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_utils.c
# Basic CFLAGS.
#include <rte_mbuf.h>
#include <rte_errno.h>
#include <rte_mempool.h>
-#include <rte_prefetch.h>
#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_flow.h>
#include <rte_kvargs.h>
#include <rte_interrupts.h>
-#include <rte_branch_prediction.h>
#include <rte_common.h>
/* Generated configuration header. */
return 0;
}
-static uint16_t mlx4_tx_burst(void *, struct rte_mbuf **, uint16_t);
-static uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
-
/* TX queues handling. */
/**
memset(txq, 0, sizeof(*txq));
}
-/**
- * Manage TX completions.
- *
- * When sending a burst, mlx4_tx_burst() posts several WRs.
- * To improve performance, a completion event is only required once every
- * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
- * for other WRs, but this information would not be used anyway.
- *
- * @param txq
- * Pointer to TX queue structure.
- *
- * @return
- * 0 on success, -1 on failure.
- */
-static int
-txq_complete(struct txq *txq)
-{
- unsigned int elts_comp = txq->elts_comp;
- unsigned int elts_tail = txq->elts_tail;
- const unsigned int elts_n = txq->elts_n;
- struct ibv_wc wcs[elts_comp];
- int wcs_n;
-
- if (unlikely(elts_comp == 0))
- return 0;
- wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
- if (unlikely(wcs_n == 0))
- return 0;
- if (unlikely(wcs_n < 0)) {
- DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
- (void *)txq, wcs_n);
- return -1;
- }
- elts_comp -= wcs_n;
- assert(elts_comp <= txq->elts_comp);
- /*
- * Assume WC status is successful as nothing can be done about it
- * anyway.
- */
- elts_tail += wcs_n * txq->elts_comp_cd_init;
- if (elts_tail >= elts_n)
- elts_tail -= elts_n;
- txq->elts_tail = elts_tail;
- txq->elts_comp = elts_comp;
- return 0;
-}
-
struct mlx4_check_mempool_data {
int ret;
char *start;
return data.ret;
}
-/* For best performance, this function should not be inlined. */
-static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
- __rte_noinline;
-
/**
* Register mempool as a memory region.
*
* @return
* Memory region pointer, NULL in case of error and rte_errno is set.
*/
-static struct ibv_mr *
+struct ibv_mr *
mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
{
const struct rte_memseg *ms = rte_eal_get_physmem_layout();
return mr;
}
-/**
- * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
- *
- * @param buf
- * Pointer to mbuf.
- *
- * @return
- * Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-txq_mb2mp(struct rte_mbuf *buf)
-{
- if (unlikely(RTE_MBUF_INDIRECT(buf)))
- return rte_mbuf_from_indirect(buf)->pool;
- return buf->pool;
-}
-
-/**
- * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
- *
- * @param txq
- * Pointer to TX queue structure.
- * @param[in] mp
- * Memory Pool for which a Memory Region lkey must be returned.
- *
- * @return
- * mr->lkey on success, (uint32_t)-1 on failure.
- */
-static uint32_t
-txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-{
- unsigned int i;
- struct ibv_mr *mr;
-
- for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
- if (unlikely(txq->mp2mr[i].mp == NULL)) {
- /* Unknown MP, add a new MR for it. */
- break;
- }
- if (txq->mp2mr[i].mp == mp) {
- assert(txq->mp2mr[i].lkey != (uint32_t)-1);
- assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
- }
- }
- /* Add a new entry, register MR first. */
- DEBUG("%p: discovered new memory pool \"%s\" (%p)",
- (void *)txq, mp->name, (void *)mp);
- mr = mlx4_mp2mr(txq->priv->pd, mp);
- if (unlikely(mr == NULL)) {
- DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
- (void *)txq);
- return (uint32_t)-1;
- }
- if (unlikely(i == RTE_DIM(txq->mp2mr))) {
- /* Table is full, remove oldest entry. */
- DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
- (void *)txq);
- --i;
- claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
- memmove(&txq->mp2mr[0], &txq->mp2mr[1],
- (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
- }
- /* Store the new entry. */
- txq->mp2mr[i].mp = mp;
- txq->mp2mr[i].mr = mr;
- txq->mp2mr[i].lkey = mr->lkey;
- DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
- (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
- return txq->mp2mr[i].lkey;
-}
-
struct txq_mp2mr_mbuf_check_data {
int ret;
};
if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
data.ret == -1)
return;
- txq_mp2mr(txq, mp);
-}
-
-/**
- * DPDK callback for TX.
- *
- * @param dpdk_txq
- * Generic pointer to TX queue structure.
- * @param[in] pkts
- * Packets to transmit.
- * @param pkts_n
- * Number of packets in array.
- *
- * @return
- * Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- struct txq *txq = (struct txq *)dpdk_txq;
- struct ibv_send_wr *wr_head = NULL;
- struct ibv_send_wr **wr_next = &wr_head;
- struct ibv_send_wr *wr_bad = NULL;
- unsigned int elts_head = txq->elts_head;
- const unsigned int elts_n = txq->elts_n;
- unsigned int elts_comp_cd = txq->elts_comp_cd;
- unsigned int elts_comp = 0;
- unsigned int i;
- unsigned int max;
- int err;
-
- assert(elts_comp_cd != 0);
- txq_complete(txq);
- max = (elts_n - (elts_head - txq->elts_tail));
- if (max > elts_n)
- max -= elts_n;
- assert(max >= 1);
- assert(max <= elts_n);
- /* Always leave one free entry in the ring. */
- --max;
- if (max == 0)
- return 0;
- if (max > pkts_n)
- max = pkts_n;
- for (i = 0; (i != max); ++i) {
- struct rte_mbuf *buf = pkts[i];
- unsigned int elts_head_next =
- (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
- struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
- struct txq_elt *elt = &(*txq->elts)[elts_head];
- struct ibv_send_wr *wr = &elt->wr;
- unsigned int segs = buf->nb_segs;
- unsigned int sent_size = 0;
- uint32_t send_flags = 0;
-
- /* Clean up old buffer. */
- if (likely(elt->buf != NULL)) {
- struct rte_mbuf *tmp = elt->buf;
-
-#ifndef NDEBUG
- /* Poisoning. */
- memset(elt, 0x66, sizeof(*elt));
-#endif
- /* Faster than rte_pktmbuf_free(). */
- do {
- struct rte_mbuf *next = tmp->next;
-
- rte_pktmbuf_free_seg(tmp);
- tmp = next;
- } while (tmp != NULL);
- }
- /* Request TX completion. */
- if (unlikely(--elts_comp_cd == 0)) {
- elts_comp_cd = txq->elts_comp_cd_init;
- ++elts_comp;
- send_flags |= IBV_SEND_SIGNALED;
- }
- if (likely(segs == 1)) {
- struct ibv_sge *sge = &elt->sge;
- uintptr_t addr;
- uint32_t length;
- uint32_t lkey;
-
- /* Retrieve buffer information. */
- addr = rte_pktmbuf_mtod(buf, uintptr_t);
- length = buf->data_len;
- /* Retrieve Memory Region key for this memory pool. */
- lkey = txq_mp2mr(txq, txq_mb2mp(buf));
- if (unlikely(lkey == (uint32_t)-1)) {
- /* MR does not exist. */
- DEBUG("%p: unable to get MP <-> MR"
- " association", (void *)txq);
- /* Clean up TX element. */
- elt->buf = NULL;
- goto stop;
- }
- /* Update element. */
- elt->buf = buf;
- if (txq->priv->vf)
- rte_prefetch0((volatile void *)
- (uintptr_t)addr);
- RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
- sge->addr = addr;
- sge->length = length;
- sge->lkey = lkey;
- sent_size += length;
- } else {
- err = -1;
- goto stop;
- }
- if (sent_size <= txq->max_inline)
- send_flags |= IBV_SEND_INLINE;
- elts_head = elts_head_next;
- /* Increment sent bytes counter. */
- txq->stats.obytes += sent_size;
- /* Set up WR. */
- wr->sg_list = &elt->sge;
- wr->num_sge = segs;
- wr->opcode = IBV_WR_SEND;
- wr->send_flags = send_flags;
- *wr_next = wr;
- wr_next = &wr->next;
- }
-stop:
- /* Take a shortcut if nothing must be sent. */
- if (unlikely(i == 0))
- return 0;
- /* Increment sent packets counter. */
- txq->stats.opackets += i;
- /* Ring QP doorbell. */
- *wr_next = NULL;
- assert(wr_head);
- err = ibv_post_send(txq->qp, wr_head, &wr_bad);
- if (unlikely(err)) {
- uint64_t obytes = 0;
- uint64_t opackets = 0;
-
- /* Rewind bad WRs. */
- while (wr_bad != NULL) {
- int j;
-
- /* Force completion request if one was lost. */
- if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
- elts_comp_cd = 1;
- --elts_comp;
- }
- ++opackets;
- for (j = 0; j < wr_bad->num_sge; ++j)
- obytes += wr_bad->sg_list[j].length;
- elts_head = (elts_head ? elts_head : elts_n) - 1;
- wr_bad = wr_bad->next;
- }
- txq->stats.opackets -= opackets;
- txq->stats.obytes -= obytes;
- i -= opackets;
- DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
- " (%" PRIu64 " bytes) rejected: %s",
- (void *)txq,
- opackets,
- obytes,
- (err <= -1) ? "Internal error" : strerror(err));
- }
- txq->elts_head = elts_head;
- txq->elts_comp += elts_comp;
- txq->elts_comp_cd = elts_comp_cd;
- return i;
+ mlx4_txq_mp2mr(txq, mp);
}
/**
memset(rxq, 0, sizeof(*rxq));
}
-/**
- * DPDK callback for RX.
- *
- * The following function doesn't manage scattered packets.
- *
- * @param dpdk_rxq
- * Generic pointer to RX queue structure.
- * @param[out] pkts
- * Array to store received packets.
- * @param pkts_n
- * Maximum number of packets in array.
- *
- * @return
- * Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- struct ibv_wc wcs[pkts_n];
- struct ibv_recv_wr *wr_head = NULL;
- struct ibv_recv_wr **wr_next = &wr_head;
- struct ibv_recv_wr *wr_bad = NULL;
- unsigned int i;
- unsigned int pkts_ret = 0;
- int ret;
-
- ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
- if (unlikely(ret == 0))
- return 0;
- if (unlikely(ret < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
- (void *)rxq, ret);
- return 0;
- }
- assert(ret <= (int)pkts_n);
- /* For each work completion. */
- for (i = 0; i != (unsigned int)ret; ++i) {
- struct ibv_wc *wc = &wcs[i];
- struct rxq_elt *elt = &(*elts)[elts_head];
- struct ibv_recv_wr *wr = &elt->wr;
- uint32_t len = wc->byte_len;
- struct rte_mbuf *seg = elt->buf;
- struct rte_mbuf *rep;
-
- /* Sanity checks. */
- assert(wr->sg_list == &elt->sge);
- assert(wr->num_sge == 1);
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_mbuf_prefetch_part1(seg);
- rte_mbuf_prefetch_part2(seg);
- /* Link completed WRs together for repost. */
- *wr_next = wr;
- wr_next = &wr->next;
- if (unlikely(wc->status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p: bad work completion status (%d): %s",
- (void *)rxq, wc->status,
- ibv_wc_status_str(wc->status));
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
- goto repost;
- }
- rep = rte_mbuf_raw_alloc(rxq->mp);
- if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p: can't allocate a new mbuf",
- (void *)rxq);
- /* Increase out of memory counters. */
- ++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
- }
- /* Reconfigure sge to use rep instead of seg. */
- elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- assert(elt->sge.lkey == rxq->mr->lkey);
- elt->buf = rep;
- /* Update seg information. */
- seg->data_off = RTE_PKTMBUF_HEADROOM;
- seg->nb_segs = 1;
- seg->port = rxq->port_id;
- seg->next = NULL;
- seg->pkt_len = len;
- seg->data_len = len;
- seg->packet_type = 0;
- seg->ol_flags = 0;
- /* Return packet. */
- *(pkts++) = seg;
- ++pkts_ret;
- /* Increase bytes counter. */
- rxq->stats.ibytes += len;
-repost:
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
- }
- if (unlikely(i == 0))
- return 0;
- /* Repost WRs. */
- *wr_next = NULL;
- assert(wr_head);
- ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_burst(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- rxq->elts_head = elts_head;
- /* Increase packets counter. */
- rxq->stats.ipackets += pkts_ret;
- return pkts_ret;
-}
-
/**
* Allocate a Queue Pair.
* Optionally setup inline receive if supported.
priv_mac_addr_del(priv);
}
-/**
- * Dummy DPDK callback for TX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_txq
- * Generic pointer to TX queue structure.
- * @param[in] pkts
- * Packets to transmit.
- * @param pkts_n
- * Number of packets in array.
- *
- * @return
- * Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- (void)dpdk_txq;
- (void)pkts;
- (void)pkts_n;
- return 0;
-}
-
-/**
- * Dummy DPDK callback for RX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_rxq
- * Generic pointer to RX queue structure.
- * @param[out] pkts
- * Array to store received packets.
- * @param pkts_n
- * Maximum number of packets in array.
- *
- * @return
- * Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
- (void)dpdk_rxq;
- (void)pkts;
- (void)pkts_n;
- return 0;
-}
-
/**
* DPDK callback to close the device.
*
* still required for DPDK 1.3 because some programs (such as testpmd)
* never release them before closing the device.
*/
- dev->rx_pkt_burst = removed_rx_burst;
- dev->tx_pkt_burst = removed_tx_burst;
+ dev->rx_pkt_burst = mlx4_rx_burst_removed;
+ dev->tx_pkt_burst = mlx4_tx_burst_removed;
if (priv->rxqs != NULL) {
/* XXX race condition if mlx4_rx_burst() is still running. */
usleep(1000);
err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
if (err)
return err;
- dev->rx_pkt_burst = removed_rx_burst;
- dev->tx_pkt_burst = removed_tx_burst;
+ dev->rx_pkt_burst = mlx4_rx_burst_removed;
+ dev->tx_pkt_burst = mlx4_tx_burst_removed;
}
return 0;
}
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_interrupts.h>
+#include <rte_mempool.h>
/* Request send completion once in every 64 sends, might be less. */
#define MLX4_PMD_TX_PER_COMP_REQ 64
/* mlx4.c */
+struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete);
/* mlx4_intr.c */
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Data plane functions for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include "mlx4.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Manage Tx completions.
+ *
+ * When sending a burst, mlx4_tx_burst() posts several WRs.
+ * To improve performance, a completion event is only required once every
+ * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
+ * for other WRs, but this information would not be used anyway.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ *
+ * @return
+ * 0 on success, -1 on failure.
+ */
+static int
+mlx4_txq_complete(struct txq *txq)
+{
+ unsigned int elts_comp = txq->elts_comp;
+ unsigned int elts_tail = txq->elts_tail;
+ const unsigned int elts_n = txq->elts_n;
+ struct ibv_wc wcs[elts_comp];
+ int wcs_n;
+
+ if (unlikely(elts_comp == 0))
+ return 0;
+ wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
+ if (unlikely(wcs_n == 0))
+ return 0;
+ if (unlikely(wcs_n < 0)) {
+ DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
+ (void *)txq, wcs_n);
+ return -1;
+ }
+ elts_comp -= wcs_n;
+ assert(elts_comp <= txq->elts_comp);
+ /*
+ * Assume WC status is successful as nothing can be done about it
+ * anyway.
+ */
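+	/*
+	 * Each signaled completion stands for elts_comp_cd_init posted
+	 * sends; advance the tail index accordingly.
+	 */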
+ elts_tail += wcs_n * txq->elts_comp_cd_init;
+ if (elts_tail >= elts_n)
+ elts_tail -= elts_n;
+ txq->elts_tail = elts_tail;
+ txq->elts_comp = elts_comp;
+ return 0;
+}
+
+/**
+ * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
+ * the cloned mbuf is allocated is returned instead.
+ *
+ * @param buf
+ * Pointer to mbuf.
+ *
+ * @return
+ * Memory pool where data is located for given mbuf.
+ */
+static struct rte_mempool *
+mlx4_txq_mb2mp(struct rte_mbuf *buf)
+{
+ if (unlikely(RTE_MBUF_INDIRECT(buf)))
+ return rte_mbuf_from_indirect(buf)->pool;
+ return buf->pool;
+}
+
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
+ * remove an entry first.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param[in] mp
+ * Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ * mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+ unsigned int i;
+ struct ibv_mr *mr;
+
+ for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+ if (unlikely(txq->mp2mr[i].mp == NULL)) {
+ /* Unknown MP, add a new MR for it. */
+ break;
+ }
+ if (txq->mp2mr[i].mp == mp) {
+ assert(txq->mp2mr[i].lkey != (uint32_t)-1);
+ assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+ return txq->mp2mr[i].lkey;
+ }
+ }
+ /* Add a new entry, register MR first. */
+ DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+ (void *)txq, mp->name, (void *)mp);
+ mr = mlx4_mp2mr(txq->priv->pd, mp);
+ if (unlikely(mr == NULL)) {
+ DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+ (void *)txq);
+ return (uint32_t)-1;
+ }
+ if (unlikely(i == RTE_DIM(txq->mp2mr))) {
+ /* Table is full, remove oldest entry. */
+ DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+ (void *)txq);
+ --i;
+ claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+ memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+ (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+ }
+ /* Store the new entry. */
+ txq->mp2mr[i].mp = mp;
+ txq->mp2mr[i].mr = mr;
+ txq->mp2mr[i].lkey = mr->lkey;
+ DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
+ (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
+ return txq->mp2mr[i].lkey;
+}
+
+/**
+ * DPDK callback for Tx.
+ *
+ * @param dpdk_txq
+ * Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ struct ibv_send_wr *wr_head = NULL;
+ struct ibv_send_wr **wr_next = &wr_head;
+ struct ibv_send_wr *wr_bad = NULL;
+ unsigned int elts_head = txq->elts_head;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int elts_comp_cd = txq->elts_comp_cd;
+ unsigned int elts_comp = 0;
+ unsigned int i;
+ unsigned int max;
+ int err;
+
+ assert(elts_comp_cd != 0);
+ mlx4_txq_complete(txq);
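+	/*
+	 * Compute the number of free ring entries. The unsigned
+	 * subtraction wraps when elts_head is numerically below
+	 * elts_tail; the test below corrects the result.
+	 */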
+ max = (elts_n - (elts_head - txq->elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ struct rte_mbuf *buf = pkts[i];
+ unsigned int elts_head_next =
+ (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
+ struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
+ struct txq_elt *elt = &(*txq->elts)[elts_head];
+ struct ibv_send_wr *wr = &elt->wr;
+ unsigned int segs = buf->nb_segs;
+ unsigned int sent_size = 0;
+ uint32_t send_flags = 0;
+
+ /* Clean up old buffer. */
+ if (likely(elt->buf != NULL)) {
+ struct rte_mbuf *tmp = elt->buf;
+
+#ifndef NDEBUG
+ /* Poisoning. */
+ memset(elt, 0x66, sizeof(*elt));
+#endif
+ /* Faster than rte_pktmbuf_free(). */
+ do {
+ struct rte_mbuf *next = tmp->next;
+
+ rte_pktmbuf_free_seg(tmp);
+ tmp = next;
+ } while (tmp != NULL);
+ }
+ /* Request Tx completion. */
+ if (unlikely(--elts_comp_cd == 0)) {
+ elts_comp_cd = txq->elts_comp_cd_init;
+ ++elts_comp;
+ send_flags |= IBV_SEND_SIGNALED;
+ }
+ if (likely(segs == 1)) {
+ struct ibv_sge *sge = &elt->sge;
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t lkey;
+
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = buf->data_len;
+ /* Retrieve memory region key for this memory pool. */
+ lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+ if (unlikely(lkey == (uint32_t)-1)) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR"
+ " association", (void *)txq);
+ /* Clean up Tx element. */
+ elt->buf = NULL;
+ goto stop;
+ }
+ /* Update element. */
+ elt->buf = buf;
+ if (txq->priv->vf)
+ rte_prefetch0((volatile void *)
+ (uintptr_t)addr);
+ RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+ sge->addr = addr;
+ sge->length = length;
+ sge->lkey = lkey;
+ sent_size += length;
+ } else {
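+			/*
+			 * Multi-segment (scattered) packets are not
+			 * supported by this function.
+			 */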
+ err = -1;
+ goto stop;
+ }
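+		/*
+		 * Payloads small enough are copied inline into the WQE,
+		 * sparing the HCA a DMA read.
+		 */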
+ if (sent_size <= txq->max_inline)
+ send_flags |= IBV_SEND_INLINE;
+ elts_head = elts_head_next;
+ /* Increment sent bytes counter. */
+ txq->stats.obytes += sent_size;
+ /* Set up WR. */
+ wr->sg_list = &elt->sge;
+ wr->num_sge = segs;
+ wr->opcode = IBV_WR_SEND;
+ wr->send_flags = send_flags;
+ *wr_next = wr;
+ wr_next = &wr->next;
+ }
+stop:
+ /* Take a shortcut if nothing must be sent. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Increment sent packets counter. */
+ txq->stats.opackets += i;
+ /* Ring QP doorbell. */
+ *wr_next = NULL;
+ assert(wr_head);
+ err = ibv_post_send(txq->qp, wr_head, &wr_bad);
+ if (unlikely(err)) {
+ uint64_t obytes = 0;
+ uint64_t opackets = 0;
+
+ /* Rewind bad WRs. */
+ while (wr_bad != NULL) {
+ int j;
+
+ /* Force completion request if one was lost. */
+ if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
+ elts_comp_cd = 1;
+ --elts_comp;
+ }
+ ++opackets;
+ for (j = 0; j < wr_bad->num_sge; ++j)
+ obytes += wr_bad->sg_list[j].length;
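+			/* Step elts_head back one entry, wrapping at zero. */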
+ elts_head = (elts_head ? elts_head : elts_n) - 1;
+ wr_bad = wr_bad->next;
+ }
+ txq->stats.opackets -= opackets;
+ txq->stats.obytes -= obytes;
+ i -= opackets;
+ DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
+ " (%" PRIu64 " bytes) rejected: %s",
+ (void *)txq,
+ opackets,
+ obytes,
+ (err <= -1) ? "Internal error" : strerror(err));
+ }
+ txq->elts_head = elts_head;
+ txq->elts_comp += elts_comp;
+ txq->elts_comp_cd = elts_comp_cd;
+ return i;
+}
+
+/**
+ * DPDK callback for Rx.
+ *
+ * This function does not support scattered packets.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct rxq *rxq = (struct rxq *)dpdk_rxq;
+ struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
+ const unsigned int elts_n = rxq->elts_n;
+ unsigned int elts_head = rxq->elts_head;
+ struct ibv_wc wcs[pkts_n];
+ struct ibv_recv_wr *wr_head = NULL;
+ struct ibv_recv_wr **wr_next = &wr_head;
+ struct ibv_recv_wr *wr_bad = NULL;
+ unsigned int i;
+ unsigned int pkts_ret = 0;
+ int ret;
+
+ ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
+ if (unlikely(ret == 0))
+ return 0;
+ if (unlikely(ret < 0)) {
+ DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
+ (void *)rxq, ret);
+ return 0;
+ }
+ assert(ret <= (int)pkts_n);
+ /* For each work completion. */
+ for (i = 0; i != (unsigned int)ret; ++i) {
+ struct ibv_wc *wc = &wcs[i];
+ struct rxq_elt *elt = &(*elts)[elts_head];
+ struct ibv_recv_wr *wr = &elt->wr;
+ uint32_t len = wc->byte_len;
+ struct rte_mbuf *seg = elt->buf;
+ struct rte_mbuf *rep;
+
+ /* Sanity checks. */
+ assert(wr->sg_list == &elt->sge);
+ assert(wr->num_sge == 1);
+ assert(elts_head < rxq->elts_n);
+ assert(rxq->elts_head < rxq->elts_n);
+ /*
+ * Fetch initial bytes of packet descriptor into a
+ * cacheline while allocating rep.
+ */
+ rte_mbuf_prefetch_part1(seg);
+ rte_mbuf_prefetch_part2(seg);
+ /* Link completed WRs together for repost. */
+ *wr_next = wr;
+ wr_next = &wr->next;
+ if (unlikely(wc->status != IBV_WC_SUCCESS)) {
+ /* Whatever, just repost the offending WR. */
+ DEBUG("rxq=%p: bad work completion status (%d): %s",
+ (void *)rxq, wc->status,
+ ibv_wc_status_str(wc->status));
+ /* Increment dropped packets counter. */
+ ++rxq->stats.idropped;
+ goto repost;
+ }
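+		/*
+		 * Allocate a replacement mbuf first so the received one is
+		 * returned to the application only when its ring slot can
+		 * be refilled.
+		 */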
+ rep = rte_mbuf_raw_alloc(rxq->mp);
+ if (unlikely(rep == NULL)) {
+ /*
+ * Unable to allocate a replacement mbuf,
+ * repost WR.
+ */
+ DEBUG("rxq=%p: can't allocate a new mbuf",
+ (void *)rxq);
+ /* Increase out of memory counters. */
+ ++rxq->stats.rx_nombuf;
+ ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
+ goto repost;
+ }
+ /* Reconfigure sge to use rep instead of seg. */
+ elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
+ assert(elt->sge.lkey == rxq->mr->lkey);
+ elt->buf = rep;
+ /* Update seg information. */
+ seg->data_off = RTE_PKTMBUF_HEADROOM;
+ seg->nb_segs = 1;
+ seg->port = rxq->port_id;
+ seg->next = NULL;
+ seg->pkt_len = len;
+ seg->data_len = len;
+ seg->packet_type = 0;
+ seg->ol_flags = 0;
+ /* Return packet. */
+ *(pkts++) = seg;
+ ++pkts_ret;
+ /* Increase bytes counter. */
+ rxq->stats.ibytes += len;
+repost:
+ if (++elts_head >= elts_n)
+ elts_head = 0;
+ continue;
+ }
+ if (unlikely(i == 0))
+ return 0;
+ /* Repost WRs. */
+ *wr_next = NULL;
+ assert(wr_head);
+ ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
+ if (unlikely(ret)) {
+ /* Inability to repost WRs is fatal. */
+ DEBUG("%p: recv_burst(): failed (ret=%d)",
+ (void *)rxq->priv,
+ ret);
+ abort();
+ }
+ rxq->elts_head = elts_head;
+ /* Increase packets counter. */
+ rxq->stats.ipackets += pkts_ret;
+ return pkts_ret;
+}
+
+/**
+ * Dummy DPDK callback for Tx.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_txq
+ * Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ (void)dpdk_txq;
+ (void)pkts;
+ (void)pkts_n;
+ return 0;
+}
+
+/**
+ * Dummy DPDK callback for Rx.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ (void)dpdk_rxq;
+ (void)pkts;
+ (void)pkts_n;
+ return 0;
+}
unsigned int socket; /**< CPU socket ID for allocations. */
};
+/* mlx4_rxtx.c */
+
+uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
+uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
+
#endif /* MLX4_RXTX_H_ */