RSS implementation with parent/child QPs comes from mlx4 and is temporary.
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
#
CONFIG_RTE_LIBRTE_MLX5_PMD=n
CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
+CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N=4
+CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE=0
+CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE=8
#
# Compile burst-oriented Broadcom PMD driver
#
CONFIG_RTE_LIBRTE_MLX5_PMD=n
CONFIG_RTE_LIBRTE_MLX5_DEBUG=n
+CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N=4
+CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE=0
+CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE=8
#
# Compile burst-oriented Broadcom PMD driver
# Sources.
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_txq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_ethdev.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mac.c
CFLAGS += -DNDEBUG -UPEDANTIC
endif
+ifdef CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N
+CFLAGS += -DMLX5_PMD_SGE_WR_N=$(CONFIG_RTE_LIBRTE_MLX5_SGE_WR_N)
+endif
+
+ifdef CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE
+CFLAGS += -DMLX5_PMD_MAX_INLINE=$(CONFIG_RTE_LIBRTE_MLX5_MAX_INLINE)
+endif
+
+ifdef CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE
+CFLAGS += -DMLX5_PMD_TX_MP_CACHE=$(CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE)
+endif
+
include $(RTE_SDK)/mk/rte.lib.mk
# Generate and clean-up mlx5_autoconf.h.
#include "mlx5.h"
#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
/**
mlx5_dev_close(struct rte_eth_dev *dev)
{
struct priv *priv = dev->data->dev_private;
+ void *tmp;
+ unsigned int i;
priv_lock(priv);
DEBUG("%p: closing device \"%s\"",
(void *)dev,
((priv->ctx != NULL) ? priv->ctx->device->name : ""));
+ /* Prevent crashes when queues are still in use. */
+ dev->rx_pkt_burst = removed_rx_burst;
+ dev->tx_pkt_burst = removed_tx_burst;
+ if (priv->rxqs != NULL) {
+ /* XXX race condition if mlx5_rx_burst() is still running. */
+ usleep(1000);
+ for (i = 0; (i != priv->rxqs_n); ++i) {
+ tmp = (*priv->rxqs)[i];
+ if (tmp == NULL)
+ continue;
+ (*priv->rxqs)[i] = NULL;
+ rxq_cleanup(tmp);
+ rte_free(tmp);
+ }
+ priv->rxqs_n = 0;
+ priv->rxqs = NULL;
+ }
+ if (priv->txqs != NULL) {
+ /* XXX race condition if mlx5_tx_burst() is still running. */
+ usleep(1000);
+ for (i = 0; (i != priv->txqs_n); ++i) {
+ tmp = (*priv->txqs)[i];
+ if (tmp == NULL)
+ continue;
+ (*priv->txqs)[i] = NULL;
+ txq_cleanup(tmp);
+ rte_free(tmp);
+ }
+ priv->txqs_n = 0;
+ priv->txqs = NULL;
+ }
+ if (priv->rss)
+ rxq_cleanup(&priv->rxq_parent);
if (priv->pd != NULL) {
assert(priv->ctx != NULL);
claim_zero(ibv_dealloc_pd(priv->pd));
static const struct eth_dev_ops mlx5_dev_ops = {
.dev_close = mlx5_dev_close,
+ .rx_queue_setup = mlx5_rx_queue_setup,
+ .tx_queue_setup = mlx5_tx_queue_setup,
+ .rx_queue_release = mlx5_rx_queue_release,
+ .tx_queue_release = mlx5_tx_queue_release,
};
static struct {
#endif
#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
unsigned int rss:1; /* RSS is enabled. */
unsigned int vf:1; /* This is a VF device. */
unsigned int max_rss_tbl_sz; /* Maximum number of RSS queues. */
+ /* RX/TX queues. */
+ struct rxq rxq_parent; /* Parent queue when RSS is enabled. */
+ unsigned int rxqs_n; /* RX queues array size. */
+ unsigned int txqs_n; /* TX queues array size. */
+ struct rxq *(*rxqs)[]; /* RX queues. */
+ struct txq *(*txqs)[]; /* TX queues. */
rte_spinlock_t lock; /* Lock for control functions. */
};
+/*
+ * Work Request ID data type (64 bit).
+ *
+ * Overlays a structured view (data.id, data.offset) on the raw 64-bit
+ * work request identifier (raw). The RX queue setup code in this patch
+ * stores an element's index in the elts array in "id" and the distance
+ * between the mbuf structure and the start of its SGE data in "offset",
+ * so that both can be recovered from a WR ID alone.
+ */
+typedef union {
+ struct {
+ uint32_t id;
+ uint16_t offset;
+ } data;
+ uint64_t raw;
+} wr_id_t;
+
+/*
+ * Compile-time check.
+ *
+ * The array size expression evaluates to 1 when sizeof(wr_id_t) equals
+ * sizeof(uint64_t) and to -1 otherwise, so a size mismatch turns the
+ * declaration into an invalid negative-size array and breaks the build.
+ * The (void) casts merely reference the array and the function itself
+ * (presumably to avoid unused-symbol warnings); nothing happens at run
+ * time.
+ */
+static inline void wr_id_t_check(void)
+{
+ wr_id_t check[1 + (2 * -!(sizeof(wr_id_t) == sizeof(uint64_t)))];
+
+ (void)check;
+ (void)wr_id_t_check;
+}
+
/**
* Lock private structure to protect it from concurrent access in the
* control path.
/* Maximum number of simultaneous MAC addresses. */
#define MLX5_MAX_MAC_ADDRESSES 128
+/* Request send completion once in every 64 sends, might be less. */
+#define MLX5_PMD_TX_PER_COMP_REQ 64
+
+/* Maximum number of Scatter/Gather Elements per Work Request. */
+#ifndef MLX5_PMD_SGE_WR_N
+#define MLX5_PMD_SGE_WR_N 4
+#endif
+
+/* Maximum size for inline data. */
+#ifndef MLX5_PMD_MAX_INLINE
+#define MLX5_PMD_MAX_INLINE 0
+#endif
+
+/*
+ * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
+ * from which buffers are to be transmitted will have to be mapped by this
+ * driver to their own Memory Region (MR). This is a slow operation.
+ *
+ * This value is always 1 for RX queues.
+ */
+#ifndef MLX5_PMD_TX_MP_CACHE
+#define MLX5_PMD_TX_MP_CACHE 8
+#endif
+
#endif /* RTE_PMD_MLX5_DEFS_H_ */
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* DPDK headers don't like -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <rte_mbuf.h>
+#include <rte_malloc.h>
+#include <rte_ethdev.h>
+#include <rte_common.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_utils.h"
+#include "mlx5_defs.h"
+
+/**
+ * Allocate RX queue elements.
+ *
+ * Allocates an array of elts_n elements on rxq->socket, attaches one mbuf
+ * to every element and chains the receive work requests together through
+ * wr->next (the chain is terminated with NULL after the loop). On success
+ * the array is stored in rxq->elts.no_sp and rxq->elts_n / rxq->elts_head
+ * are updated. On failure, mbufs already attached to the array are freed
+ * together with the array itself.
+ *
+ * NOTE(review): elts_n is assumed to be nonzero; the terminating store
+ * after the loop indexes element (i - 1). The only caller visible here,
+ * rxq_setup(), rejects a descriptor count of 0 before calling.
+ *
+ * NOTE(review): the error path asserts pool == NULL, i.e. failures are
+ * apparently not expected when buffers come from a caller-provided pool.
+ * Confirm before passing a non-NULL pool.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param elts_n
+ * Number of elements to allocate.
+ * @param[in] pool
+ * If not NULL, fetch buffers from this array instead of allocating them
+ * with rte_pktmbuf_alloc().
+ *
+ * @return
+ * 0 on success, errno value on failure (ENOMEM when the array or an mbuf
+ * cannot be allocated, EOVERFLOW when index/offset do not fit the WR ID).
+ */
+static int
+rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
+{
+ unsigned int i;
+ struct rxq_elt (*elts)[elts_n] =
+ rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
+ rxq->socket);
+ int ret = 0;
+
+ if (elts == NULL) {
+ ERROR("%p: can't allocate packets array", (void *)rxq);
+ ret = ENOMEM;
+ goto error;
+ }
+ /* For each WR (packet): attach one mbuf and fill in WR and SGE. */
+ for (i = 0; (i != elts_n); ++i) {
+ struct rxq_elt *elt = &(*elts)[i];
+ struct ibv_recv_wr *wr = &elt->wr;
+ struct ibv_sge *sge = &(*elts)[i].sge;
+ struct rte_mbuf *buf;
+
+ if (pool != NULL) {
+ buf = *(pool++);
+ assert(buf != NULL);
+ rte_pktmbuf_reset(buf);
+ } else
+ buf = rte_pktmbuf_alloc(rxq->mp);
+ if (buf == NULL) {
+ assert(pool == NULL);
+ ERROR("%p: empty mbuf pool", (void *)rxq);
+ ret = ENOMEM;
+ goto error;
+ }
+ /* Configure WR. Work request ID contains its own index in
+ * the elts array and the offset between SGE buffer header and
+ * its data. The link to the next element is set for every
+ * element here; the last one is reset to NULL after the
+ * loop. */
+ WR_ID(wr->wr_id).id = i;
+ WR_ID(wr->wr_id).offset =
+ (((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) -
+ (uintptr_t)buf);
+ wr->next = &(*elts)[(i + 1)].wr;
+ wr->sg_list = sge;
+ wr->num_sge = 1;
+ /* Headroom is reserved by rte_pktmbuf_alloc(). */
+ assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
+ /* Buffer is supposed to be empty. */
+ assert(rte_pktmbuf_data_len(buf) == 0);
+ assert(rte_pktmbuf_pkt_len(buf) == 0);
+ /* sge->addr must be able to store a pointer. */
+ assert(sizeof(sge->addr) >= sizeof(uintptr_t));
+ /* SGE keeps its headroom. */
+ sge->addr = (uintptr_t)
+ ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
+ sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
+ sge->lkey = rxq->mr->lkey;
+ /* Redundant check for tailroom. */
+ assert(sge->length == rte_pktmbuf_tailroom(buf));
+ /* Make sure elts index and SGE mbuf pointer can be deduced
+ * from WR ID. On mismatch the SGE address is cleared so the
+ * error path below skips this element, and the mbuf is freed
+ * right here instead. */
+ if ((WR_ID(wr->wr_id).id != i) ||
+ ((void *)((uintptr_t)sge->addr -
+ WR_ID(wr->wr_id).offset) != buf)) {
+ ERROR("%p: cannot store index and offset in WR ID",
+ (void *)rxq);
+ sge->addr = 0;
+ rte_pktmbuf_free(buf);
+ ret = EOVERFLOW;
+ goto error;
+ }
+ }
+ /* The last WR pointer must be NULL. */
+ (*elts)[(i - 1)].wr.next = NULL;
+ DEBUG("%p: allocated and configured %u single-segment WRs",
+ (void *)rxq, elts_n);
+ rxq->elts_n = elts_n;
+ rxq->elts_head = 0;
+ rxq->elts.no_sp = elts;
+ assert(ret == 0);
+ return 0;
+error:
+ if (elts != NULL) {
+ assert(pool == NULL);
+ for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+ struct rxq_elt *elt = &(*elts)[i];
+ struct rte_mbuf *buf;
+
+ if (elt->sge.addr == 0)
+ continue;
+ assert(WR_ID(elt->wr.wr_id).id == i);
+ buf = (void *)((uintptr_t)elt->sge.addr -
+ WR_ID(elt->wr.wr_id).offset);
+ rte_pktmbuf_free_seg(buf);
+ }
+ rte_free(elts);
+ }
+ DEBUG("%p: failed, freed everything", (void *)rxq);
+ assert(ret > 0);
+ return ret;
+}
+
+/**
+ * Release the elements array of a RX queue.
+ *
+ * The array is detached from the queue first (rxq->elts_n and
+ * rxq->elts.no_sp are reset), then every mbuf still attached to an element
+ * is freed, and finally the array itself is released.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ */
+static void
+rxq_free_elts(struct rxq *rxq)
+{
+	unsigned int elts_n = rxq->elts_n;
+	struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp;
+	unsigned int idx;
+
+	DEBUG("%p: freeing WRs", (void *)rxq);
+	/* Detach the array from the queue before touching its content. */
+	rxq->elts.no_sp = NULL;
+	rxq->elts_n = 0;
+	if (elts == NULL)
+		return;
+	for (idx = 0; idx < RTE_DIM(*elts); idx++) {
+		struct rxq_elt *elt = &(*elts)[idx];
+
+		/* A null SGE address means no mbuf is attached. */
+		if (elt->sge.addr != 0) {
+			struct rte_mbuf *buf;
+
+			assert(WR_ID(elt->wr.wr_id).id == idx);
+			/* Recover the mbuf pointer from the SGE address. */
+			buf = (void *)((uintptr_t)elt->sge.addr -
+				       WR_ID(elt->wr.wr_id).offset);
+			rte_pktmbuf_free_seg(buf);
+		}
+	}
+	rte_free(elts);
+}
+
+/**
+ * Clean up a RX queue.
+ *
+ * Destroy objects, free allocated memory and reset the structure for reuse.
+ *
+ * Resources are released in this order: queue elements, QP and CQ
+ * interface families, QP, CQ, resource domain, memory region. Each one is
+ * only released when its pointer is not NULL, so the function can be used
+ * on a partially initialized queue. The structure is zeroed at the end.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ */
+void
+rxq_cleanup(struct rxq *rxq)
+{
+	struct ibv_exp_release_intf_params params;
+
+	DEBUG("cleaning up %p", (void *)rxq);
+	rxq_free_elts(rxq);
+	if (rxq->if_qp != NULL) {
+		assert(rxq->priv != NULL);
+		assert(rxq->priv->ctx != NULL);
+		assert(rxq->qp != NULL);
+		params = (struct ibv_exp_release_intf_params){
+			.comp_mask = 0,
+		};
+		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
+						rxq->if_qp,
+						&params));
+	}
+	if (rxq->if_cq != NULL) {
+		assert(rxq->priv != NULL);
+		assert(rxq->priv->ctx != NULL);
+		assert(rxq->cq != NULL);
+		params = (struct ibv_exp_release_intf_params){
+			.comp_mask = 0,
+		};
+		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
+						rxq->if_cq,
+						&params));
+	}
+	if (rxq->qp != NULL) {
+		claim_zero(ibv_destroy_qp(rxq->qp));
+	}
+	if (rxq->cq != NULL)
+		claim_zero(ibv_destroy_cq(rxq->cq));
+	if (rxq->rd != NULL) {
+		struct ibv_exp_destroy_res_domain_attr attr = {
+			.comp_mask = 0,
+		};
+
+		assert(rxq->priv != NULL);
+		assert(rxq->priv->ctx != NULL);
+		claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx,
+						      rxq->rd,
+						      &attr));
+	}
+	if (rxq->mr != NULL)
+		claim_zero(ibv_dereg_mr(rxq->mr));
+	/* Leave the structure in a known state so it can be set up again. */
+	memset(rxq, 0, sizeof(*rxq));
+}
+
+/**
+ * Create a raw packet Queue Pair for a RX queue (non-RSS case).
+ *
+ * The requested number of receive WRs and of SGEs per WR are capped by the
+ * limits reported in priv->device_attr.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param cq
+ *   Completion queue to associate with QP (used for both send and
+ *   receive).
+ * @param desc
+ *   Number of descriptors in QP (hint only).
+ * @param rd
+ *   Resource domain to associate with QP.
+ *
+ * @return
+ *   QP pointer or NULL in case of error.
+ */
+static struct ibv_qp *
+rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
+	     struct ibv_exp_res_domain *rd)
+{
+	struct ibv_exp_qp_init_attr attr = {
+		/* The same CQ serves the send and the receive queues. */
+		.send_cq = cq,
+		.recv_cq = cq,
+		.cap = {
+			/* Requested values, capped to device limits below. */
+			.max_recv_wr = desc,
+			.max_recv_sge = MLX5_PMD_SGE_WR_N,
+		},
+		.qp_type = IBV_QPT_RAW_PACKET,
+		.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
+			      IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
+		.pd = priv->pd,
+		.res_domain = rd,
+	};
+
+	/* Max number of outstanding WRs. */
+	if (priv->device_attr.max_qp_wr < desc)
+		attr.cap.max_recv_wr = priv->device_attr.max_qp_wr;
+	/* Max number of scatter/gather elements in a WR. */
+	if (priv->device_attr.max_sge < MLX5_PMD_SGE_WR_N)
+		attr.cap.max_recv_sge = priv->device_attr.max_sge;
+	return ibv_exp_create_qp(priv->ctx, &attr);
+}
+
+#ifdef RSS_SUPPORT
+
+/**
+ * Create a raw packet Queue Pair belonging to a RSS QP group.
+ *
+ * The requested number of receive WRs and of SGEs per WR are capped by the
+ * limits reported in priv->device_attr. A parent QP is created with
+ * priv->rxqs_n RSS children and no TSS children; a child QP is attached to
+ * priv->rxq_parent.qp.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param cq
+ *   Completion queue to associate with QP (used for both send and
+ *   receive).
+ * @param desc
+ *   Number of descriptors in QP (hint only).
+ * @param parent
+ *   If nonzero, create a parent QP, otherwise a child.
+ * @param rd
+ *   Resource domain to associate with QP.
+ *
+ * @return
+ *   QP pointer or NULL in case of error.
+ */
+static struct ibv_qp *
+rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
+		 int parent, struct ibv_exp_res_domain *rd)
+{
+	struct ibv_exp_qp_init_attr attr = {
+		/* The same CQ serves the send and the receive queues. */
+		.send_cq = cq,
+		.recv_cq = cq,
+		.cap = {
+			/* Requested values, capped to device limits below. */
+			.max_recv_wr = desc,
+			.max_recv_sge = MLX5_PMD_SGE_WR_N,
+		},
+		.qp_type = IBV_QPT_RAW_PACKET,
+		.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
+			      IBV_EXP_QP_INIT_ATTR_RES_DOMAIN |
+			      IBV_EXP_QP_INIT_ATTR_QPG),
+		.pd = priv->pd,
+		.res_domain = rd,
+	};
+
+	/* Max number of outstanding WRs. */
+	if (priv->device_attr.max_qp_wr < desc)
+		attr.cap.max_recv_wr = priv->device_attr.max_qp_wr;
+	/* Max number of scatter/gather elements in a WR. */
+	if (priv->device_attr.max_sge < MLX5_PMD_SGE_WR_N)
+		attr.cap.max_recv_sge = priv->device_attr.max_sge;
+	if (!parent) {
+		attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX;
+		attr.qpg.qpg_parent = priv->rxq_parent.qp;
+		DEBUG("initializing child RSS queue");
+	} else {
+		attr.qpg.qpg_type = IBV_EXP_QPG_PARENT;
+		/* TSS isn't necessary. */
+		attr.qpg.parent_attrib.tss_child_count = 0;
+		attr.qpg.parent_attrib.rss_child_count = priv->rxqs_n;
+		DEBUG("initializing parent RSS queue");
+	}
+	return ibv_exp_create_qp(priv->ctx, &attr);
+}
+
+#endif /* RSS_SUPPORT */
+
+/**
+ * Configure a RX queue.
+ *
+ * All resources are created in a local template (tmpl). Only once every
+ * step has succeeded is the previous content of rxq cleaned up and
+ * replaced by the template; on failure the template is cleaned up and rxq
+ * is left untouched.
+ *
+ * When rxq is the RSS parent queue (&priv->rxq_parent), no memory region
+ * is registered and no elements are allocated or posted.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param desc
+ *   Number of descriptors to configure in queue. Must be a nonzero
+ *   multiple of MLX5_PMD_SGE_WR_N (ignored for the RSS parent queue).
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ *   Thresholds parameters (currently ignored).
+ * @param mp
+ *   Memory pool for buffer allocations.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+int
+rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
+	  unsigned int socket, const struct rte_eth_rxconf *conf,
+	  struct rte_mempool *mp)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct rxq tmpl = {
+		.priv = priv,
+		.mp = mp,
+		.socket = socket
+	};
+	struct ibv_exp_qp_attr mod;
+	union {
+		struct ibv_exp_query_intf_params params;
+		struct ibv_exp_cq_init_attr cq;
+		struct ibv_exp_res_domain_init_attr rd;
+	} attr;
+	enum ibv_exp_query_intf_status status;
+	struct ibv_recv_wr *bad_wr;
+	struct rte_mbuf *buf;
+	int ret = 0;
+	int parent = (rxq == &priv->rxq_parent);
+
+	(void)conf; /* Thresholds configuration (ignored). */
+	/*
+	 * If this is a parent queue, hardware must support RSS and
+	 * RSS must be enabled.
+	 */
+	assert((!parent) || ((priv->hw_rss) && (priv->rss)));
+	if (parent) {
+		/* Even if unused, ibv_create_cq() requires at least one
+		 * descriptor. */
+		desc = 1;
+		goto skip_mr;
+	}
+	if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
+		ERROR("%p: invalid number of RX descriptors (must be a"
+		      " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
+		return EINVAL;
+	}
+	/* Get mbuf length. */
+	buf = rte_pktmbuf_alloc(mp);
+	if (buf == NULL) {
+		ERROR("%p: unable to allocate mbuf", (void *)dev);
+		return ENOMEM;
+	}
+	tmpl.mb_len = buf->buf_len;
+	assert((rte_pktmbuf_headroom(buf) +
+		rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
+	assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
+	rte_pktmbuf_free(buf);
+	/* Use the entire RX mempool as the memory region. */
+	tmpl.mr = ibv_reg_mr(priv->pd,
+			     (void *)mp->elt_va_start,
+			     (mp->elt_va_end - mp->elt_va_start),
+			     (IBV_ACCESS_LOCAL_WRITE |
+			      IBV_ACCESS_REMOTE_WRITE));
+	if (tmpl.mr == NULL) {
+		ret = EINVAL;
+		ERROR("%p: MR creation failure: %s",
+		      (void *)dev, strerror(ret));
+		goto error;
+	}
+skip_mr:
+	attr.rd = (struct ibv_exp_res_domain_init_attr){
+		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
+			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
+		.thread_model = IBV_EXP_THREAD_SINGLE,
+		.msg_model = IBV_EXP_MSG_HIGH_BW,
+	};
+	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
+	if (tmpl.rd == NULL) {
+		ret = ENOMEM;
+		ERROR("%p: RD creation failure: %s",
+		      (void *)dev, strerror(ret));
+		goto error;
+	}
+	attr.cq = (struct ibv_exp_cq_init_attr){
+		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
+		.res_domain = tmpl.rd,
+	};
+	tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
+	if (tmpl.cq == NULL) {
+		ret = ENOMEM;
+		ERROR("%p: CQ creation failure: %s",
+		      (void *)dev, strerror(ret));
+		goto error;
+	}
+	DEBUG("priv->device_attr.max_qp_wr is %d",
+	      priv->device_attr.max_qp_wr);
+	DEBUG("priv->device_attr.max_sge is %d",
+	      priv->device_attr.max_sge);
+#ifdef RSS_SUPPORT
+	if (priv->rss)
+		tmpl.qp = rxq_setup_qp_rss(priv, tmpl.cq, desc, parent,
+					   tmpl.rd);
+	else
+#endif /* RSS_SUPPORT */
+		tmpl.qp = rxq_setup_qp(priv, tmpl.cq, desc, tmpl.rd);
+	if (tmpl.qp == NULL) {
+		ret = (errno ? errno : EINVAL);
+		ERROR("%p: QP creation failure: %s",
+		      (void *)dev, strerror(ret));
+		goto error;
+	}
+	mod = (struct ibv_exp_qp_attr){
+		/* Move the QP to this state. */
+		.qp_state = IBV_QPS_INIT,
+		/* Primary port number. */
+		.port_num = priv->port
+	};
+	ret = ibv_exp_modify_qp(tmpl.qp, &mod,
+				(IBV_EXP_QP_STATE |
+#ifdef RSS_SUPPORT
+				 (parent ? IBV_EXP_QP_GROUP_RSS : 0) |
+#endif /* RSS_SUPPORT */
+				 IBV_EXP_QP_PORT));
+	if (ret) {
+		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
+		      (void *)dev, strerror(ret));
+		goto error;
+	}
+	/* Allocate descriptors for RX queues, except for the RSS parent. */
+	if (parent)
+		goto skip_alloc;
+	ret = rxq_alloc_elts(&tmpl, desc, NULL);
+	if (ret) {
+		ERROR("%p: RXQ allocation failed: %s",
+		      (void *)dev, strerror(ret));
+		goto error;
+	}
+	ret = ibv_post_recv(tmpl.qp,
+			    &(*tmpl.elts.no_sp)[0].wr,
+			    &bad_wr);
+	if (ret) {
+		ERROR("%p: ibv_post_recv() failed for WR %p: %s",
+		      (void *)dev,
+		      (void *)bad_wr,
+		      strerror(ret));
+		goto error;
+	}
+skip_alloc:
+	mod = (struct ibv_exp_qp_attr){
+		.qp_state = IBV_QPS_RTR
+	};
+	ret = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
+	if (ret) {
+		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
+		      (void *)dev, strerror(ret));
+		goto error;
+	}
+	/* Save port ID. */
+	tmpl.port_id = dev->data->port_id;
+	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
+	attr.params = (struct ibv_exp_query_intf_params){
+		.intf_scope = IBV_EXP_INTF_GLOBAL,
+		.intf = IBV_EXP_INTF_CQ,
+		.obj = tmpl.cq,
+	};
+	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
+	if (tmpl.if_cq == NULL) {
+		ERROR("%p: CQ interface family query failed with status %d",
+		      (void *)dev, status);
+		/* ret is still 0 from the previous successful call; it must
+		 * be set or the failure would be reported as a success. */
+		ret = EINVAL;
+		goto error;
+	}
+	attr.params = (struct ibv_exp_query_intf_params){
+		.intf_scope = IBV_EXP_INTF_GLOBAL,
+		.intf = IBV_EXP_INTF_QP_BURST,
+		.obj = tmpl.qp,
+	};
+	tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
+	if (tmpl.if_qp == NULL) {
+		ERROR("%p: QP interface family query failed with status %d",
+		      (void *)dev, status);
+		/* Same as above: do not return 0 on failure. */
+		ret = EINVAL;
+		goto error;
+	}
+	/* Clean up rxq in case we're reinitializing it. */
+	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
+	rxq_cleanup(rxq);
+	*rxq = tmpl;
+	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
+	assert(ret == 0);
+	return 0;
+error:
+	rxq_cleanup(&tmpl);
+	assert(ret > 0);
+	return ret;
+}
+
+/**
+ * DPDK callback to configure a RX queue.
+ *
+ * If a queue already exists at the given index it is cleaned up and
+ * reused, unless the device is started, in which case -EEXIST is returned.
+ * Otherwise a new queue structure is allocated on the requested socket.
+ * On success the queue is stored in priv->rxqs[] and the device receive
+ * callback is set to mlx5_rx_burst(). On failure the queue structure is
+ * freed and its slot is left empty.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   RX queue index.
+ * @param desc
+ *   Number of descriptors to configure in queue.
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ *   Thresholds parameters.
+ * @param mp
+ *   Memory pool for buffer allocations.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+		    unsigned int socket, const struct rte_eth_rxconf *conf,
+		    struct rte_mempool *mp)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct rxq *rxq;
+	int ret;
+
+	priv_lock(priv);
+	DEBUG("%p: configuring queue %u for %u descriptors",
+	      (void *)dev, idx, desc);
+	if (idx >= priv->rxqs_n) {
+		ERROR("%p: queue index out of range (%u >= %u)",
+		      (void *)dev, idx, priv->rxqs_n);
+		priv_unlock(priv);
+		return -EOVERFLOW;
+	}
+	/* Only read the slot once idx is known to be valid and the lock is
+	 * held. */
+	rxq = (*priv->rxqs)[idx];
+	if (rxq != NULL) {
+		DEBUG("%p: reusing already allocated queue index %u (%p)",
+		      (void *)dev, idx, (void *)rxq);
+		if (priv->started) {
+			priv_unlock(priv);
+			return -EEXIST;
+		}
+		(*priv->rxqs)[idx] = NULL;
+		rxq_cleanup(rxq);
+	} else {
+		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
+		if (rxq == NULL) {
+			ERROR("%p: unable to allocate queue index %u",
+			      (void *)dev, idx);
+			priv_unlock(priv);
+			return -ENOMEM;
+		}
+	}
+	ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
+	if (ret)
+		rte_free(rxq);
+	else {
+		DEBUG("%p: adding RX queue %p to list",
+		      (void *)dev, (void *)rxq);
+		(*priv->rxqs)[idx] = rxq;
+		/* Update receive callback. */
+		dev->rx_pkt_burst = mlx5_rx_burst;
+	}
+	priv_unlock(priv);
+	/* rxq_setup() returns a positive errno value. */
+	return -ret;
+}
+
+/**
+ * DPDK callback to release a RX queue.
+ *
+ * Removes the queue from priv->rxqs[] if it is listed there, then cleans
+ * it up and frees it, all under the private structure lock. A NULL
+ * pointer is ignored.
+ *
+ * @param dpdk_rxq
+ *   Generic RX queue pointer.
+ */
+void
+mlx5_rx_queue_release(void *dpdk_rxq)
+{
+	struct rxq *rxq = (struct rxq *)dpdk_rxq;
+	struct priv *priv;
+	unsigned int idx;
+
+	if (rxq == NULL)
+		return;
+	priv = rxq->priv;
+	priv_lock(priv);
+	/* The RSS parent queue is not released through this callback. */
+	assert(rxq != &priv->rxq_parent);
+	for (idx = 0; idx < priv->rxqs_n; idx++) {
+		if ((*priv->rxqs)[idx] != rxq)
+			continue;
+		DEBUG("%p: removing RX queue %p from list",
+		      (void *)priv->dev, (void *)rxq);
+		(*priv->rxqs)[idx] = NULL;
+		break;
+	}
+	rxq_cleanup(rxq);
+	rte_free(rxq);
+	priv_unlock(priv);
+}
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* DPDK headers don't like -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+#include "mlx5.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_defs.h"
+
+/**
+ * Manage TX completions.
+ *
+ * mlx5_tx_burst() only requests a completion once every
+ * txq->elts_comp_cd_init sends, so each polled completion accounts for
+ * that many ring entries. This function polls up to txq->elts_comp
+ * completions, advances the ring tail accordingly (wrapping at
+ * txq->elts_n) and decreases the count of expected completions.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ *
+ * @return
+ *   0 on success, -1 on failure.
+ */
+static int
+txq_complete(struct txq *txq)
+{
+	const unsigned int pending = txq->elts_comp;
+	const unsigned int ring_size = txq->elts_n;
+	unsigned int tail = txq->elts_tail;
+	unsigned int remaining;
+	int polled;
+
+	/* Nothing to do when no completion is expected. */
+	if (unlikely(pending == 0))
+		return 0;
+#ifdef DEBUG_SEND
+	DEBUG("%p: processing %u work requests completions",
+	      (void *)txq, pending);
+#endif
+	polled = txq->if_cq->poll_cnt(txq->cq, pending);
+	if (unlikely(polled < 0)) {
+		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
+		      (void *)txq, polled);
+		return -1;
+	}
+	if (unlikely(polled == 0))
+		return 0;
+	remaining = pending - polled;
+	assert(remaining <= txq->elts_comp);
+	/*
+	 * WC status is not inspected: it is assumed successful since
+	 * nothing could be done about a failure anyway.
+	 */
+	tail += polled * txq->elts_comp_cd_init;
+	if (tail >= ring_size)
+		tail -= ring_size;
+	txq->elts_comp = remaining;
+	txq->elts_tail = tail;
+	return 0;
+}
+
+/**
+ * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
+ * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
+ * remove an entry first.
+ *
+ * Entries are kept in insertion order: the table is filled from index 0
+ * and the first entry with a NULL mp marks the end of the used part, so
+ * index 0 always holds the oldest entry. When the table is full, the
+ * oldest entry is deregistered, the remaining ones are shifted down by
+ * one and the new entry takes the last slot.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param[in] mp
+ *   Memory Pool for which a Memory Region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+static uint32_t
+txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+	unsigned int i;
+	struct ibv_mr *mr;
+
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (unlikely(txq->mp2mr[i].mp == NULL)) {
+			/* Unknown MP, add a new MR for it. */
+			break;
+		}
+		if (txq->mp2mr[i].mp == mp) {
+			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
+			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+			return txq->mp2mr[i].lkey;
+		}
+	}
+	/* Add a new entry, register MR first. */
+	DEBUG("%p: discovered new memory pool %p", (void *)txq, (void *)mp);
+	mr = ibv_reg_mr(txq->priv->pd,
+			(void *)mp->elt_va_start,
+			(mp->elt_va_end - mp->elt_va_start),
+			(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
+	if (unlikely(mr == NULL)) {
+		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+		      (void *)txq);
+		return (uint32_t)-1;
+	}
+	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
+		/* Table is full, remove oldest entry. */
+		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+		      (void *)txq);
+		--i;
+		/* The entry shifted out below is the one at index 0, so
+		 * that is the MR which must be deregistered. Deregistering
+		 * the last entry instead would leak the oldest MR and keep
+		 * a stale MR/lkey in the table. */
+		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+	}
+	/* Store the new entry. */
+	txq->mp2mr[i].mp = mp;
+	txq->mp2mr[i].mr = mr;
+	txq->mp2mr[i].lkey = mr->lkey;
+	DEBUG("%p: new MR lkey for MP %p: 0x%08" PRIu32,
+	      (void *)txq, (void *)mp, txq->mp2mr[i].lkey);
+	return txq->mp2mr[i].lkey;
+}
+
+/**
+ * DPDK callback for TX.
+ *
+ * Outline: txq_complete() is called first to account for finished sends,
+ * then the number of usable ring entries is computed (one entry is always
+ * left free). For each packet, the mbuf previously stored in the ring slot
+ * is freed, a signaled completion is requested once the elts_comp_cd
+ * countdown reaches zero (it is then reloaded from elts_comp_cd_init), the
+ * lkey for the mbuf's pool is looked up with txq_mp2mr() and the packet is
+ * queued through the QP burst interface. Only single-segment packets are
+ * handled; the loop stops at the first multi-segment packet or at the
+ * first error. Finally send_flush() is called and the ring state is
+ * written back to the queue.
+ *
+ * NOTE(review): when send_pending()/send_pending_inline() fails, elt->buf
+ * already points to the rejected packet although that packet is not
+ * counted in the return value. Confirm who owns the mbuf in that case.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct txq *txq = (struct txq *)dpdk_txq;
+ unsigned int elts_head = txq->elts_head;
+ const unsigned int elts_tail = txq->elts_tail;
+ const unsigned int elts_n = txq->elts_n;
+ unsigned int elts_comp_cd = txq->elts_comp_cd;
+ unsigned int elts_comp = 0;
+ unsigned int i;
+ unsigned int max;
+ int err;
+
+ assert(elts_comp_cd != 0);
+ txq_complete(txq);
+ max = (elts_n - (elts_head - elts_tail));
+ if (max > elts_n)
+ max -= elts_n;
+ assert(max >= 1);
+ assert(max <= elts_n);
+ /* Always leave one free entry in the ring. */
+ --max;
+ if (max == 0)
+ return 0;
+ if (max > pkts_n)
+ max = pkts_n;
+ for (i = 0; (i != max); ++i) {
+ struct rte_mbuf *buf = pkts[i];
+ unsigned int elts_head_next =
+ (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
+ struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
+ struct txq_elt *elt = &(*txq->elts)[elts_head];
+ unsigned int segs = NB_SEGS(buf);
+ uint32_t send_flags = 0;
+
+ /* Clean up old buffer (the mbuf chain left in this ring slot
+ * by an earlier send). */
+ if (likely(elt->buf != NULL)) {
+ struct rte_mbuf *tmp = elt->buf;
+
+ /* Faster than rte_pktmbuf_free(). */
+ do {
+ struct rte_mbuf *next = NEXT(tmp);
+
+ rte_pktmbuf_free_seg(tmp);
+ tmp = next;
+ } while (tmp != NULL);
+ }
+ /* Request TX completion when the countdown expires, then
+ * reload the countdown. */
+ if (unlikely(--elts_comp_cd == 0)) {
+ elts_comp_cd = txq->elts_comp_cd_init;
+ ++elts_comp;
+ send_flags |= IBV_EXP_QP_BURST_SIGNALED;
+ }
+ if (likely(segs == 1)) {
+ uintptr_t addr;
+ uint32_t length;
+ uint32_t lkey;
+
+ /* Retrieve buffer information. */
+ addr = rte_pktmbuf_mtod(buf, uintptr_t);
+ length = DATA_LEN(buf);
+ /* Retrieve Memory Region key for this memory pool. */
+ lkey = txq_mp2mr(txq, buf->pool);
+ if (unlikely(lkey == (uint32_t)-1)) {
+ /* MR does not exist. */
+ DEBUG("%p: unable to get MP <-> MR"
+ " association", (void *)txq);
+ /* Clean up TX element. */
+ elt->buf = NULL;
+ goto stop;
+ }
+ /* Update element. */
+ elt->buf = buf;
+ if (txq->priv->vf)
+ rte_prefetch0((volatile void *)
+ (uintptr_t)addr);
+ RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+ /* Put packet into send queue. */
+#if MLX5_PMD_MAX_INLINE > 0
+ if (length <= txq->max_inline)
+ err = txq->if_qp->send_pending_inline
+ (txq->qp,
+ (void *)addr,
+ length,
+ send_flags);
+ else
+#endif
+ err = txq->if_qp->send_pending
+ (txq->qp,
+ addr,
+ length,
+ lkey,
+ send_flags);
+ if (unlikely(err))
+ goto stop;
+ } else {
+ DEBUG("%p: TX scattered buffers support not"
+ " compiled in", (void *)txq);
+ goto stop;
+ }
+ elts_head = elts_head_next;
+ }
+stop:
+ /* Take a shortcut if nothing must be sent. Queue state is not
+ * written back in that case. */
+ if (unlikely(i == 0))
+ return 0;
+ /* Ring QP doorbell. */
+ err = txq->if_qp->send_flush(txq->qp);
+ if (unlikely(err)) {
+ /* A nonzero value is not supposed to be returned.
+ * Nothing can be done about it. */
+ DEBUG("%p: send_flush() failed with error %d",
+ (void *)txq, err);
+ }
+ txq->elts_head = elts_head;
+ txq->elts_comp += elts_comp;
+ txq->elts_comp_cd = elts_comp_cd;
+ return i;
+}
+
+/**
+ * DPDK callback for RX.
+ *
+ * Walks the RX ring from rxq->elts_head, polling the completion queue once
+ * per element. A completed element hands its current mbuf to the caller and
+ * is re-armed with a freshly allocated replacement. Every element visited
+ * (delivered or not) contributes one SGE to a local array, and all of them
+ * are reposted to the QP in a single recv_burst() call at the end.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct rxq *rxq = (struct rxq *)dpdk_rxq;
+	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
+	const unsigned int elts_n = rxq->elts_n;
+	unsigned int elts_head = rxq->elts_head;
+	struct ibv_sge sges[pkts_n];
+	unsigned int i;
+	unsigned int pkts_ret = 0;
+	int ret;
+
+	for (i = 0; (i != pkts_n); ++i) {
+		struct rxq_elt *elt = &(*elts)[elts_head];
+		struct ibv_recv_wr *wr = &elt->wr;
+		uint64_t wr_id = wr->wr_id;
+		unsigned int len;
+		/* The mbuf header sits "offset" bytes before the SGE address. */
+		struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr -
+			WR_ID(wr_id).offset);
+		struct rte_mbuf *rep;
+		uint32_t flags;
+
+		/* Sanity checks. */
+		assert(WR_ID(wr_id).id < rxq->elts_n);
+		assert(wr->sg_list == &elt->sge);
+		assert(wr->num_sge == 1);
+		assert(elts_head < rxq->elts_n);
+		assert(rxq->elts_head < rxq->elts_n);
+		/*
+		 * Fetch initial bytes of packet descriptor into a
+		 * cacheline while allocating rep.
+		 */
+		rte_prefetch0(seg);
+		rte_prefetch0(&seg->cacheline1);
+		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
+						    &flags);
+		if (unlikely(ret < 0)) {
+			struct ibv_wc wc;
+			int wcs_n;
+
+			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
+			      (void *)rxq, ret);
+			/* ibv_poll_cq() must be used in case of failure. */
+			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
+			if (unlikely(wcs_n == 0))
+				break;
+			if (unlikely(wcs_n < 0)) {
+				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
+				      (void *)rxq, wcs_n);
+				break;
+			}
+			assert(wcs_n == 1);
+			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
+				/* Whatever, just repost the offending WR. */
+				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
+				      " completion status (%d): %s",
+				      (void *)rxq, wc.wr_id, wc.status,
+				      ibv_wc_status_str(wc.status));
+				/* Add SGE to array for repost. */
+				sges[i] = elt->sge;
+				goto repost;
+			}
+			ret = wc.byte_len;
+		}
+		/* A zero length means no completion is pending: stop here. */
+		if (ret == 0)
+			break;
+		len = ret;
+		rep = __rte_mbuf_raw_alloc(rxq->mp);
+		if (unlikely(rep == NULL)) {
+			/*
+			 * Unable to allocate a replacement mbuf,
+			 * repost WR.
+			 */
+			DEBUG("rxq=%p, wr_id=%" PRIu32 ":"
+			      " can't allocate a new mbuf",
+			      (void *)rxq, WR_ID(wr_id).id);
+			/* Increment out of memory counters. */
+			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
+			/*
+			 * Add the unchanged SGE to array for repost. Without
+			 * this, slot i of sges[] stays uninitialized and
+			 * recv_burst() below would post garbage.
+			 */
+			sges[i] = elt->sge;
+			goto repost;
+		}
+
+		/* Reconfigure sge to use rep instead of seg. */
+		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
+		assert(elt->sge.lkey == rxq->mr->lkey);
+		WR_ID(wr->wr_id).offset =
+			(((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) -
+			 (uintptr_t)rep);
+		assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id);
+
+		/* Add SGE to array for repost. */
+		sges[i] = elt->sge;
+
+		/* Update seg information. */
+		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
+		NB_SEGS(seg) = 1;
+		PORT(seg) = rxq->port_id;
+		NEXT(seg) = NULL;
+		PKT_LEN(seg) = len;
+		DATA_LEN(seg) = len;
+
+		/* Return packet. */
+		*(pkts++) = seg;
+		++pkts_ret;
+repost:
+		/* Advance to the next ring element, wrapping at the end. */
+		if (++elts_head >= elts_n)
+			elts_head = 0;
+		continue;
+	}
+	if (unlikely(i == 0))
+		return 0;
+	/* Repost WRs. */
+#ifdef DEBUG_RECV
+	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
+#endif
+	ret = rxq->if_qp->recv_burst(rxq->qp, sges, i);
+	if (unlikely(ret)) {
+		/* Inability to repost WRs is fatal. */
+		DEBUG("%p: recv_burst(): failed (ret=%d)",
+		      (void *)rxq->priv,
+		      ret);
+		abort();
+	}
+	rxq->elts_head = elts_head;
+	return pkts_ret;
+}
+
+/**
+ * Dummy DPDK callback for TX.
+ *
+ * Stand-in for the real TX burst function while the queue must not be
+ * touched (unsafe control operations, or after an error). It ignores all
+ * of its arguments and reports that nothing was sent.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to TX queue structure (unused).
+ * @param[in] pkts
+ *   Packets to transmit (unused).
+ * @param pkts_n
+ *   Number of packets in array (unused).
+ *
+ * @return
+ *   Always 0: no packet is ever transmitted.
+ */
+uint16_t
+removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	const uint16_t sent = 0;
+
+	(void)pkts_n;
+	(void)pkts;
+	(void)dpdk_txq;
+	return sent;
+}
+
+/**
+ * Dummy DPDK callback for RX.
+ *
+ * Stand-in for the real RX burst function while the queue must not be
+ * touched (unsafe control operations, or after an error). It ignores all
+ * of its arguments and reports that nothing was received.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to RX queue structure (unused).
+ * @param[out] pkts
+ *   Array to store received packets (left untouched).
+ * @param pkts_n
+ *   Maximum number of packets in array (unused).
+ *
+ * @return
+ *   Always 0: no packet is ever received.
+ */
+uint16_t
+removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	const uint16_t received = 0;
+
+	(void)pkts_n;
+	(void)pkts;
+	(void)dpdk_rxq;
+	return received;
+}
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX5_RXTX_H_
+#define RTE_PMD_MLX5_RXTX_H_
+
+#include <stdint.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* DPDK headers don't like -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+#include "mlx5_utils.h"
+#include "mlx5.h"
+#include "mlx5_defs.h"
+
+/*
+ * RX element: one receive work request together with the single
+ * scatter/gather entry it points to (mlx5_rx_burst() asserts that
+ * wr.sg_list == &sge and wr.num_sge == 1).
+ */
+struct rxq_elt {
+ struct ibv_recv_wr wr; /* Work Request. */
+ struct ibv_sge sge; /* Scatter/Gather Element. */
+ /*
+ * No mbuf pointer is stored: mlx5_rx_burst() recovers it as
+ * sge.addr - WR_ID(wr.wr_id).offset.
+ */
+};
+
+struct priv;
+
+/*
+ * RX queue descriptor.
+ *
+ * Holds the Verbs objects and the ring of RX elements used by
+ * mlx5_rx_burst().
+ */
+struct rxq {
+ struct priv *priv; /* Back pointer to private data. */
+ struct rte_mempool *mp; /* Memory Pool for allocations. */
+ struct ibv_mr *mr; /* Memory Region (for mp). */
+ struct ibv_cq *cq; /* Completion Queue. */
+ struct ibv_qp *qp; /* Queue Pair. */
+ struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
+ struct ibv_exp_cq_family *if_cq; /* CQ interface. */
+ unsigned int port_id; /* Port ID for incoming packets. */
+ unsigned int elts_n; /* (*elts)[] length. */
+ unsigned int elts_head; /* Current index in (*elts)[]. */
+ union {
+ struct rxq_elt (*no_sp)[]; /* RX elements. */
+ } elts;
+ uint32_t mb_len; /* Length of a mp-issued mbuf. */
+ unsigned int socket; /* CPU socket ID for allocations. */
+ struct ibv_exp_res_domain *rd; /* Resource Domain. */
+};
+
+/* TX element: one slot of the TX ring. */
+struct txq_elt {
+ /*
+ * mbuf attached to this slot, or NULL when the slot is empty.
+ * txq_alloc_elts() initializes it to NULL; txq_free_elts() frees
+ * whatever is still attached.
+ */
+ struct rte_mbuf *buf;
+};
+
+/* Linear buffer type. It is used when transmitting buffers with too many
+ * segments that do not fit the hardware queue (see max_send_sge).
+ * Extra segments are copied (linearized) in such buffers, replacing the
+ * last SGE during TX.
+ * The size is arbitrary but large enough to hold a jumbo frame with
+ * 8 segments considering mbuf.buf_len is about 2048 bytes
+ * (8 * 2048 = 16384). txq_alloc_elts() allocates one such buffer per
+ * TX element. */
+typedef uint8_t linear_t[16384];
+
+/*
+ * TX queue descriptor.
+ *
+ * Built by txq_setup() and reset to all zeroes by txq_cleanup(). The ring
+ * fields (elts*, elts_linear, mr_linear) are filled by txq_alloc_elts().
+ */
+struct txq {
+ struct priv *priv; /* Back pointer to private data. */
+ struct {
+ struct rte_mempool *mp; /* Cached Memory Pool. */
+ struct ibv_mr *mr; /* Memory Region (for mp). */
+ uint32_t lkey; /* mr->lkey */
+ } mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
+ struct ibv_cq *cq; /* Completion Queue. */
+ struct ibv_qp *qp; /* Queue Pair. */
+ struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
+ struct ibv_exp_cq_family *if_cq; /* CQ interface. */
+#if MLX5_PMD_MAX_INLINE > 0
+ uint32_t max_inline; /* Max inline send size <= MLX5_PMD_MAX_INLINE. */
+#endif
+ unsigned int elts_n; /* (*elts)[] length. */
+ struct txq_elt (*elts)[]; /* TX elements. */
+ unsigned int elts_head; /* Current index in (*elts)[]. */
+ unsigned int elts_tail; /* First element awaiting completion. */
+ unsigned int elts_comp; /* Number of completion requests. */
+ unsigned int elts_comp_cd; /* Countdown for next completion request. */
+ unsigned int elts_comp_cd_init; /* Initial value for countdown. */
+ linear_t (*elts_linear)[]; /* Linearized buffers. */
+ struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
+ unsigned int socket; /* CPU socket ID for allocations. */
+ struct ibv_exp_res_domain *rd; /* Resource Domain. */
+};
+
+/* mlx5_rxq.c */
+
+void rxq_cleanup(struct rxq *);
+int rxq_setup(struct rte_eth_dev *, struct rxq *, uint16_t, unsigned int,
+ const struct rte_eth_rxconf *, struct rte_mempool *);
+int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
+ const struct rte_eth_rxconf *, struct rte_mempool *);
+void mlx5_rx_queue_release(void *);
+
+/* mlx5_txq.c: TX queue configuration and teardown. */
+
+void txq_cleanup(struct txq *);
+int mlx5_tx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
+ const struct rte_eth_txconf *);
+void mlx5_tx_queue_release(void *);
+
+/*
+ * mlx5_rxtx.c: burst callbacks. The removed_*() variants are no-op
+ * stand-ins that always return 0.
+ */
+
+uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
+uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
+uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
+
+#endif /* RTE_PMD_MLX5_RXTX_H_ */
--- /dev/null
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+
+/* Verbs header. */
+/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+/* DPDK headers don't like -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-pedantic"
+#endif
+#include <rte_mbuf.h>
+#include <rte_malloc.h>
+#include <rte_ethdev.h>
+#include <rte_common.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-pedantic"
+#endif
+
+#include "mlx5_utils.h"
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_autoconf.h"
+#include "mlx5_defs.h"
+
+/**
+ * Allocate TX queue elements.
+ *
+ * Allocates the TX element ring and one linear buffer per element on the
+ * queue's socket, registers the linear buffers as a memory region, and
+ * resets the ring indices and completion countdown in txq. On failure
+ * nothing is stored in txq and everything allocated here is released.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param elts_n
+ *   Number of elements to allocate.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+txq_alloc_elts(struct txq *txq, unsigned int elts_n)
+{
+	unsigned int i;
+	struct txq_elt (*elts)[elts_n] =
+		rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
+	linear_t (*elts_linear)[elts_n] =
+		rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0,
+				  txq->socket);
+	struct ibv_mr *mr_linear = NULL;
+	int ret = 0;
+
+	if ((elts == NULL) || (elts_linear == NULL)) {
+		ERROR("%p: can't allocate packets array", (void *)txq);
+		ret = ENOMEM;
+		goto error;
+	}
+	mr_linear =
+		ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear),
+			   (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
+	if (mr_linear == NULL) {
+		ERROR("%p: unable to configure MR, ibv_reg_mr() failed",
+		      (void *)txq);
+		ret = EINVAL;
+		goto error;
+	}
+	for (i = 0; (i != elts_n); ++i) {
+		struct txq_elt *elt = &(*elts)[i];
+
+		elt->buf = NULL;
+	}
+	DEBUG("%p: allocated and configured %u WRs", (void *)txq, elts_n);
+	txq->elts_n = elts_n;
+	txq->elts = elts;
+	txq->elts_head = 0;
+	txq->elts_tail = 0;
+	txq->elts_comp = 0;
+	/* Request send completion every MLX5_PMD_TX_PER_COMP_REQ packets or
+	 * at least 4 times per ring. */
+	txq->elts_comp_cd_init =
+		((MLX5_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
+		 MLX5_PMD_TX_PER_COMP_REQ : (elts_n / 4));
+	/*
+	 * Rings with fewer than 4 elements make (elts_n / 4) zero. The TX
+	 * path pre-decrements this unsigned countdown and compares it to
+	 * zero, so a zero start value would wrap around and almost never
+	 * request a completion. Always request at least one per packet.
+	 */
+	if (txq->elts_comp_cd_init == 0)
+		txq->elts_comp_cd_init = 1;
+	txq->elts_comp_cd = txq->elts_comp_cd_init;
+	txq->elts_linear = elts_linear;
+	txq->mr_linear = mr_linear;
+	assert(ret == 0);
+	return 0;
+error:
+	if (mr_linear != NULL)
+		claim_zero(ibv_dereg_mr(mr_linear));
+
+	/* rte_free() is called unconditionally: either pointer may be NULL. */
+	rte_free(elts_linear);
+	rte_free(elts);
+
+	DEBUG("%p: failed, freed everything", (void *)txq);
+	assert(ret > 0);
+	return ret;
+}
+
+/**
+ * Free TX queue elements.
+ *
+ * Detaches the element ring, the linear buffers and their memory region
+ * from the queue, then releases them along with any mbuf still attached
+ * to a ring slot.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ */
+static void
+txq_free_elts(struct txq *txq)
+{
+	const unsigned int count = txq->elts_n;
+	struct txq_elt (*ring)[] = txq->elts;
+	linear_t (*linear)[] = txq->elts_linear;
+	struct ibv_mr *linear_mr = txq->mr_linear;
+	unsigned int idx;
+
+	DEBUG("%p: freeing WRs", (void *)txq);
+	/* Clear the queue's references first; locals keep the old values. */
+	txq->elts_n = 0;
+	txq->elts = NULL;
+	txq->elts_linear = NULL;
+	txq->mr_linear = NULL;
+	if (linear_mr != NULL)
+		claim_zero(ibv_dereg_mr(linear_mr));
+
+	rte_free(linear);
+	if (ring == NULL)
+		return;
+	/* Release every mbuf that is still attached to a slot. */
+	for (idx = 0; idx < count; ++idx) {
+		struct rte_mbuf *pending = (*ring)[idx].buf;
+
+		if (pending != NULL)
+			rte_pktmbuf_free(pending);
+	}
+	rte_free(ring);
+}
+
+/**
+ * Clean up a TX queue.
+ *
+ * Destroy objects, free allocated memory and reset the structure for reuse.
+ *
+ * Teardown order: ring elements, QP and CQ interface families, QP, CQ,
+ * resource domain, then the memory regions cached in mp2mr[]. The
+ * structure is finally zeroed, so calling this on an already clean
+ * (all-zero) queue releases nothing.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ */
+void
+txq_cleanup(struct txq *txq)
+{
+	struct ibv_exp_release_intf_params params;
+	size_t i;
+
+	DEBUG("cleaning up %p", (void *)txq);
+	txq_free_elts(txq);
+	if (txq->if_qp != NULL) {
+		assert(txq->priv != NULL);
+		assert(txq->priv->ctx != NULL);
+		assert(txq->qp != NULL);
+		params = (struct ibv_exp_release_intf_params){
+			.comp_mask = 0,
+		};
+		claim_zero(ibv_exp_release_intf(txq->priv->ctx,
+						txq->if_qp,
+						&params));
+	}
+	if (txq->if_cq != NULL) {
+		assert(txq->priv != NULL);
+		assert(txq->priv->ctx != NULL);
+		assert(txq->cq != NULL);
+		params = (struct ibv_exp_release_intf_params){
+			.comp_mask = 0,
+		};
+		claim_zero(ibv_exp_release_intf(txq->priv->ctx,
+						txq->if_cq,
+						&params));
+	}
+	if (txq->qp != NULL)
+		claim_zero(ibv_destroy_qp(txq->qp));
+	if (txq->cq != NULL)
+		claim_zero(ibv_destroy_cq(txq->cq));
+	if (txq->rd != NULL) {
+		struct ibv_exp_destroy_res_domain_attr attr = {
+			.comp_mask = 0,
+		};
+
+		assert(txq->priv != NULL);
+		assert(txq->priv->ctx != NULL);
+		claim_zero(ibv_exp_destroy_res_domain(txq->priv->ctx,
+						      txq->rd,
+						      &attr));
+	}
+	/* The loop stops at the first entry whose mp is NULL. */
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (txq->mp2mr[i].mp == NULL)
+			break;
+		assert(txq->mp2mr[i].mr != NULL);
+		claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr));
+	}
+	memset(txq, 0, sizeof(*txq));
+}
+
+/**
+ * Configure a TX queue.
+ *
+ * All resources are created in a local template (tmpl). Only once every
+ * step has succeeded is the previous content of txq cleaned up and
+ * replaced by the template; on any failure the template is cleaned up
+ * and txq is left untouched.
+ *
+ * Steps, in order: resource domain, CQ, QP, QP to INIT, ring elements,
+ * QP to RTR then RTS, CQ interface family, QP burst interface family.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param desc
+ * Number of descriptors to configure in queue. Must be a nonzero
+ * multiple of MLX5_PMD_SGE_WR_N; it is divided by that value before use.
+ * @param socket
+ * NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ * Thresholds parameters (currently ignored).
+ *
+ * @return
+ * 0 on success, errno value on failure.
+ */
+static int
+txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
+ unsigned int socket, const struct rte_eth_txconf *conf)
+{
+ struct priv *priv = dev->data->dev_private;
+ /* Template queue; fields not listed here start out zeroed. */
+ struct txq tmpl = {
+ .priv = priv,
+ .socket = socket
+ };
+ /* One storage area reused for each Verbs call's attributes in turn. */
+ union {
+ struct ibv_exp_query_intf_params params;
+ struct ibv_exp_qp_init_attr init;
+ struct ibv_exp_res_domain_init_attr rd;
+ struct ibv_exp_cq_init_attr cq;
+ struct ibv_exp_qp_attr mod;
+ } attr;
+ enum ibv_exp_query_intf_status status;
+ int ret = 0;
+
+ (void)conf; /* Thresholds configuration (ignored). */
+ if ((desc == 0) || (desc % MLX5_PMD_SGE_WR_N)) {
+ ERROR("%p: invalid number of TX descriptors (must be a"
+ " multiple of %d)", (void *)dev, MLX5_PMD_SGE_WR_N);
+ return EINVAL;
+ }
+ desc /= MLX5_PMD_SGE_WR_N;
+ /* MRs will be registered in mp2mr[] later. */
+ attr.rd = (struct ibv_exp_res_domain_init_attr){
+ .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
+ IBV_EXP_RES_DOMAIN_MSG_MODEL),
+ .thread_model = IBV_EXP_THREAD_SINGLE,
+ .msg_model = IBV_EXP_MSG_HIGH_BW,
+ };
+ tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
+ if (tmpl.rd == NULL) {
+ ret = ENOMEM;
+ ERROR("%p: RD creation failure: %s",
+ (void *)dev, strerror(ret));
+ goto error;
+ }
+ attr.cq = (struct ibv_exp_cq_init_attr){
+ .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
+ .res_domain = tmpl.rd,
+ };
+ tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
+ if (tmpl.cq == NULL) {
+ ret = ENOMEM;
+ ERROR("%p: CQ creation failure: %s",
+ (void *)dev, strerror(ret));
+ goto error;
+ }
+ DEBUG("priv->device_attr.max_qp_wr is %d",
+ priv->device_attr.max_qp_wr);
+ DEBUG("priv->device_attr.max_sge is %d",
+ priv->device_attr.max_sge);
+ attr.init = (struct ibv_exp_qp_init_attr){
+ /* CQ to be associated with the send queue. */
+ .send_cq = tmpl.cq,
+ /* CQ to be associated with the receive queue. */
+ .recv_cq = tmpl.cq,
+ .cap = {
+ /* Max number of outstanding WRs. */
+ .max_send_wr = ((priv->device_attr.max_qp_wr < desc) ?
+ priv->device_attr.max_qp_wr :
+ desc),
+ /* Max number of scatter/gather elements in a WR. */
+ .max_send_sge = ((priv->device_attr.max_sge <
+ MLX5_PMD_SGE_WR_N) ?
+ priv->device_attr.max_sge :
+ MLX5_PMD_SGE_WR_N),
+#if MLX5_PMD_MAX_INLINE > 0
+ .max_inline_data = MLX5_PMD_MAX_INLINE,
+#endif
+ },
+ .qp_type = IBV_QPT_RAW_PACKET,
+ /* Do *NOT* enable this, completions events are managed per
+ * TX burst. */
+ .sq_sig_all = 0,
+ .pd = priv->pd,
+ .res_domain = tmpl.rd,
+ .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
+ IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
+ };
+ tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
+ if (tmpl.qp == NULL) {
+ /* Fall back to EINVAL when errno was not set. */
+ ret = (errno ? errno : EINVAL);
+ ERROR("%p: QP creation failure: %s",
+ (void *)dev, strerror(ret));
+ goto error;
+ }
+#if MLX5_PMD_MAX_INLINE > 0
+ /* ibv_create_qp() updates this value. */
+ tmpl.max_inline = attr.init.cap.max_inline_data;
+#endif
+ attr.mod = (struct ibv_exp_qp_attr){
+ /* Move the QP to this state. */
+ .qp_state = IBV_QPS_INIT,
+ /* Primary port number. */
+ .port_num = priv->port
+ };
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod,
+ (IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
+ if (ret) {
+ ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
+ (void *)dev, strerror(ret));
+ goto error;
+ }
+ /* Allocate the ring with the already divided descriptor count. */
+ ret = txq_alloc_elts(&tmpl, desc);
+ if (ret) {
+ ERROR("%p: TXQ allocation failed: %s",
+ (void *)dev, strerror(ret));
+ goto error;
+ }
+ attr.mod = (struct ibv_exp_qp_attr){
+ .qp_state = IBV_QPS_RTR
+ };
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
+ if (ret) {
+ ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
+ (void *)dev, strerror(ret));
+ goto error;
+ }
+ /* Same attribute structure, only the target state changes. */
+ attr.mod.qp_state = IBV_QPS_RTS;
+ ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE);
+ if (ret) {
+ ERROR("%p: QP state to IBV_QPS_RTS failed: %s",
+ (void *)dev, strerror(ret));
+ goto error;
+ }
+ attr.params = (struct ibv_exp_query_intf_params){
+ .intf_scope = IBV_EXP_INTF_GLOBAL,
+ .intf = IBV_EXP_INTF_CQ,
+ .obj = tmpl.cq,
+ };
+ tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
+ if (tmpl.if_cq == NULL) {
+ ret = EINVAL;
+ ERROR("%p: CQ interface family query failed with status %d",
+ (void *)dev, status);
+ goto error;
+ }
+ attr.params = (struct ibv_exp_query_intf_params){
+ .intf_scope = IBV_EXP_INTF_GLOBAL,
+ .intf = IBV_EXP_INTF_QP_BURST,
+ .obj = tmpl.qp,
+ };
+ tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
+ if (tmpl.if_qp == NULL) {
+ ret = EINVAL;
+ ERROR("%p: QP interface family query failed with status %d",
+ (void *)dev, status);
+ goto error;
+ }
+ /* Clean up txq in case we're reinitializing it. */
+ DEBUG("%p: cleaning-up old txq just in case", (void *)txq);
+ txq_cleanup(txq);
+ /* Commit: the template's resources now belong to txq. */
+ *txq = tmpl;
+ DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl);
+ assert(ret == 0);
+ return 0;
+error:
+ /* Release whatever the template acquired before the failure. */
+ txq_cleanup(&tmpl);
+ assert(ret > 0);
+ return ret;
+}
+
+/**
+ * DPDK callback to configure a TX queue.
+ *
+ * Runs with the private data lock held. An already allocated queue at
+ * this index is cleaned up and reused unless the device is started;
+ * otherwise a new queue structure is allocated on the requested socket.
+ * If configuration fails, the queue structure is freed and the index is
+ * left empty.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   TX queue index.
+ * @param desc
+ *   Number of descriptors to configure in queue.
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ *   Thresholds parameters.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+		    unsigned int socket, const struct rte_eth_txconf *conf)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct txq *txq;
+	int ret;
+
+	priv_lock(priv);
+	DEBUG("%p: configuring queue %u for %u descriptors",
+	      (void *)dev, idx, desc);
+	if (idx >= priv->txqs_n) {
+		ERROR("%p: queue index out of range (%u >= %u)",
+		      (void *)dev, idx, priv->txqs_n);
+		priv_unlock(priv);
+		return -EOVERFLOW;
+	}
+	/*
+	 * Look the queue up only now: idx is known to be in range and the
+	 * lock is held, so the array is never read out of bounds.
+	 */
+	txq = (*priv->txqs)[idx];
+	if (txq != NULL) {
+		DEBUG("%p: reusing already allocated queue index %u (%p)",
+		      (void *)dev, idx, (void *)txq);
+		if (priv->started) {
+			priv_unlock(priv);
+			return -EEXIST;
+		}
+		(*priv->txqs)[idx] = NULL;
+		txq_cleanup(txq);
+	} else {
+		txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, socket);
+		if (txq == NULL) {
+			ERROR("%p: unable to allocate queue index %u",
+			      (void *)dev, idx);
+			priv_unlock(priv);
+			return -ENOMEM;
+		}
+	}
+	ret = txq_setup(dev, txq, desc, socket, conf);
+	if (ret)
+		rte_free(txq);
+	else {
+		DEBUG("%p: adding TX queue %p to list",
+		      (void *)dev, (void *)txq);
+		(*priv->txqs)[idx] = txq;
+		/* Update send callback. */
+		dev->tx_pkt_burst = mlx5_tx_burst;
+	}
+	priv_unlock(priv);
+	/* txq_setup() reports positive errno values; callers expect negative. */
+	return -ret;
+}
+
+/**
+ * DPDK callback to release a TX queue.
+ *
+ * Unregisters the queue from its device's queue table (when present),
+ * cleans it up and frees it, all under the private data lock. A NULL
+ * pointer is ignored.
+ *
+ * @param dpdk_txq
+ *   Generic TX queue pointer.
+ */
+void
+mlx5_tx_queue_release(void *dpdk_txq)
+{
+	struct txq *queue = (struct txq *)dpdk_txq;
+	struct priv *owner;
+	unsigned int slot;
+
+	if (queue == NULL)
+		return;
+	owner = queue->priv;
+	priv_lock(owner);
+	/* Drop the first table entry that refers to this queue, if any. */
+	for (slot = 0; slot < owner->txqs_n; ++slot) {
+		if ((*owner->txqs)[slot] != queue)
+			continue;
+		DEBUG("%p: removing TX queue %p from list",
+		      (void *)owner->dev, (void *)queue);
+		(*owner->txqs)[slot] = NULL;
+		break;
+	}
+	txq_cleanup(queue);
+	rte_free(queue);
+	priv_unlock(owner);
+}
#define WARN(...) PMD_DRV_LOG(WARNING, __VA_ARGS__)
#define ERROR(...) PMD_DRV_LOG(ERR, __VA_ARGS__)
+/*
+ * Convenience macros for accessing mbuf fields.
+ *
+ * Each one expands to the named field of the mbuf pointed to by (m) and
+ * can be assigned to (e.g. NEXT(seg) = NULL); SET_DATA_OFF() performs
+ * the assignment to data_off itself.
+ */
+#define NEXT(m) ((m)->next)
+#define DATA_LEN(m) ((m)->data_len)
+#define PKT_LEN(m) ((m)->pkt_len)
+#define DATA_OFF(m) ((m)->data_off)
+#define SET_DATA_OFF(m, o) ((m)->data_off = (o))
+#define NB_SEGS(m) ((m)->nb_segs)
+#define PORT(m) ((m)->port)
+
/* Allocate a buffer on the stack and fill it with a printf format string. */
#define MKSTR(name, ...) \
char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \
\
snprintf(name, sizeof(name), __VA_ARGS__)
+/*
+ * View the object (o) as a wr_id_t and access its "data" member; the RX
+ * path uses the resulting .id and .offset fields. (o) must be an lvalue
+ * because its address is taken.
+ * NOTE(review): wr_id_t is declared elsewhere; confirm it has the same size
+ * as the 64-bit wr_id it is overlaid on.
+ */
+#define WR_ID(o) (((wr_id_t *)&(o))->data)
+
#endif /* RTE_PMD_MLX5_UTILS_H_ */