#pragma GCC diagnostic error "-Wpedantic"
#endif
+#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_errno.h>
#include <rte_ethdev.h>
static int
mlx4_rxq_alloc_elts(struct rxq *rxq)
{
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
+ const uint32_t elts_n = 1 << rxq->elts_n;
+ const uint32_t sges_n = 1 << rxq->sges_n;
+ struct rte_mbuf *(*elts)[elts_n] = rxq->elts;
unsigned int i;
- /* For each WR (packet). */
+ assert(rte_is_power_of_2(elts_n));
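+ /* Allocate one mbuf per ring entry and point the corresponding RQ scatter entry at it. */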
for (i = 0; i != RTE_DIM(*elts); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct ibv_recv_wr *wr = &elt->wr;
- struct ibv_sge *sge = &(*elts)[i].sge;
+ volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[i];
struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
if (buf == NULL) {
while (i--) {
- rte_pktmbuf_free_seg((*elts)[i].buf);
- (*elts)[i].buf = NULL;
+ rte_pktmbuf_free_seg((*elts)[i]);
+ (*elts)[i] = NULL;
}
rte_errno = ENOMEM;
return -rte_errno;
}
- elt->buf = buf;
- wr->next = &(*elts)[(i + 1)].wr;
- wr->sg_list = sge;
- wr->num_sge = 1;
/* Headroom is reserved by rte_pktmbuf_alloc(). */
assert(buf->data_off == RTE_PKTMBUF_HEADROOM);
/* Buffer is supposed to be empty. */
assert(rte_pktmbuf_data_len(buf) == 0);
assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- /* SGE keeps its headroom. */
- sge->addr = (uintptr_t)
- ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
- sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
- sge->lkey = rxq->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
+ /* Only the first segment keeps headroom. */
+ if (i % sges_n)
+ buf->data_off = 0;
+ buf->port = rxq->port_id;
+ buf->data_len = rte_pktmbuf_tailroom(buf);
+ buf->pkt_len = rte_pktmbuf_tailroom(buf);
+ buf->nb_segs = 1;
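+ /* Fill the NIC's scatter entry: buffer address, byte count and lkey, in big-endian. */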
+ *scat = (struct mlx4_wqe_data_seg){
+ .addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
+ uintptr_t)),
+ .byte_count = rte_cpu_to_be_32(buf->data_len),
+ .lkey = rte_cpu_to_be_32(rxq->mr->lkey),
+ };
+ (*elts)[i] = buf;
}
- /* The last WR pointer must be NULL. */
- (*elts)[(i - 1)].wr.next = NULL;
+ DEBUG("%p: allocated and configured %u segments (max %u packets)",
+ (void *)rxq, elts_n, elts_n / sges_n);
return 0;
}
mlx4_rxq_free_elts(struct rxq *rxq)
{
unsigned int i;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
+ struct rte_mbuf *(*elts)[1 << rxq->elts_n] = rxq->elts;
- DEBUG("%p: freeing WRs", (void *)rxq);
+ DEBUG("%p: freeing Rx queue elements", (void *)rxq);
for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- if (!(*elts)[i].buf)
+ if (!(*elts)[i])
continue;
- rte_pktmbuf_free_seg((*elts)[i].buf);
- (*elts)[i].buf = NULL;
+ rte_pktmbuf_free_seg((*elts)[i]);
+ (*elts)[i] = NULL;
}
}
struct rte_mempool *mp)
{
struct priv *priv = dev->data->dev_private;
+ struct mlx4dv_obj mlxdv;
+ struct mlx4dv_rwq dv_rwq;
+ struct mlx4dv_cq dv_cq;
uint32_t mb_len = rte_pktmbuf_data_room_size(mp);
- struct rxq_elt (*elts)[desc];
+ struct rte_mbuf *(*elts)[rte_align32pow2(desc)];
struct rte_flow_error error;
struct rxq *rxq;
struct mlx4_malloc_vec vec[] = {
ERROR("%p: invalid number of Rx descriptors", (void *)dev);
return -rte_errno;
}
+ if (desc != RTE_DIM(*elts)) {
+ desc = RTE_DIM(*elts);
+ WARN("%p: increased number of descriptors in Rx queue %u"
+ " to the next power of two (%u)",
+ (void *)dev, idx, desc);
+ }
/* Allocate and initialize Rx queue. */
mlx4_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket);
if (!rxq) {
.priv = priv,
.mp = mp,
.port_id = dev->data->port_id,
- .elts_n = desc,
- .elts_head = 0,
+ .sges_n = 0,
+ .elts_n = rte_log2_u32(desc),
.elts = elts,
.stats.idx = idx,
.socket = socket,
(mb_len - RTE_PKTMBUF_HEADROOM)) {
;
} else if (dev->data->dev_conf.rxmode.enable_scatter) {
- WARN("%p: scattered mode has been requested but is"
- " not supported, this may lead to packet loss",
- (void *)dev);
+ uint32_t size =
+ RTE_PKTMBUF_HEADROOM +
+ dev->data->dev_conf.rxmode.max_rx_pkt_len;
+ uint32_t sges_n;
+
+ /*
+ * Determine the number of SGEs needed for a full packet
+ * and round it to the next power of two.
+ */
+ sges_n = rte_log2_u32((size / mb_len) + !!(size % mb_len));
+ rxq->sges_n = sges_n;
+ /* Make sure sges_n did not overflow. */
+ size = mb_len * (1 << rxq->sges_n);
+ size -= RTE_PKTMBUF_HEADROOM;
+ if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+ rte_errno = EOVERFLOW;
+ ERROR("%p: too many SGEs (%u) needed to handle"
+ " requested maximum packet size %u",
+ (void *)dev,
+ 1 << sges_n,
+ dev->data->dev_conf.rxmode.max_rx_pkt_len);
+ goto error;
+ }
} else {
WARN("%p: the requested maximum Rx packet size (%u) is"
" larger than a single mbuf (%u) and scattered"
dev->data->dev_conf.rxmode.max_rx_pkt_len,
mb_len - RTE_PKTMBUF_HEADROOM);
}
+ DEBUG("%p: maximum number of segments per packet: %u",
+ (void *)dev, 1 << rxq->sges_n);
+ if (desc % (1 << rxq->sges_n)) {
+ rte_errno = EINVAL;
+ ERROR("%p: number of Rx queue descriptors (%u) is not a"
+ " multiple of maximum segments per packet (%u)",
+ (void *)dev,
+ desc,
+ 1 << rxq->sges_n);
+ goto error;
+ }
/* Use the entire Rx mempool as the memory region. */
rxq->mr = mlx4_mp2mr(priv->pd, mp);
if (!rxq->mr) {
goto error;
}
}
- rxq->cq = ibv_create_cq(priv->ctx, desc, NULL, rxq->channel, 0);
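+ /* One CQE is generated per packet, not per segment, so desc >> sges_n entries suffice. */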
+ rxq->cq = ibv_create_cq(priv->ctx, desc >> rxq->sges_n, NULL,
+ rxq->channel, 0);
if (!rxq->cq) {
rte_errno = ENOMEM;
ERROR("%p: CQ creation failure: %s",
(priv->ctx,
&(struct ibv_wq_init_attr){
.wq_type = IBV_WQT_RQ,
- .max_wr = RTE_MIN(priv->device_attr.max_qp_wr, desc),
- .max_sge = 1,
+ .max_wr = desc >> rxq->sges_n,
+ .max_sge = 1 << rxq->sges_n,
.pd = priv->pd,
.cq = rxq->cq,
});
(void *)dev, strerror(rte_errno));
goto error;
}
- ret = mlx4_rxq_alloc_elts(rxq);
+ /* Retrieve device queue information. */
+ mlxdv.cq.in = rxq->cq;
+ mlxdv.cq.out = &dv_cq;
+ mlxdv.rwq.in = rxq->wq;
+ mlxdv.rwq.out = &dv_rwq;
+ ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_RWQ | MLX4DV_OBJ_CQ);
if (ret) {
- ERROR("%p: RXQ allocation failed: %s",
- (void *)dev, strerror(rte_errno));
+ rte_errno = EINVAL;
+ ERROR("%p: failed to obtain device information", (void *)dev);
goto error;
}
- ret = ibv_post_wq_recv(rxq->wq, &(*rxq->elts)[0].wr,
- &(struct ibv_recv_wr *){ NULL });
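+ /* Cache the direct verbs resources (RQ ring, its doorbell, CQ ring) used by the data path. */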
+ rxq->wqes =
+ (volatile struct mlx4_wqe_data_seg (*)[])
+ ((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset);
+ rxq->rq_db = dv_rwq.rdb;
+ rxq->rq_ci = 0;
+ rxq->mcq.buf = dv_cq.buf.buf;
+ rxq->mcq.cqe_cnt = dv_cq.cqe_cnt;
+ rxq->mcq.set_ci_db = dv_cq.set_ci_db;
+ rxq->mcq.cqe_64 = (dv_cq.cqe_size & 64) ? 1 : 0;
+ ret = mlx4_rxq_alloc_elts(rxq);
if (ret) {
- rte_errno = ret;
- ERROR("%p: ibv_post_recv() failed: %s",
- (void *)dev,
- strerror(rte_errno));
+ ERROR("%p: RXQ allocation failed: %s",
+ (void *)dev, strerror(rte_errno));
goto error;
}
DEBUG("%p: adding Rx queue %p to list", (void *)dev, (void *)rxq);
dev->data->rx_queues[idx] = rxq;
/* Enable associated flows. */
ret = mlx4_flow_sync(priv, &error);
- if (!ret)
+ if (!ret) {
+ /* Update doorbell counter. */
+ rxq->rq_ci = desc >> rxq->sges_n;
+ rte_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
return 0;
+ }
ERROR("cannot re-attach flow rules to queue %u"
" (code %d, \"%s\"), flow error type %d, cause %p, message: %s",
idx, -ret, strerror(-ret), error.type, error.cause,
}
/**
- * DPDK callback for Rx.
+ * Poll one CQE from the CQ.
*
- * The following function doesn't manage scattered packets.
+ * @param rxq
+ * Pointer to the receive queue structure.
+ * @param[out] out
+ * CQE that has just been polled.
+ *
+ * @return
+ * Number of bytes reported by the CQE (the packet length), 0 if no completion is available.
+ */
+static unsigned int
+mlx4_cq_poll_one(struct rxq *rxq, struct mlx4_cqe **out)
+{
+ int ret = 0;
+ struct mlx4_cqe *cqe = NULL;
+ struct mlx4_cq *cq = &rxq->mcq;
+
+ cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
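+ /* The ownership bit toggles on every ring wrap; a parity mismatch with cons_index means the CQE is still owned by the HW. */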
+ if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(cq->cons_index & cq->cqe_cnt))
+ goto out;
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rte_rmb();
+ assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
+ assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
+ MLX4_CQE_OPCODE_ERROR);
+ ret = rte_be_to_cpu_32(cqe->byte_cnt);
+ ++cq->cons_index;
+out:
+ *out = cqe;
+ return ret;
+}
+
+/**
+ * DPDK callback for Rx with scattered packets support.
*
* @param dpdk_rxq
* Generic pointer to Rx queue structure.
uint16_t
mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- struct ibv_wc wcs[pkts_n];
- struct ibv_recv_wr *wr_head = NULL;
- struct ibv_recv_wr **wr_next = &wr_head;
- struct ibv_recv_wr *wr_bad = NULL;
- unsigned int i;
- unsigned int pkts_ret = 0;
- int ret;
+ struct rxq *rxq = dpdk_rxq;
+ const uint32_t wr_cnt = (1 << rxq->elts_n) - 1;
+ const uint16_t sges_n = rxq->sges_n;
+ struct rte_mbuf *pkt = NULL;
+ struct rte_mbuf *seg = NULL;
+ unsigned int i = 0;
+ uint32_t rq_ci = rxq->rq_ci << sges_n;
+ int len = 0;
- ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
- if (unlikely(ret == 0))
- return 0;
- if (unlikely(ret < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
- (void *)rxq, ret);
- return 0;
- }
- assert(ret <= (int)pkts_n);
- /* For each work completion. */
- for (i = 0; i != (unsigned int)ret; ++i) {
- struct ibv_wc *wc = &wcs[i];
- struct rxq_elt *elt = &(*elts)[elts_head];
- struct ibv_recv_wr *wr = &elt->wr;
- uint32_t len = wc->byte_len;
- struct rte_mbuf *seg = elt->buf;
- struct rte_mbuf *rep;
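+ /* Each iteration consumes one RQ entry (one segment); a packet may span several iterations. */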
+ while (pkts_n) {
+ struct mlx4_cqe *cqe;
+ uint32_t idx = rq_ci & wr_cnt;
+ struct rte_mbuf *rep = (*rxq->elts)[idx];
+ volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];
- /* Sanity checks. */
- assert(wr->sg_list == &elt->sge);
- assert(wr->num_sge == 1);
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_mbuf_prefetch_part1(seg);
- rte_mbuf_prefetch_part2(seg);
- /* Link completed WRs together for repost. */
- *wr_next = wr;
- wr_next = &wr->next;
- if (unlikely(wc->status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p: bad work completion status (%d): %s",
- (void *)rxq, wc->status,
- ibv_wc_status_str(wc->status));
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
- goto repost;
- }
+ /* Update the 'next' pointer of the previous segment. */
+ if (pkt)
+ seg->next = rep;
+ seg = rep;
+ rte_prefetch0(seg);
+ rte_prefetch0(scat);
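+ /* Allocate a replacement mbuf up front so the ring entry is never left empty. */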
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p: can't allocate a new mbuf",
- (void *)rxq);
- /* Increase out of memory counters. */
++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
+ if (!pkt) {
+ /*
+ * No buffers before we even started,
+ * bail out silently.
+ */
+ break;
+ }
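+ /* Drop the partially assembled packet; its earlier segments already have replacements in the ring. */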
+ while (pkt != seg) {
+ assert(pkt != (*rxq->elts)[idx]);
+ rep = pkt->next;
+ pkt->next = NULL;
+ pkt->nb_segs = 1;
+ rte_mbuf_raw_free(pkt);
+ pkt = rep;
+ }
+ break;
+ }
+ if (!pkt) {
+ /* Start of a new packet: poll one CQE to get its length. */
+ len = mlx4_cq_poll_one(rxq, &cqe);
+ if (!len) {
+ rte_mbuf_raw_free(rep);
+ break;
+ }
+ if (unlikely(len < 0)) {
+ /* Rx error, packet is likely too large. */
+ rte_mbuf_raw_free(rep);
+ ++rxq->stats.idropped;
+ goto skip;
+ }
+ pkt = seg;
+ pkt->packet_type = 0;
+ pkt->ol_flags = 0;
+ pkt->pkt_len = len;
+ }
+ rep->nb_segs = 1;
+ rep->port = rxq->port_id;
+ rep->data_len = seg->data_len;
+ rep->data_off = seg->data_off;
+ (*rxq->elts)[idx] = rep;
+ /*
+ * Fill NIC descriptor with the new buffer. The lkey and size
+ * of the buffers are already known, only the buffer address
+ * changes.
+ */
+ scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
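+ /* More bytes remain: the packet continues in the next ring entry. */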
+ if (len > seg->data_len) {
+ len -= seg->data_len;
+ ++pkt->nb_segs;
+ ++rq_ci;
+ continue;
}
- /* Reconfigure sge to use rep instead of seg. */
- elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- assert(elt->sge.lkey == rxq->mr->lkey);
- elt->buf = rep;
- /* Update seg information. */
- seg->data_off = RTE_PKTMBUF_HEADROOM;
- seg->nb_segs = 1;
- seg->port = rxq->port_id;
- seg->next = NULL;
- seg->pkt_len = len;
+ /* The last segment. */
seg->data_len = len;
- seg->packet_type = 0;
- seg->ol_flags = 0;
+ /* Increment bytes counter. */
+ rxq->stats.ibytes += pkt->pkt_len;
/* Return packet. */
- *(pkts++) = seg;
- ++pkts_ret;
- /* Increase bytes counter. */
- rxq->stats.ibytes += len;
-repost:
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
+ *(pkts++) = pkt;
+ pkt = NULL;
+ --pkts_n;
+ ++i;
+skip:
+ /* Align consumer index to the next stride. */
+ rq_ci >>= sges_n;
+ ++rq_ci;
+ rq_ci <<= sges_n;
}
- if (unlikely(i == 0))
+ if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci))
return 0;
- /* Repost WRs. */
- *wr_next = NULL;
- assert(wr_head);
- ret = ibv_post_wq_recv(rxq->wq, wr_head, &wr_bad);
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_burst(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- rxq->elts_head = elts_head;
- /* Increase packets counter. */
- rxq->stats.ipackets += pkts_ret;
- return pkts_ret;
+ /* Update the consumer index. */
+ rxq->rq_ci = rq_ci >> sges_n;
+ rte_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
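+ /* Let the HW know how many CQEs have been consumed. */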
+ *rxq->mcq.set_ci_db = rte_cpu_to_be_32(rxq->mcq.cons_index & 0xffffff);
+ /* Increment packets counter. */
+ rxq->stats.ipackets += i;
+ return i;
}
/**