From: Moti Haimovsky Date: Thu, 12 Oct 2017 12:29:57 +0000 (+0200) Subject: net/mlx4: add Rx bypassing Verbs X-Git-Tag: spdx-start~1341 X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=6681b845034c3368f802a69b97e787cf187f1b55;p=dpdk.git net/mlx4: add Rx bypassing Verbs This patch adds support for accessing the hardware directly when handling Rx packets, eliminating the need to use Verbs in the Rx data path. Rx scatter support: calculate the number of scatter segments on the fly according to the maximum expected packet size. Signed-off-by: Vasily Philipov Signed-off-by: Moti Haimovsky Signed-off-by: Ophir Munk Acked-by: Adrien Mazarguil --- diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini index 9750ebfd7a..19ae6882a9 100644 --- a/doc/guides/nics/features/mlx4.ini +++ b/doc/guides/nics/features/mlx4.ini @@ -12,6 +12,7 @@ Rx interrupt = Y Queue start/stop = Y MTU update = Y Jumbo frame = Y +Scattered Rx = Y Promiscuous mode = Y Allmulticast mode = Y Unicast MAC filter = Y diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c index 483fe9b0db..39c83bcc57 100644 --- a/drivers/net/mlx4/mlx4_rxq.c +++ b/drivers/net/mlx4/mlx4_rxq.c @@ -51,6 +51,7 @@ #pragma GCC diagnostic error "-Wpedantic" #endif +#include #include #include #include @@ -312,45 +313,46 @@ void mlx4_rss_detach(struct mlx4_rss *rss) static int mlx4_rxq_alloc_elts(struct rxq *rxq) { - struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts; + const uint32_t elts_n = 1 << rxq->elts_n; + const uint32_t sges_n = 1 << rxq->sges_n; + struct rte_mbuf *(*elts)[elts_n] = rxq->elts; unsigned int i; - /* For each WR (packet). 
*/ + assert(rte_is_power_of_2(elts_n)); for (i = 0; i != RTE_DIM(*elts); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct ibv_recv_wr *wr = &elt->wr; - struct ibv_sge *sge = &(*elts)[i].sge; + volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[i]; struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp); if (buf == NULL) { while (i--) { - rte_pktmbuf_free_seg((*elts)[i].buf); - (*elts)[i].buf = NULL; + rte_pktmbuf_free_seg((*elts)[i]); + (*elts)[i] = NULL; } rte_errno = ENOMEM; return -rte_errno; } - elt->buf = buf; - wr->next = &(*elts)[(i + 1)].wr; - wr->sg_list = sge; - wr->num_sge = 1; /* Headroom is reserved by rte_pktmbuf_alloc(). */ assert(buf->data_off == RTE_PKTMBUF_HEADROOM); /* Buffer is supposed to be empty. */ assert(rte_pktmbuf_data_len(buf) == 0); assert(rte_pktmbuf_pkt_len(buf) == 0); - /* sge->addr must be able to store a pointer. */ - assert(sizeof(sge->addr) >= sizeof(uintptr_t)); - /* SGE keeps its headroom. */ - sge->addr = (uintptr_t) - ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM); - sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM); - sge->lkey = rxq->mr->lkey; - /* Redundant check for tailroom. */ - assert(sge->length == rte_pktmbuf_tailroom(buf)); + /* Only the first segment keeps headroom. */ + if (i % sges_n) + buf->data_off = 0; + buf->port = rxq->port_id; + buf->data_len = rte_pktmbuf_tailroom(buf); + buf->pkt_len = rte_pktmbuf_tailroom(buf); + buf->nb_segs = 1; + *scat = (struct mlx4_wqe_data_seg){ + .addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, + uintptr_t)), + .byte_count = rte_cpu_to_be_32(buf->data_len), + .lkey = rte_cpu_to_be_32(rxq->mr->lkey), + }; + (*elts)[i] = buf; } - /* The last WR pointer must be NULL. 
*/ - (*elts)[(i - 1)].wr.next = NULL; + DEBUG("%p: allocated and configured %u segments (max %u packets)", + (void *)rxq, elts_n, elts_n / sges_n); return 0; } @@ -364,14 +366,14 @@ static void mlx4_rxq_free_elts(struct rxq *rxq) { unsigned int i; - struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts; + struct rte_mbuf *(*elts)[1 << rxq->elts_n] = rxq->elts; - DEBUG("%p: freeing WRs", (void *)rxq); + DEBUG("%p: freeing Rx queue elements", (void *)rxq); for (i = 0; (i != RTE_DIM(*elts)); ++i) { - if (!(*elts)[i].buf) + if (!(*elts)[i]) continue; - rte_pktmbuf_free_seg((*elts)[i].buf); - (*elts)[i].buf = NULL; + rte_pktmbuf_free_seg((*elts)[i]); + (*elts)[i] = NULL; } } @@ -400,8 +402,11 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, struct rte_mempool *mp) { struct priv *priv = dev->data->dev_private; + struct mlx4dv_obj mlxdv; + struct mlx4dv_rwq dv_rwq; + struct mlx4dv_cq dv_cq; uint32_t mb_len = rte_pktmbuf_data_room_size(mp); - struct rxq_elt (*elts)[desc]; + struct rte_mbuf *(*elts)[rte_align32pow2(desc)]; struct rte_flow_error error; struct rxq *rxq; struct mlx4_malloc_vec vec[] = { @@ -439,6 +444,12 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, ERROR("%p: invalid number of Rx descriptors", (void *)dev); return -rte_errno; } + if (desc != RTE_DIM(*elts)) { + desc = RTE_DIM(*elts); + WARN("%p: increased number of descriptors in Rx queue %u" + " to the next power of two (%u)", + (void *)dev, idx, desc); + } /* Allocate and initialize Rx queue. 
*/ mlx4_zmallocv_socket("RXQ", vec, RTE_DIM(vec), socket); if (!rxq) { @@ -450,8 +461,8 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, .priv = priv, .mp = mp, .port_id = dev->data->port_id, - .elts_n = desc, - .elts_head = 0, + .sges_n = 0, + .elts_n = rte_log2_u32(desc), .elts = elts, .stats.idx = idx, .socket = socket, @@ -462,9 +473,29 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, (mb_len - RTE_PKTMBUF_HEADROOM)) { ; } else if (dev->data->dev_conf.rxmode.enable_scatter) { - WARN("%p: scattered mode has been requested but is" - " not supported, this may lead to packet loss", - (void *)dev); + uint32_t size = + RTE_PKTMBUF_HEADROOM + + dev->data->dev_conf.rxmode.max_rx_pkt_len; + uint32_t sges_n; + + /* + * Determine the number of SGEs needed for a full packet + * and round it to the next power of two. + */ + sges_n = rte_log2_u32((size / mb_len) + !!(size % mb_len)); + rxq->sges_n = sges_n; + /* Make sure sges_n did not overflow. 
*/ + size = mb_len * (1 << rxq->sges_n); + size -= RTE_PKTMBUF_HEADROOM; + if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) { + rte_errno = EOVERFLOW; + ERROR("%p: too many SGEs (%u) needed to handle" + " requested maximum packet size %u", + (void *)dev, + 1 << sges_n, + dev->data->dev_conf.rxmode.max_rx_pkt_len); + goto error; + } } else { WARN("%p: the requested maximum Rx packet size (%u) is" " larger than a single mbuf (%u) and scattered" @@ -473,6 +504,17 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, dev->data->dev_conf.rxmode.max_rx_pkt_len, mb_len - RTE_PKTMBUF_HEADROOM); } + DEBUG("%p: maximum number of segments per packet: %u", + (void *)dev, 1 << rxq->sges_n); + if (desc % (1 << rxq->sges_n)) { + rte_errno = EINVAL; + ERROR("%p: number of Rx queue descriptors (%u) is not a" + " multiple of maximum segments per packet (%u)", + (void *)dev, + desc, + 1 << rxq->sges_n); + goto error; + } /* Use the entire Rx mempool as the memory region. */ rxq->mr = mlx4_mp2mr(priv->pd, mp); if (!rxq->mr) { @@ -497,7 +539,8 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, goto error; } } - rxq->cq = ibv_create_cq(priv->ctx, desc, NULL, rxq->channel, 0); + rxq->cq = ibv_create_cq(priv->ctx, desc >> rxq->sges_n, NULL, + rxq->channel, 0); if (!rxq->cq) { rte_errno = ENOMEM; ERROR("%p: CQ creation failure: %s", @@ -508,8 +551,8 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, (priv->ctx, &(struct ibv_wq_init_attr){ .wq_type = IBV_WQT_RQ, - .max_wr = RTE_MIN(priv->device_attr.max_qp_wr, desc), - .max_sge = 1, + .max_wr = desc >> rxq->sges_n, + .max_sge = 1 << rxq->sges_n, .pd = priv->pd, .cq = rxq->cq, }); @@ -531,27 +574,43 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, (void *)dev, strerror(rte_errno)); goto error; } - ret = mlx4_rxq_alloc_elts(rxq); + /* Retrieve device queue information. 
*/ + mlxdv.cq.in = rxq->cq; + mlxdv.cq.out = &dv_cq; + mlxdv.rwq.in = rxq->wq; + mlxdv.rwq.out = &dv_rwq; + ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_RWQ | MLX4DV_OBJ_CQ); if (ret) { - ERROR("%p: RXQ allocation failed: %s", - (void *)dev, strerror(rte_errno)); + rte_errno = EINVAL; + ERROR("%p: failed to obtain device information", (void *)dev); goto error; } - ret = ibv_post_wq_recv(rxq->wq, &(*rxq->elts)[0].wr, - &(struct ibv_recv_wr *){ NULL }); + rxq->wqes = + (volatile struct mlx4_wqe_data_seg (*)[]) + ((uintptr_t)dv_rwq.buf.buf + dv_rwq.rq.offset); + rxq->rq_db = dv_rwq.rdb; + rxq->rq_ci = 0; + rxq->mcq.buf = dv_cq.buf.buf; + rxq->mcq.cqe_cnt = dv_cq.cqe_cnt; + rxq->mcq.set_ci_db = dv_cq.set_ci_db; + rxq->mcq.cqe_64 = (dv_cq.cqe_size & 64) ? 1 : 0; + ret = mlx4_rxq_alloc_elts(rxq); if (ret) { - rte_errno = ret; - ERROR("%p: ibv_post_recv() failed: %s", - (void *)dev, - strerror(rte_errno)); + ERROR("%p: RXQ allocation failed: %s", + (void *)dev, strerror(rte_errno)); goto error; } DEBUG("%p: adding Rx queue %p to list", (void *)dev, (void *)rxq); dev->data->rx_queues[idx] = rxq; /* Enable associated flows. */ ret = mlx4_flow_sync(priv, &error); - if (!ret) + if (!ret) { + /* Update doorbell counter. */ + rxq->rq_ci = desc >> rxq->sges_n; + rte_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); return 0; + } ERROR("cannot re-attach flow rules to queue %u" " (code %d, \"%s\"), flow error type %d, cause %p, message: %s", idx, -ret, strerror(-ret), error.type, error.cause, diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c index 38b87a01c8..cc0baaac62 100644 --- a/drivers/net/mlx4/mlx4_rxtx.c +++ b/drivers/net/mlx4/mlx4_rxtx.c @@ -538,9 +538,44 @@ stop: } /** - * DPDK callback for Rx. + * Poll one CQE from CQ. * - * The following function doesn't manage scattered packets. + * @param rxq + * Pointer to the receive queue structure. + * @param[out] out + * Just polled CQE. 
+ * + * @return + * Number of bytes of the CQE, 0 in case there is no completion. + */ +static unsigned int +mlx4_cq_poll_one(struct rxq *rxq, struct mlx4_cqe **out) +{ + int ret = 0; + struct mlx4_cqe *cqe = NULL; + struct mlx4_cq *cq = &rxq->mcq; + + cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index); + if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(cq->cons_index & cq->cqe_cnt)) + goto out; + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rte_rmb(); + assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)); + assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != + MLX4_CQE_OPCODE_ERROR); + ret = rte_be_to_cpu_32(cqe->byte_cnt); + ++cq->cons_index; +out: + *out = cqe; + return ret; +} + +/** + * DPDK callback for Rx with scattered packets support. * * @param dpdk_rxq * Generic pointer to Rx queue structure. @@ -555,112 +590,107 @@ stop: uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { - struct rxq *rxq = (struct rxq *)dpdk_rxq; - struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts; - const unsigned int elts_n = rxq->elts_n; - unsigned int elts_head = rxq->elts_head; - struct ibv_wc wcs[pkts_n]; - struct ibv_recv_wr *wr_head = NULL; - struct ibv_recv_wr **wr_next = &wr_head; - struct ibv_recv_wr *wr_bad = NULL; - unsigned int i; - unsigned int pkts_ret = 0; - int ret; + struct rxq *rxq = dpdk_rxq; + const uint32_t wr_cnt = (1 << rxq->elts_n) - 1; + const uint16_t sges_n = rxq->sges_n; + struct rte_mbuf *pkt = NULL; + struct rte_mbuf *seg = NULL; + unsigned int i = 0; + uint32_t rq_ci = rxq->rq_ci << sges_n; + int len = 0; - ret = ibv_poll_cq(rxq->cq, pkts_n, wcs); - if (unlikely(ret == 0)) - return 0; - if (unlikely(ret < 0)) { - DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)", - (void *)rxq, ret); - return 0; - } - assert(ret <= (int)pkts_n); - /* For each work completion. 
*/ - for (i = 0; i != (unsigned int)ret; ++i) { - struct ibv_wc *wc = &wcs[i]; - struct rxq_elt *elt = &(*elts)[elts_head]; - struct ibv_recv_wr *wr = &elt->wr; - uint32_t len = wc->byte_len; - struct rte_mbuf *seg = elt->buf; - struct rte_mbuf *rep; + while (pkts_n) { + struct mlx4_cqe *cqe; + uint32_t idx = rq_ci & wr_cnt; + struct rte_mbuf *rep = (*rxq->elts)[idx]; + volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx]; - /* Sanity checks. */ - assert(wr->sg_list == &elt->sge); - assert(wr->num_sge == 1); - assert(elts_head < rxq->elts_n); - assert(rxq->elts_head < rxq->elts_n); - /* - * Fetch initial bytes of packet descriptor into a - * cacheline while allocating rep. - */ - rte_mbuf_prefetch_part1(seg); - rte_mbuf_prefetch_part2(seg); - /* Link completed WRs together for repost. */ - *wr_next = wr; - wr_next = &wr->next; - if (unlikely(wc->status != IBV_WC_SUCCESS)) { - /* Whatever, just repost the offending WR. */ - DEBUG("rxq=%p: bad work completion status (%d): %s", - (void *)rxq, wc->status, - ibv_wc_status_str(wc->status)); - /* Increment dropped packets counter. */ - ++rxq->stats.idropped; - goto repost; - } + /* Update the 'next' pointer of the previous segment. */ + if (pkt) + seg->next = rep; + seg = rep; + rte_prefetch0(seg); + rte_prefetch0(scat); rep = rte_mbuf_raw_alloc(rxq->mp); if (unlikely(rep == NULL)) { - /* - * Unable to allocate a replacement mbuf, - * repost WR. - */ - DEBUG("rxq=%p: can't allocate a new mbuf", - (void *)rxq); - /* Increase out of memory counters. */ ++rxq->stats.rx_nombuf; - ++rxq->priv->dev->data->rx_mbuf_alloc_failed; - goto repost; + if (!pkt) { + /* + * No buffers before we even started, + * bail out silently. + */ + break; + } + while (pkt != seg) { + assert(pkt != (*rxq->elts)[idx]); + rep = pkt->next; + pkt->next = NULL; + pkt->nb_segs = 1; + rte_mbuf_raw_free(pkt); + pkt = rep; + } + break; + } + if (!pkt) { + /* Looking for the new packet. 
*/ + len = mlx4_cq_poll_one(rxq, &cqe); + if (!len) { + rte_mbuf_raw_free(rep); + break; + } + if (unlikely(len < 0)) { + /* Rx error, packet is likely too large. */ + rte_mbuf_raw_free(rep); + ++rxq->stats.idropped; + goto skip; + } + pkt = seg; + pkt->packet_type = 0; + pkt->ol_flags = 0; + pkt->pkt_len = len; + } + rep->nb_segs = 1; + rep->port = rxq->port_id; + rep->data_len = seg->data_len; + rep->data_off = seg->data_off; + (*rxq->elts)[idx] = rep; + /* + * Fill NIC descriptor with the new buffer. The lkey and size + * of the buffers are already known, only the buffer address + * changes. + */ + scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t)); + if (len > seg->data_len) { + len -= seg->data_len; + ++pkt->nb_segs; + ++rq_ci; + continue; } - /* Reconfigure sge to use rep instead of seg. */ - elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM; - assert(elt->sge.lkey == rxq->mr->lkey); - elt->buf = rep; - /* Update seg information. */ - seg->data_off = RTE_PKTMBUF_HEADROOM; - seg->nb_segs = 1; - seg->port = rxq->port_id; - seg->next = NULL; - seg->pkt_len = len; + /* The last segment. */ seg->data_len = len; - seg->packet_type = 0; - seg->ol_flags = 0; + /* Increment bytes counter. */ + rxq->stats.ibytes += pkt->pkt_len; /* Return packet. */ - *(pkts++) = seg; - ++pkts_ret; - /* Increase bytes counter. */ - rxq->stats.ibytes += len; -repost: - if (++elts_head >= elts_n) - elts_head = 0; - continue; + *(pkts++) = pkt; + pkt = NULL; + --pkts_n; + ++i; +skip: + /* Align consumer index to the next stride. */ + rq_ci >>= sges_n; + ++rq_ci; + rq_ci <<= sges_n; } - if (unlikely(i == 0)) + if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci)) return 0; - /* Repost WRs. */ - *wr_next = NULL; - assert(wr_head); - ret = ibv_post_wq_recv(rxq->wq, wr_head, &wr_bad); - if (unlikely(ret)) { - /* Inability to repost WRs is fatal. 
*/ - DEBUG("%p: recv_burst(): failed (ret=%d)", - (void *)rxq->priv, - ret); - abort(); - } - rxq->elts_head = elts_head; - /* Increase packets counter. */ - rxq->stats.ipackets += pkts_ret; - return pkts_ret; + /* Update the consumer index. */ + rxq->rq_ci = rq_ci >> sges_n; + rte_wmb(); + *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci); + *rxq->mcq.set_ci_db = rte_cpu_to_be_32(rxq->mcq.cons_index & 0xffffff); + /* Increment packets counter. */ + rxq->stats.ipackets += i; + return i; } /** diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h index ff271262ed..fa5738f232 100644 --- a/drivers/net/mlx4/mlx4_rxtx.h +++ b/drivers/net/mlx4/mlx4_rxtx.h @@ -63,13 +63,6 @@ struct mlx4_rxq_stats { uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */ }; -/** Rx element. */ -struct rxq_elt { - struct ibv_recv_wr wr; /**< Work request. */ - struct ibv_sge sge; /**< Scatter/gather element. */ - struct rte_mbuf *buf; /**< Buffer. */ -}; - /** Rx queue descriptor. */ struct rxq { struct priv *priv; /**< Back pointer to private data. */ @@ -78,10 +71,14 @@ struct rxq { struct ibv_cq *cq; /**< Completion queue. */ struct ibv_wq *wq; /**< Work queue. */ struct ibv_comp_channel *channel; /**< Rx completion channel. */ - unsigned int port_id; /**< Port ID for incoming packets. */ - unsigned int elts_n; /**< (*elts)[] length. */ - unsigned int elts_head; /**< Current index in (*elts)[]. */ - struct rxq_elt (*elts)[]; /**< Rx elements. */ + uint16_t rq_ci; /**< Saved RQ consumer index. */ + uint16_t port_id; /**< Port ID for incoming packets. */ + uint16_t sges_n; /**< Number of segments per packet (log2 value). */ + uint16_t elts_n; /**< Mbuf queue size (log2 value). */ + struct rte_mbuf *(*elts)[]; /**< Rx elements. */ + volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */ + volatile uint32_t *rq_db; /**< RQ doorbell record. */ + struct mlx4_cq mcq; /**< Info for directly manipulating the CQ. 
*/ struct mlx4_rxq_stats stats; /**< Rx queue counters. */ unsigned int socket; /**< CPU socket ID for allocations. */ uint8_t data[]; /**< Remaining queue resources. */