#pragma GCC diagnostic ignored "-pedantic"
#endif
#include <infiniband/verbs.h>
+#include <infiniband/mlx5_hw.h>
+#include <infiniband/arch.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
#include <rte_prefetch.h>
#include <rte_common.h>
#include <rte_branch_prediction.h>
-#include <rte_memory.h>
+#include <rte_ether.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-pedantic"
#endif
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
+#include "mlx5_prm.h"
+
+static inline volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe cqes[],
+ unsigned int cqes_n, uint16_t *ci)
+ __attribute__((always_inline));
+
+static inline int
+rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
+
+static volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe cqes[],
+ unsigned int cqes_n, uint16_t *ci)
+{
+ volatile struct mlx5_cqe64 *cqe;
+ uint16_t idx = *ci;
+ uint8_t op_own;
+
+ cqe = &cqes[idx & (cqes_n - 1)].cqe64;
+ op_own = cqe->op_own;
+ if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n))) {
+ return NULL;
+ } else if (unlikely(op_own & 0x80)) {
+ switch (op_own >> 4) {
+ case MLX5_CQE_INVALID:
+ return NULL; /* No CQE */
+ case MLX5_CQE_REQ_ERR:
+ return cqe;
+ case MLX5_CQE_RESP_ERR:
+ ++(*ci);
+ return NULL;
+ default:
+ return NULL;
+ }
+ }
+ if (cqe) {
+ *ci = idx + 1;
+ return cqe;
+ }
+ return NULL;
+}
/**
* Manage TX completions.
/**
* Translate RX completion flags to packet type.
*
- * @param flags
- * RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ * Pointer to CQE.
*
* @note: fix mlx5_dev_supported_ptypes_get() if any change here.
*
* Packet type for struct rte_mbuf.
*/
static inline uint32_t
-rxq_cq_to_pkt_type(uint32_t flags)
+rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
{
uint32_t pkt_type;
+ uint8_t flags = cqe->l4_hdr_type_etc;
+ uint8_t info = cqe->rsvd0[0];
- if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
+ if (info & IBV_EXP_CQ_RX_TUNNEL_PACKET)
pkt_type =
TRANSPOSE(flags,
IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
else
pkt_type =
TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV4_PACKET,
- RTE_PTYPE_L3_IPV4) |
+ MLX5_CQE_L3_HDR_TYPE_IPV6,
+ RTE_PTYPE_L3_IPV6) |
TRANSPOSE(flags,
- IBV_EXP_CQ_RX_IPV6_PACKET,
- RTE_PTYPE_L3_IPV6);
+ MLX5_CQE_L3_HDR_TYPE_IPV4,
+ RTE_PTYPE_L3_IPV4);
return pkt_type;
}
*
* @param[in] rxq
* Pointer to RX queue structure.
- * @param flags
- * RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ * Pointer to CQE.
*
* @return
* Offload flags (ol_flags) for struct rte_mbuf.
*/
static inline uint32_t
-rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
+rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
{
uint32_t ol_flags = 0;
+ uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
+ uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;
+ uint8_t info = cqe->rsvd0[0];
- if (rxq->csum) {
- /* Set IP checksum flag only for IPv4/IPv6 packets. */
- if (flags &
- (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET))
- ol_flags |=
- TRANSPOSE(~flags,
- IBV_EXP_CQ_RX_IP_CSUM_OK,
- PKT_RX_IP_CKSUM_BAD);
- /* Set L4 checksum flag only for TCP/UDP packets. */
- if (flags &
- (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
- ol_flags |=
- TRANSPOSE(~flags,
- IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
- PKT_RX_L4_CKSUM_BAD);
- }
+ if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
+ (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
+ ol_flags |=
+ (!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) *
+ PKT_RX_IP_CKSUM_BAD);
+ if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
+ (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
+ ol_flags |=
+ (!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) *
+ PKT_RX_L4_CKSUM_BAD);
/*
* PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
* of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
* (its value is 0).
*/
- if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
+ if ((info & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
ol_flags |=
- TRANSPOSE(~flags,
+ TRANSPOSE(~cqe->l4_hdr_type_etc,
IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
PKT_RX_IP_CKSUM_BAD) |
- TRANSPOSE(~flags,
+ TRANSPOSE(~cqe->l4_hdr_type_etc,
IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
PKT_RX_L4_CKSUM_BAD);
return ol_flags;
}
+/**
+ * Get size of the next packet.
+ *
+ * @param rxq
+ * RX queue to fetch packet from.
+ *
+ * @return
+ * Packet size in bytes.
+ */
+static inline int __attribute__((always_inline))
+rx_poll_len(struct rxq *rxq)
+{
+ volatile struct mlx5_cqe64 *cqe;
+
+ cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
+ if (cqe)
+ return ntohl(cqe->byte_cnt);
+ return 0;
+}
+
/**
* DPDK callback for RX.
*
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- struct ibv_sge sges[pkts_n];
- unsigned int i;
+ struct rxq *rxq = dpdk_rxq;
unsigned int pkts_ret = 0;
- int ret;
+ unsigned int i;
+ unsigned int rq_ci = rxq->rq_ci;
+ const unsigned int elts_n = rxq->elts_n;
+ const unsigned int wqe_cnt = elts_n - 1;
for (i = 0; (i != pkts_n); ++i) {
- struct rxq_elt *elt = &(*elts)[elts_head];
- unsigned int len;
- struct rte_mbuf *seg = elt->buf;
+ unsigned int idx = rq_ci & wqe_cnt;
struct rte_mbuf *rep;
- uint32_t flags;
- uint16_t vlan_tci;
+ struct rte_mbuf *pkt;
+ unsigned int len;
+ volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
+ volatile struct mlx5_cqe64 *cqe =
+ &(*rxq->cqes)[rxq->cq_ci & wqe_cnt].cqe64;
- /* Sanity checks. */
- assert(seg != NULL);
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_mbuf_prefetch_part1(seg);
- rte_mbuf_prefetch_part2(seg);
- ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
- if (unlikely(ret < 0)) {
- struct ibv_wc wc;
- int wcs_n;
-
- DEBUG("rxq=%p, poll_length() failed (ret=%d)",
- (void *)rxq, ret);
- /* ibv_poll_cq() must be used in case of failure. */
- wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
- if (unlikely(wcs_n == 0))
- break;
- if (unlikely(wcs_n < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
- (void *)rxq, wcs_n);
- break;
- }
- assert(wcs_n == 1);
- if (unlikely(wc.status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
- " completion status (%d): %s",
- (void *)rxq, wc.wr_id, wc.status,
- ibv_wc_status_str(wc.status));
-#ifdef MLX5_PMD_SOFT_COUNTERS
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
-#endif
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
- goto repost;
- }
- ret = wc.byte_len;
- }
- if (ret == 0)
- break;
- assert(ret >= (rxq->crc_present << 2));
- len = ret - (rxq->crc_present << 2);
+ pkt = (*rxq->elts)[idx];
+ rte_prefetch0(cqe);
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p: can't allocate a new mbuf",
- (void *)rxq);
- /* Increment out of memory counters. */
++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
+ break;
}
-
- /* Reconfigure sge to use rep instead of seg. */
- elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- elt->buf = rep;
-
- /* Add SGE to array for repost. */
- sges[i] = elt->sge;
-
- /* Update seg information. */
- SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
- NB_SEGS(seg) = 1;
- PORT(seg) = rxq->port_id;
- NEXT(seg) = NULL;
- PKT_LEN(seg) = len;
- DATA_LEN(seg) = len;
- if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
- seg->packet_type = rxq_cq_to_pkt_type(flags);
- seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
- if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
- seg->ol_flags |= PKT_RX_VLAN_PKT |
+ SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM);
+ NB_SEGS(rep) = 1;
+ PORT(rep) = rxq->port_id;
+ NEXT(rep) = NULL;
+ len = rx_poll_len(rxq);
+ if (unlikely(len == 0)) {
+ rte_mbuf_refcnt_set(rep, 0);
+ __rte_mbuf_raw_free(rep);
+ break;
+ }
+ /*
+ * Fill NIC descriptor with the new buffer. The lkey and size
+ * of the buffers are already known, only the buffer address
+ * changes.
+ */
+ wqe->addr = htonll((uintptr_t)rep->buf_addr +
+ RTE_PKTMBUF_HEADROOM);
+ (*rxq->elts)[idx] = rep;
+ /* Update pkt information. */
+ if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
+ rxq->crc_present) {
+ if (rxq->csum) {
+ pkt->packet_type = rxq_cq_to_pkt_type(cqe);
+ pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe);
+ }
+ if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) {
+ pkt->ol_flags |= PKT_RX_VLAN_PKT |
PKT_RX_VLAN_STRIPPED;
- seg->vlan_tci = vlan_tci;
+ pkt->vlan_tci = ntohs(cqe->vlan_info);
}
+ if (rxq->crc_present)
+ len -= ETHER_CRC_LEN;
}
- /* Return packet. */
- *(pkts++) = seg;
- ++pkts_ret;
+ PKT_LEN(pkt) = len;
+ DATA_LEN(pkt) = len;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment bytes counter. */
rxq->stats.ibytes += len;
#endif
-repost:
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
+ /* Return packet. */
+ *(pkts++) = pkt;
+ ++pkts_ret;
+ ++rq_ci;
}
- if (unlikely(i == 0))
+ if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
return 0;
/* Repost WRs. */
#ifdef DEBUG_RECV
DEBUG("%p: reposting %u WRs", (void *)rxq, i);
#endif
- ret = rxq->recv(rxq->wq, sges, i);
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_burst(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- rxq->elts_head = elts_head;
+ /* Update the consumer index. */
+ rxq->rq_ci = rq_ci;
+ rte_wmb();
+ *rxq->cq_db = htonl(rxq->cq_ci);
+ rte_wmb();
+ *rxq->rq_db = htonl(rxq->rq_ci);
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment packets counter. */
rxq->stats.ipackets += pkts_ret;