net/mlx5: refactor Rx data path
[dpdk.git] / drivers / net / mlx5 / mlx5_rxtx.c
index a6b0cf5..87d09e9 100644 (file)
@@ -42,6 +42,8 @@
 #pragma GCC diagnostic ignored "-pedantic"
 #endif
 #include <infiniband/verbs.h>
+#include <infiniband/mlx5_hw.h>
+#include <infiniband/arch.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
@@ -55,7 +57,7 @@
 #include <rte_prefetch.h>
 #include <rte_common.h>
 #include <rte_branch_prediction.h>
-#include <rte_memory.h>
+#include <rte_ether.h>
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-pedantic"
 #endif
 #include "mlx5_rxtx.h"
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
+#include "mlx5_prm.h"
+
+/**
+ * Get a pointer to the next CQE owned by software, if any.
+ *
+ * @param[in] cqes
+ *   CQ ring.
+ * @param cqes_n
+ *   Number of entries in the CQ ring (must be a power of two).
+ * @param[in, out] ci
+ *   Consumer index, incremented when a CQE is consumed.
+ *
+ * @return
+ *   Pointer to the CQE, NULL when no CQE is available (or on a
+ *   responder error, whose CQE is consumed silently).
+ */
+static inline volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe cqes[],
+         unsigned int cqes_n, uint16_t *ci)
+         __attribute__((always_inline));
+
+static inline int
+rx_poll_len(struct rxq *rxq) __attribute__((always_inline));
+
+static volatile struct mlx5_cqe64 *
+get_cqe64(volatile struct mlx5_cqe cqes[],
+         unsigned int cqes_n, uint16_t *ci)
+{
+       volatile struct mlx5_cqe64 *cqe;
+       uint16_t idx = *ci;
+       uint8_t op_own;
+
+       cqe = &cqes[idx & (cqes_n - 1)].cqe64;
+       op_own = cqe->op_own;
+       /*
+        * The ownership bit toggles on each ring wrap; a mismatch with the
+        * wrap parity of the consumer index means hardware has not written
+        * this CQE yet.
+        */
+       if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(idx & cqes_n)))
+               return NULL;
+       /* Opcodes with the high bit set are invalid/error completions. */
+       if (unlikely(op_own & 0x80)) {
+               switch (op_own >> 4) {
+               case MLX5_CQE_INVALID:
+                       return NULL; /* No CQE. */
+               case MLX5_CQE_REQ_ERR:
+                       return cqe;
+               case MLX5_CQE_RESP_ERR:
+                       /* Consume the errored CQE and report none. */
+                       ++(*ci);
+                       return NULL;
+               default:
+                       return NULL;
+               }
+       }
+       /*
+        * cqe points into the ring and can never be NULL here; the former
+        * "if (cqe)" tail check was dead code and has been dropped.
+        */
+       *ci = idx + 1;
+       return cqe;
+}
 
 /**
  * Manage TX completions.
@@ -390,8 +433,8 @@ stop:
 /**
  * Translate RX completion flags to packet type.
  *
- * @param flags
- *   RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ *   Pointer to CQE.
  *
  * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
  *
@@ -399,11 +442,13 @@ stop:
  *   Packet type for struct rte_mbuf.
  */
 static inline uint32_t
-rxq_cq_to_pkt_type(uint32_t flags)
+rxq_cq_to_pkt_type(volatile struct mlx5_cqe64 *cqe)
 {
        uint32_t pkt_type;
+       uint8_t flags = cqe->l4_hdr_type_etc;
+       uint8_t info = cqe->rsvd0[0];
 
-       if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
+       if (info & IBV_EXP_CQ_RX_TUNNEL_PACKET)
                pkt_type =
                        TRANSPOSE(flags,
                                  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
@@ -420,11 +465,11 @@ rxq_cq_to_pkt_type(uint32_t flags)
        else
                pkt_type =
                        TRANSPOSE(flags,
-                                 IBV_EXP_CQ_RX_IPV4_PACKET,
-                                 RTE_PTYPE_L3_IPV4) |
+                                 MLX5_CQE_L3_HDR_TYPE_IPV6,
+                                 RTE_PTYPE_L3_IPV6) |
                        TRANSPOSE(flags,
-                                 IBV_EXP_CQ_RX_IPV6_PACKET,
-                                 RTE_PTYPE_L3_IPV6);
+                                 MLX5_CQE_L3_HDR_TYPE_IPV4,
+                                 RTE_PTYPE_L3_IPV4);
        return pkt_type;
 }
 
@@ -433,49 +478,68 @@ rxq_cq_to_pkt_type(uint32_t flags)
  *
  * @param[in] rxq
  *   Pointer to RX queue structure.
- * @param flags
- *   RX completion flags returned by poll_length_flags().
+ * @param[in] cqe
+ *   Pointer to CQE.
  *
  * @return
  *   Offload flags (ol_flags) for struct rte_mbuf.
  */
 static inline uint32_t
-rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
+rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe64 *cqe)
 {
        uint32_t ol_flags = 0;
+       /* l4_hdr_type_etc carries both the L3 and L4 header type fields. */
+       uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
+       uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;
+       /*
+        * NOTE(review): the tunnel-info byte is read from rsvd0[0];
+        * confirm this matches the CQE layout in mlx5_prm.h.
+        */
+       uint8_t info = cqe->rsvd0[0];
 
-       if (rxq->csum) {
-               /* Set IP checksum flag only for IPv4/IPv6 packets. */
-               if (flags &
-                   (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET))
-                       ol_flags |=
-                               TRANSPOSE(~flags,
-                                       IBV_EXP_CQ_RX_IP_CSUM_OK,
-                                       PKT_RX_IP_CKSUM_BAD);
-               /* Set L4 checksum flag only for TCP/UDP packets. */
-               if (flags &
-                   (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
-                       ol_flags |=
-                               TRANSPOSE(~flags,
-                                       IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
-                                       PKT_RX_L4_CKSUM_BAD);
-       }
+       /* Report a bad IP checksum only for IPv4/IPv6 packets. */
+       if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
+           (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
+               ol_flags |=
+                       (!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) *
+                        PKT_RX_IP_CKSUM_BAD);
+       /* Report a bad L4 checksum only for TCP/UDP packets. */
+       if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
+           (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
+           (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
+           (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
+               ol_flags |=
+                       (!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) *
+                        PKT_RX_L4_CKSUM_BAD);
+       /*
+        * NOTE(review): IBV_EXP_CQ_RX_* below are verbs completion-flag
+        * values applied to raw CQE bytes (info, l4_hdr_type_etc); confirm
+        * the bit positions actually line up with the hardware fields.
+        */
        /*
         * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
         * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
         * (its value is 0).
         */
-       if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
+       if ((info & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
                ol_flags |=
-                       TRANSPOSE(~flags,
+                       TRANSPOSE(~cqe->l4_hdr_type_etc,
                                  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
                                  PKT_RX_IP_CKSUM_BAD) |
-                       TRANSPOSE(~flags,
+                       TRANSPOSE(~cqe->l4_hdr_type_etc,
                                  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
                                  PKT_RX_L4_CKSUM_BAD);
        return ol_flags;
 }
 
+/**
+ * Get size of the next packet, consuming its CQE.
+ *
+ * @param rxq
+ *   RX queue to fetch packet from.
+ *
+ * @return
+ *   Packet size in bytes as reported by the CQE, 0 when no completion
+ *   is available.
+ */
+static inline int __attribute__((always_inline))
+rx_poll_len(struct rxq *rxq)
+{
+       volatile struct mlx5_cqe64 *cqe;
+
+       /* get_cqe64() advances rxq->cq_ci when it consumes a CQE. */
+       cqe = get_cqe64(*rxq->cqes, rxq->elts_n, &rxq->cq_ci);
+       if (cqe)
+               /* byte_cnt is stored big-endian in the CQE. */
+               return ntohl(cqe->byte_cnt);
+       return 0;
+}
+
 /**
  * DPDK callback for RX.
  *
@@ -492,134 +556,85 @@ rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
 uint16_t
 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-       struct rxq *rxq = (struct rxq *)dpdk_rxq;
-       struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
-       const unsigned int elts_n = rxq->elts_n;
-       unsigned int elts_head = rxq->elts_head;
-       struct ibv_sge sges[pkts_n];
-       unsigned int i;
+       struct rxq *rxq = dpdk_rxq;
        unsigned int pkts_ret = 0;
-       int ret;
+       unsigned int i;
+       unsigned int rq_ci = rxq->rq_ci;
+       const unsigned int elts_n = rxq->elts_n;
+       const unsigned int wqe_cnt = elts_n - 1;
 
        for (i = 0; (i != pkts_n); ++i) {
-               struct rxq_elt *elt = &(*elts)[elts_head];
-               unsigned int len;
-               struct rte_mbuf *seg = elt->buf;
+               unsigned int idx = rq_ci & wqe_cnt;
                struct rte_mbuf *rep;
-               uint32_t flags;
-               uint16_t vlan_tci;
+               struct rte_mbuf *pkt;
+               unsigned int len;
+               volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
+               volatile struct mlx5_cqe64 *cqe =
+                       &(*rxq->cqes)[rxq->cq_ci & wqe_cnt].cqe64;
 
-               /* Sanity checks. */
-               assert(seg != NULL);
-               assert(elts_head < rxq->elts_n);
-               assert(rxq->elts_head < rxq->elts_n);
-               /*
-                * Fetch initial bytes of packet descriptor into a
-                * cacheline while allocating rep.
-                */
-               rte_mbuf_prefetch_part1(seg);
-               rte_mbuf_prefetch_part2(seg);
-               ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
-               if (unlikely(ret < 0)) {
-                       struct ibv_wc wc;
-                       int wcs_n;
-
-                       DEBUG("rxq=%p, poll_length() failed (ret=%d)",
-                             (void *)rxq, ret);
-                       /* ibv_poll_cq() must be used in case of failure. */
-                       wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
-                       if (unlikely(wcs_n == 0))
-                               break;
-                       if (unlikely(wcs_n < 0)) {
-                               DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
-                                     (void *)rxq, wcs_n);
-                               break;
-                       }
-                       assert(wcs_n == 1);
-                       if (unlikely(wc.status != IBV_WC_SUCCESS)) {
-                               /* Whatever, just repost the offending WR. */
-                               DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
-                                     " completion status (%d): %s",
-                                     (void *)rxq, wc.wr_id, wc.status,
-                                     ibv_wc_status_str(wc.status));
-#ifdef MLX5_PMD_SOFT_COUNTERS
-                               /* Increment dropped packets counter. */
-                               ++rxq->stats.idropped;
-#endif
-                               /* Add SGE to array for repost. */
-                               sges[i] = elt->sge;
-                               goto repost;
-                       }
-                       ret = wc.byte_len;
-               }
-               if (ret == 0)
-                       break;
-               assert(ret >= (rxq->crc_present << 2));
-               len = ret - (rxq->crc_present << 2);
+               pkt = (*rxq->elts)[idx];
+               rte_prefetch0(cqe);
                rep = rte_mbuf_raw_alloc(rxq->mp);
                if (unlikely(rep == NULL)) {
-                       /*
-                        * Unable to allocate a replacement mbuf,
-                        * repost WR.
-                        */
-                       DEBUG("rxq=%p: can't allocate a new mbuf",
-                             (void *)rxq);
-                       /* Increment out of memory counters. */
                        ++rxq->stats.rx_nombuf;
-                       ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-                       goto repost;
+                       break;
                }
-
-               /* Reconfigure sge to use rep instead of seg. */
-               elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-               elt->buf = rep;
-
-               /* Add SGE to array for repost. */
-               sges[i] = elt->sge;
-
-               /* Update seg information. */
-               SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
-               NB_SEGS(seg) = 1;
-               PORT(seg) = rxq->port_id;
-               NEXT(seg) = NULL;
-               PKT_LEN(seg) = len;
-               DATA_LEN(seg) = len;
-               if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
-                       seg->packet_type = rxq_cq_to_pkt_type(flags);
-                       seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
-                       if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
-                               seg->ol_flags |= PKT_RX_VLAN_PKT |
+               SET_DATA_OFF(rep, RTE_PKTMBUF_HEADROOM);
+               NB_SEGS(rep) = 1;
+               PORT(rep) = rxq->port_id;
+               NEXT(rep) = NULL;
+               len = rx_poll_len(rxq);
+               if (unlikely(len == 0)) {
+                       rte_mbuf_refcnt_set(rep, 0);
+                       __rte_mbuf_raw_free(rep);
+                       break;
+               }
+               /*
+                * Fill NIC descriptor with the new buffer.  The lkey and size
+                * of the buffers are already known, only the buffer address
+                * changes.
+                */
+               wqe->addr = htonll((uintptr_t)rep->buf_addr +
+                                  RTE_PKTMBUF_HEADROOM);
+               (*rxq->elts)[idx] = rep;
+               /* Update pkt information. */
+               if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip |
+                   rxq->crc_present) {
+                       if (rxq->csum) {
+                               pkt->packet_type = rxq_cq_to_pkt_type(cqe);
+                               pkt->ol_flags = rxq_cq_to_ol_flags(rxq, cqe);
+                       }
+                       if (cqe->l4_hdr_type_etc & MLX5_CQE_VLAN_STRIPPED) {
+                               pkt->ol_flags |= PKT_RX_VLAN_PKT |
                                        PKT_RX_VLAN_STRIPPED;
-                               seg->vlan_tci = vlan_tci;
+                               pkt->vlan_tci = ntohs(cqe->vlan_info);
                        }
+                       if (rxq->crc_present)
+                               len -= ETHER_CRC_LEN;
                }
-               /* Return packet. */
-               *(pkts++) = seg;
-               ++pkts_ret;
+               PKT_LEN(pkt) = len;
+               DATA_LEN(pkt) = len;
 #ifdef MLX5_PMD_SOFT_COUNTERS
                /* Increment bytes counter. */
                rxq->stats.ibytes += len;
 #endif
-repost:
-               if (++elts_head >= elts_n)
-                       elts_head = 0;
-               continue;
+               /* Return packet. */
+               *(pkts++) = pkt;
+               ++pkts_ret;
+               ++rq_ci;
        }
-       if (unlikely(i == 0))
+       if (unlikely((i == 0) && (rq_ci == rxq->rq_ci)))
                return 0;
        /* Repost WRs. */
 #ifdef DEBUG_RECV
        DEBUG("%p: reposting %u WRs", (void *)rxq, i);
 #endif
-       ret = rxq->recv(rxq->wq, sges, i);
-       if (unlikely(ret)) {
-               /* Inability to repost WRs is fatal. */
-               DEBUG("%p: recv_burst(): failed (ret=%d)",
-                     (void *)rxq->priv,
-                     ret);
-               abort();
-       }
-       rxq->elts_head = elts_head;
+       /* Update the consumer index. */
+       rxq->rq_ci = rq_ci;
+       rte_wmb();
+       *rxq->cq_db = htonl(rxq->cq_ci);
+       rte_wmb();
+       *rxq->rq_db = htonl(rxq->rq_ci);
 #ifdef MLX5_PMD_SOFT_COUNTERS
        /* Increment packets counter. */
        rxq->stats.ipackets += pkts_ret;