net/mlx4: separate Rx/Tx functions
author Adrien Mazarguil <adrien.mazarguil@6wind.com>
Fri, 1 Sep 2017 08:06:57 +0000 (10:06 +0200)
committer Ferruh Yigit <ferruh.yigit@intel.com>
Fri, 6 Oct 2017 00:49:48 +0000 (02:49 +0200)
This commit groups all data plane functions (Rx/Tx) into a separate file
and adjusts header files accordingly.

Private functions are now prefixed with "mlx4_" to prevent them from
conflicting with their mlx5 PMD counterparts at link time.

No impact on functionality.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
drivers/net/mlx4/Makefile
drivers/net/mlx4/mlx4.c
drivers/net/mlx4/mlx4.h
drivers/net/mlx4/mlx4_rxtx.c [new file with mode: 0644]
drivers/net/mlx4/mlx4_rxtx.h

index f6e3001..8def32a 100644 (file)
@@ -38,6 +38,7 @@ LIB = librte_pmd_mlx4.a
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_flow.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_utils.c
 
 # Basic CFLAGS.
index ba06075..a409ec2 100644 (file)
 #include <rte_mbuf.h>
 #include <rte_errno.h>
 #include <rte_mempool.h>
-#include <rte_prefetch.h>
 #include <rte_malloc.h>
 #include <rte_memory.h>
 #include <rte_flow.h>
 #include <rte_kvargs.h>
 #include <rte_interrupts.h>
-#include <rte_branch_prediction.h>
 #include <rte_common.h>
 
 /* Generated configuration header. */
@@ -505,9 +503,6 @@ mlx4_dev_configure(struct rte_eth_dev *dev)
        return 0;
 }
 
-static uint16_t mlx4_tx_burst(void *, struct rte_mbuf **, uint16_t);
-static uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
-
 /* TX queues handling. */
 
 /**
@@ -630,53 +625,6 @@ txq_cleanup(struct txq *txq)
        memset(txq, 0, sizeof(*txq));
 }
 
-/**
- * Manage TX completions.
- *
- * When sending a burst, mlx4_tx_burst() posts several WRs.
- * To improve performance, a completion event is only required once every
- * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
- * for other WRs, but this information would not be used anyway.
- *
- * @param txq
- *   Pointer to TX queue structure.
- *
- * @return
- *   0 on success, -1 on failure.
- */
-static int
-txq_complete(struct txq *txq)
-{
-       unsigned int elts_comp = txq->elts_comp;
-       unsigned int elts_tail = txq->elts_tail;
-       const unsigned int elts_n = txq->elts_n;
-       struct ibv_wc wcs[elts_comp];
-       int wcs_n;
-
-       if (unlikely(elts_comp == 0))
-               return 0;
-       wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
-       if (unlikely(wcs_n == 0))
-               return 0;
-       if (unlikely(wcs_n < 0)) {
-               DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
-                     (void *)txq, wcs_n);
-               return -1;
-       }
-       elts_comp -= wcs_n;
-       assert(elts_comp <= txq->elts_comp);
-       /*
-        * Assume WC status is successful as nothing can be done about it
-        * anyway.
-        */
-       elts_tail += wcs_n * txq->elts_comp_cd_init;
-       if (elts_tail >= elts_n)
-               elts_tail -= elts_n;
-       txq->elts_tail = elts_tail;
-       txq->elts_comp = elts_comp;
-       return 0;
-}
-
 struct mlx4_check_mempool_data {
        int ret;
        char *start;
@@ -738,10 +686,6 @@ static int mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start,
        return data.ret;
 }
 
-/* For best performance, this function should not be inlined. */
-static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
-       __rte_noinline;
-
 /**
  * Register mempool as a memory region.
  *
@@ -753,7 +697,7 @@ static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
  * @return
  *   Memory region pointer, NULL in case of error and rte_errno is set.
  */
-static struct ibv_mr *
+struct ibv_mr *
 mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
 {
        const struct rte_memseg *ms = rte_eal_get_physmem_layout();
@@ -794,81 +738,6 @@ mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
        return mr;
 }
 
-/**
- * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
- *
- * @param buf
- *   Pointer to mbuf.
- *
- * @return
- *   Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-txq_mb2mp(struct rte_mbuf *buf)
-{
-       if (unlikely(RTE_MBUF_INDIRECT(buf)))
-               return rte_mbuf_from_indirect(buf)->pool;
-       return buf->pool;
-}
-
-/**
- * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
- *
- * @param txq
- *   Pointer to TX queue structure.
- * @param[in] mp
- *   Memory Pool for which a Memory Region lkey must be returned.
- *
- * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
- */
-static uint32_t
-txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-{
-       unsigned int i;
-       struct ibv_mr *mr;
-
-       for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-               if (unlikely(txq->mp2mr[i].mp == NULL)) {
-                       /* Unknown MP, add a new MR for it. */
-                       break;
-               }
-               if (txq->mp2mr[i].mp == mp) {
-                       assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-                       assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-                       return txq->mp2mr[i].lkey;
-               }
-       }
-       /* Add a new entry, register MR first. */
-       DEBUG("%p: discovered new memory pool \"%s\" (%p)",
-             (void *)txq, mp->name, (void *)mp);
-       mr = mlx4_mp2mr(txq->priv->pd, mp);
-       if (unlikely(mr == NULL)) {
-               DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
-                     (void *)txq);
-               return (uint32_t)-1;
-       }
-       if (unlikely(i == RTE_DIM(txq->mp2mr))) {
-               /* Table is full, remove oldest entry. */
-               DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
-                     (void *)txq);
-               --i;
-               claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
-               memmove(&txq->mp2mr[0], &txq->mp2mr[1],
-                       (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
-       }
-       /* Store the new entry. */
-       txq->mp2mr[i].mp = mp;
-       txq->mp2mr[i].mr = mr;
-       txq->mp2mr[i].lkey = mr->lkey;
-       DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
-             (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
-       return txq->mp2mr[i].lkey;
-}
-
 struct txq_mp2mr_mbuf_check_data {
        int ret;
 };
@@ -923,172 +792,7 @@ txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
        if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
                        data.ret == -1)
                return;
-       txq_mp2mr(txq, mp);
-}
-
-/**
- * DPDK callback for TX.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-       struct txq *txq = (struct txq *)dpdk_txq;
-       struct ibv_send_wr *wr_head = NULL;
-       struct ibv_send_wr **wr_next = &wr_head;
-       struct ibv_send_wr *wr_bad = NULL;
-       unsigned int elts_head = txq->elts_head;
-       const unsigned int elts_n = txq->elts_n;
-       unsigned int elts_comp_cd = txq->elts_comp_cd;
-       unsigned int elts_comp = 0;
-       unsigned int i;
-       unsigned int max;
-       int err;
-
-       assert(elts_comp_cd != 0);
-       txq_complete(txq);
-       max = (elts_n - (elts_head - txq->elts_tail));
-       if (max > elts_n)
-               max -= elts_n;
-       assert(max >= 1);
-       assert(max <= elts_n);
-       /* Always leave one free entry in the ring. */
-       --max;
-       if (max == 0)
-               return 0;
-       if (max > pkts_n)
-               max = pkts_n;
-       for (i = 0; (i != max); ++i) {
-               struct rte_mbuf *buf = pkts[i];
-               unsigned int elts_head_next =
-                       (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
-               struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
-               struct txq_elt *elt = &(*txq->elts)[elts_head];
-               struct ibv_send_wr *wr = &elt->wr;
-               unsigned int segs = buf->nb_segs;
-               unsigned int sent_size = 0;
-               uint32_t send_flags = 0;
-
-               /* Clean up old buffer. */
-               if (likely(elt->buf != NULL)) {
-                       struct rte_mbuf *tmp = elt->buf;
-
-#ifndef NDEBUG
-                       /* Poisoning. */
-                       memset(elt, 0x66, sizeof(*elt));
-#endif
-                       /* Faster than rte_pktmbuf_free(). */
-                       do {
-                               struct rte_mbuf *next = tmp->next;
-
-                               rte_pktmbuf_free_seg(tmp);
-                               tmp = next;
-                       } while (tmp != NULL);
-               }
-               /* Request TX completion. */
-               if (unlikely(--elts_comp_cd == 0)) {
-                       elts_comp_cd = txq->elts_comp_cd_init;
-                       ++elts_comp;
-                       send_flags |= IBV_SEND_SIGNALED;
-               }
-               if (likely(segs == 1)) {
-                       struct ibv_sge *sge = &elt->sge;
-                       uintptr_t addr;
-                       uint32_t length;
-                       uint32_t lkey;
-
-                       /* Retrieve buffer information. */
-                       addr = rte_pktmbuf_mtod(buf, uintptr_t);
-                       length = buf->data_len;
-                       /* Retrieve Memory Region key for this memory pool. */
-                       lkey = txq_mp2mr(txq, txq_mb2mp(buf));
-                       if (unlikely(lkey == (uint32_t)-1)) {
-                               /* MR does not exist. */
-                               DEBUG("%p: unable to get MP <-> MR"
-                                     " association", (void *)txq);
-                               /* Clean up TX element. */
-                               elt->buf = NULL;
-                               goto stop;
-                       }
-                       /* Update element. */
-                       elt->buf = buf;
-                       if (txq->priv->vf)
-                               rte_prefetch0((volatile void *)
-                                             (uintptr_t)addr);
-                       RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-                       sge->addr = addr;
-                       sge->length = length;
-                       sge->lkey = lkey;
-                       sent_size += length;
-               } else {
-                       err = -1;
-                       goto stop;
-               }
-               if (sent_size <= txq->max_inline)
-                       send_flags |= IBV_SEND_INLINE;
-               elts_head = elts_head_next;
-               /* Increment sent bytes counter. */
-               txq->stats.obytes += sent_size;
-               /* Set up WR. */
-               wr->sg_list = &elt->sge;
-               wr->num_sge = segs;
-               wr->opcode = IBV_WR_SEND;
-               wr->send_flags = send_flags;
-               *wr_next = wr;
-               wr_next = &wr->next;
-       }
-stop:
-       /* Take a shortcut if nothing must be sent. */
-       if (unlikely(i == 0))
-               return 0;
-       /* Increment sent packets counter. */
-       txq->stats.opackets += i;
-       /* Ring QP doorbell. */
-       *wr_next = NULL;
-       assert(wr_head);
-       err = ibv_post_send(txq->qp, wr_head, &wr_bad);
-       if (unlikely(err)) {
-               uint64_t obytes = 0;
-               uint64_t opackets = 0;
-
-               /* Rewind bad WRs. */
-               while (wr_bad != NULL) {
-                       int j;
-
-                       /* Force completion request if one was lost. */
-                       if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
-                               elts_comp_cd = 1;
-                               --elts_comp;
-                       }
-                       ++opackets;
-                       for (j = 0; j < wr_bad->num_sge; ++j)
-                               obytes += wr_bad->sg_list[j].length;
-                       elts_head = (elts_head ? elts_head : elts_n) - 1;
-                       wr_bad = wr_bad->next;
-               }
-               txq->stats.opackets -= opackets;
-               txq->stats.obytes -= obytes;
-               i -= opackets;
-               DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
-                     " (%" PRIu64 " bytes) rejected: %s",
-                     (void *)txq,
-                     opackets,
-                     obytes,
-                     (err <= -1) ? "Internal error" : strerror(err));
-       }
-       txq->elts_head = elts_head;
-       txq->elts_comp += elts_comp;
-       txq->elts_comp_cd = elts_comp_cd;
-       return i;
+       mlx4_txq_mp2mr(txq, mp);
 }
 
 /**
@@ -1545,132 +1249,6 @@ rxq_cleanup(struct rxq *rxq)
        memset(rxq, 0, sizeof(*rxq));
 }
 
-/**
- * DPDK callback for RX.
- *
- * The following function doesn't manage scattered packets.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-       struct rxq *rxq = (struct rxq *)dpdk_rxq;
-       struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
-       const unsigned int elts_n = rxq->elts_n;
-       unsigned int elts_head = rxq->elts_head;
-       struct ibv_wc wcs[pkts_n];
-       struct ibv_recv_wr *wr_head = NULL;
-       struct ibv_recv_wr **wr_next = &wr_head;
-       struct ibv_recv_wr *wr_bad = NULL;
-       unsigned int i;
-       unsigned int pkts_ret = 0;
-       int ret;
-
-       ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
-       if (unlikely(ret == 0))
-               return 0;
-       if (unlikely(ret < 0)) {
-               DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
-                     (void *)rxq, ret);
-               return 0;
-       }
-       assert(ret <= (int)pkts_n);
-       /* For each work completion. */
-       for (i = 0; i != (unsigned int)ret; ++i) {
-               struct ibv_wc *wc = &wcs[i];
-               struct rxq_elt *elt = &(*elts)[elts_head];
-               struct ibv_recv_wr *wr = &elt->wr;
-               uint32_t len = wc->byte_len;
-               struct rte_mbuf *seg = elt->buf;
-               struct rte_mbuf *rep;
-
-               /* Sanity checks. */
-               assert(wr->sg_list == &elt->sge);
-               assert(wr->num_sge == 1);
-               assert(elts_head < rxq->elts_n);
-               assert(rxq->elts_head < rxq->elts_n);
-               /*
-                * Fetch initial bytes of packet descriptor into a
-                * cacheline while allocating rep.
-                */
-               rte_mbuf_prefetch_part1(seg);
-               rte_mbuf_prefetch_part2(seg);
-               /* Link completed WRs together for repost. */
-               *wr_next = wr;
-               wr_next = &wr->next;
-               if (unlikely(wc->status != IBV_WC_SUCCESS)) {
-                       /* Whatever, just repost the offending WR. */
-                       DEBUG("rxq=%p: bad work completion status (%d): %s",
-                             (void *)rxq, wc->status,
-                             ibv_wc_status_str(wc->status));
-                       /* Increment dropped packets counter. */
-                       ++rxq->stats.idropped;
-                       goto repost;
-               }
-               rep = rte_mbuf_raw_alloc(rxq->mp);
-               if (unlikely(rep == NULL)) {
-                       /*
-                        * Unable to allocate a replacement mbuf,
-                        * repost WR.
-                        */
-                       DEBUG("rxq=%p: can't allocate a new mbuf",
-                             (void *)rxq);
-                       /* Increase out of memory counters. */
-                       ++rxq->stats.rx_nombuf;
-                       ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-                       goto repost;
-               }
-               /* Reconfigure sge to use rep instead of seg. */
-               elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-               assert(elt->sge.lkey == rxq->mr->lkey);
-               elt->buf = rep;
-               /* Update seg information. */
-               seg->data_off = RTE_PKTMBUF_HEADROOM;
-               seg->nb_segs = 1;
-               seg->port = rxq->port_id;
-               seg->next = NULL;
-               seg->pkt_len = len;
-               seg->data_len = len;
-               seg->packet_type = 0;
-               seg->ol_flags = 0;
-               /* Return packet. */
-               *(pkts++) = seg;
-               ++pkts_ret;
-               /* Increase bytes counter. */
-               rxq->stats.ibytes += len;
-repost:
-               if (++elts_head >= elts_n)
-                       elts_head = 0;
-               continue;
-       }
-       if (unlikely(i == 0))
-               return 0;
-       /* Repost WRs. */
-       *wr_next = NULL;
-       assert(wr_head);
-       ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
-       if (unlikely(ret)) {
-               /* Inability to repost WRs is fatal. */
-               DEBUG("%p: recv_burst(): failed (ret=%d)",
-                     (void *)rxq->priv,
-                     ret);
-               abort();
-       }
-       rxq->elts_head = elts_head;
-       /* Increase packets counter. */
-       rxq->stats.ipackets += pkts_ret;
-       return pkts_ret;
-}
-
 /**
  * Allocate a Queue Pair.
  * Optionally setup inline receive if supported.
@@ -2031,56 +1609,6 @@ mlx4_dev_stop(struct rte_eth_dev *dev)
        priv_mac_addr_del(priv);
 }
 
-/**
- * Dummy DPDK callback for TX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_txq
- *   Generic pointer to TX queue structure.
- * @param[in] pkts
- *   Packets to transmit.
- * @param pkts_n
- *   Number of packets in array.
- *
- * @return
- *   Number of packets successfully transmitted (<= pkts_n).
- */
-static uint16_t
-removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-       (void)dpdk_txq;
-       (void)pkts;
-       (void)pkts_n;
-       return 0;
-}
-
-/**
- * Dummy DPDK callback for RX.
- *
- * This function is used to temporarily replace the real callback during
- * unsafe control operations on the queue, or in case of error.
- *
- * @param dpdk_rxq
- *   Generic pointer to RX queue structure.
- * @param[out] pkts
- *   Array to store received packets.
- * @param pkts_n
- *   Maximum number of packets in array.
- *
- * @return
- *   Number of packets successfully received (<= pkts_n).
- */
-static uint16_t
-removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
-{
-       (void)dpdk_rxq;
-       (void)pkts;
-       (void)pkts_n;
-       return 0;
-}
-
 /**
  * DPDK callback to close the device.
  *
@@ -2107,8 +1635,8 @@ mlx4_dev_close(struct rte_eth_dev *dev)
         * still required for DPDK 1.3 because some programs (such as testpmd)
         * never release them before closing the device.
         */
-       dev->rx_pkt_burst = removed_rx_burst;
-       dev->tx_pkt_burst = removed_tx_burst;
+       dev->rx_pkt_burst = mlx4_rx_burst_removed;
+       dev->tx_pkt_burst = mlx4_tx_burst_removed;
        if (priv->rxqs != NULL) {
                /* XXX race condition if mlx4_rx_burst() is still running. */
                usleep(1000);
@@ -2173,8 +1701,8 @@ priv_set_link(struct priv *priv, int up)
                err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
                if (err)
                        return err;
-               dev->rx_pkt_burst = removed_rx_burst;
-               dev->tx_pkt_burst = removed_tx_burst;
+               dev->rx_pkt_burst = mlx4_rx_burst_removed;
+               dev->tx_pkt_burst = mlx4_tx_burst_removed;
        }
        return 0;
 }
index edbece6..efccf1a 100644 (file)
@@ -49,6 +49,7 @@
 #include <rte_ethdev.h>
 #include <rte_ether.h>
 #include <rte_interrupts.h>
+#include <rte_mempool.h>
 
 /* Request send completion once in every 64 sends, might be less. */
 #define MLX4_PMD_TX_PER_COMP_REQ 64
@@ -115,6 +116,7 @@ struct priv {
 
 /* mlx4.c */
 
+struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
 int mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete);
 
 /* mlx4_intr.c */
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
new file mode 100644 (file)
index 0000000..b5e7777
--- /dev/null
@@ -0,0 +1,524 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file
+ * Data plane functions for mlx4 driver.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Verbs headers do not support -pedantic. */
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#include <infiniband/verbs.h>
+#ifdef PEDANTIC
+#pragma GCC diagnostic error "-Wpedantic"
+#endif
+
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_prefetch.h>
+
+#include "mlx4.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Manage Tx completions.
+ *
+ * When sending a burst, mlx4_tx_burst() posts several WRs.
+ * To improve performance, a completion event is only required once every
+ * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
+ * for other WRs, but this information would not be used anyway.
+ *
+ * Polls txq->cq for at most txq->elts_comp work completions. For every
+ * completion returned, txq->elts_comp is decremented by one and
+ * txq->elts_tail is advanced by txq->elts_comp_cd_init entries; a single
+ * subtraction of txq->elts_n brings the tail back into range afterwards.
+ * The queue fields are left untouched when nothing is pending, when no
+ * completion is available yet, or when polling fails.
+ *
+ * NOTE(review): wcs[] is a variable-length array declared before the
+ * elts_comp == 0 check, so its length can be zero on entry -- confirm this
+ * is acceptable for the supported compilers.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ *
+ * @return
+ *   0 on success (including when there was nothing to do), -1 when
+ *   ibv_poll_cq() returns a negative value.
+ */
+static int
+mlx4_txq_complete(struct txq *txq)
+{
+       unsigned int elts_comp = txq->elts_comp;
+       unsigned int elts_tail = txq->elts_tail;
+       const unsigned int elts_n = txq->elts_n;
+       struct ibv_wc wcs[elts_comp]; /* One slot per pending request. */
+       int wcs_n;
+
+       if (unlikely(elts_comp == 0))
+               return 0;
+       wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
+       if (unlikely(wcs_n == 0))
+               return 0;
+       if (unlikely(wcs_n < 0)) {
+               DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
+                     (void *)txq, wcs_n);
+               return -1;
+       }
+       elts_comp -= wcs_n;
+       assert(elts_comp <= txq->elts_comp); /* Catches unsigned wrap-around. */
+       /*
+        * Assume WC status is successful as nothing can be done about it
+        * anyway.
+        */
+       elts_tail += wcs_n * txq->elts_comp_cd_init;
+       if (elts_tail >= elts_n)
+               elts_tail -= elts_n;
+       txq->elts_tail = elts_tail;
+       txq->elts_comp = elts_comp;
+       return 0;
+}
+
+/**
+ * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
+ * the cloned mbuf is allocated is returned instead.
+ *
+ * Concretely: when RTE_MBUF_INDIRECT(buf) is true, the pool of the mbuf
+ * returned by rte_mbuf_from_indirect(buf) is used; otherwise buf->pool is
+ * returned as is. The indirect case is marked unlikely().
+ *
+ * mlx4_tx_burst() feeds the result to mlx4_txq_mp2mr() to obtain the lkey
+ * for the buffer being sent.
+ *
+ * @param buf
+ *   Pointer to mbuf.
+ *
+ * @return
+ *   Memory pool where data is located for given mbuf.
+ */
+static struct rte_mempool *
+mlx4_txq_mb2mp(struct rte_mbuf *buf)
+{
+       if (unlikely(RTE_MBUF_INDIRECT(buf)))
+               return rte_mbuf_from_indirect(buf)->pool;
+       return buf->pool;
+}
+
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
+ * remove an entry first.
+ *
+ * The table is scanned linearly from index 0; the scan stops at the first
+ * slot whose mp field is NULL (free slot) or at a slot matching mp (hit,
+ * cached lkey returned). On a miss a new MR is registered through
+ * mlx4_mp2mr() using txq->priv->pd. Only once that registration has
+ * succeeded, and only if no free slot was found, is the entry at index 0
+ * deregistered and the remaining entries shifted down by one so that the
+ * new association is stored in the last slot.
+ *
+ * NOTE(review): the final DEBUG() format pairs a "0x" prefix with PRIu32,
+ * which prints the lkey in decimal -- PRIx32 was presumably intended;
+ * confirm before changing the log output.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+       unsigned int i;
+       struct ibv_mr *mr;
+
+       for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+               if (unlikely(txq->mp2mr[i].mp == NULL)) {
+                       /* Unknown MP, add a new MR for it. */
+                       break;
+               }
+               if (txq->mp2mr[i].mp == mp) {
+                       assert(txq->mp2mr[i].lkey != (uint32_t)-1);
+                       assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+                       return txq->mp2mr[i].lkey;
+               }
+       }
+       /* Add a new entry, register MR first. */
+       DEBUG("%p: discovered new memory pool \"%s\" (%p)",
+             (void *)txq, mp->name, (void *)mp);
+       mr = mlx4_mp2mr(txq->priv->pd, mp);
+       if (unlikely(mr == NULL)) {
+               DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+                     (void *)txq);
+               return (uint32_t)-1;
+       }
+       if (unlikely(i == RTE_DIM(txq->mp2mr))) {
+               /* Table is full, remove oldest entry. */
+               DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
+                     (void *)txq);
+               --i; /* The new entry will occupy the last slot. */
+               claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+               memmove(&txq->mp2mr[0], &txq->mp2mr[1],
+                       (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
+       }
+       /* Store the new entry. */
+       txq->mp2mr[i].mp = mp;
+       txq->mp2mr[i].mr = mr;
+       txq->mp2mr[i].lkey = mr->lkey;
+       DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
+             (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
+       return txq->mp2mr[i].lkey;
+}
+
+/**
+ * DPDK callback for Tx.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to Tx queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+       struct txq *txq = (struct txq *)dpdk_txq;
+       struct ibv_send_wr *wr_head = NULL;
+       struct ibv_send_wr **wr_next = &wr_head;
+       struct ibv_send_wr *wr_bad = NULL;
+       unsigned int elts_head = txq->elts_head;
+       const unsigned int elts_n = txq->elts_n;
+       unsigned int elts_comp_cd = txq->elts_comp_cd;
+       unsigned int elts_comp = 0;
+       unsigned int i;
+       unsigned int max;
+       int err;
+
+       assert(elts_comp_cd != 0);
+       mlx4_txq_complete(txq);
+       max = (elts_n - (elts_head - txq->elts_tail));
+       if (max > elts_n)
+               max -= elts_n;
+       assert(max >= 1);
+       assert(max <= elts_n);
+       /* Always leave one free entry in the ring. */
+       --max;
+       if (max == 0)
+               return 0;
+       if (max > pkts_n)
+               max = pkts_n;
+       for (i = 0; (i != max); ++i) {
+               struct rte_mbuf *buf = pkts[i];
+               unsigned int elts_head_next =
+                       (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
+               struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
+               struct txq_elt *elt = &(*txq->elts)[elts_head];
+               struct ibv_send_wr *wr = &elt->wr;
+               unsigned int segs = buf->nb_segs;
+               unsigned int sent_size = 0;
+               uint32_t send_flags = 0;
+
+               /* Clean up old buffer. */
+               if (likely(elt->buf != NULL)) {
+                       struct rte_mbuf *tmp = elt->buf;
+
+#ifndef NDEBUG
+                       /* Poisoning. */
+                       memset(elt, 0x66, sizeof(*elt));
+#endif
+                       /* Faster than rte_pktmbuf_free(). */
+                       do {
+                               struct rte_mbuf *next = tmp->next;
+
+                               rte_pktmbuf_free_seg(tmp);
+                               tmp = next;
+                       } while (tmp != NULL);
+               }
+               /* Request Tx completion. */
+               if (unlikely(--elts_comp_cd == 0)) {
+                       elts_comp_cd = txq->elts_comp_cd_init;
+                       ++elts_comp;
+                       send_flags |= IBV_SEND_SIGNALED;
+               }
+               if (likely(segs == 1)) {
+                       struct ibv_sge *sge = &elt->sge;
+                       uintptr_t addr;
+                       uint32_t length;
+                       uint32_t lkey;
+
+                       /* Retrieve buffer information. */
+                       addr = rte_pktmbuf_mtod(buf, uintptr_t);
+                       length = buf->data_len;
+                       /* Retrieve memory region key for this memory pool. */
+                       lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+                       if (unlikely(lkey == (uint32_t)-1)) {
+                               /* MR does not exist. */
+                               DEBUG("%p: unable to get MP <-> MR"
+                                     " association", (void *)txq);
+                               /* Clean up Tx element. */
+                               elt->buf = NULL;
+                               goto stop;
+                       }
+                       /* Update element. */
+                       elt->buf = buf;
+                       if (txq->priv->vf)
+                               rte_prefetch0((volatile void *)
+                                             (uintptr_t)addr);
+                       RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+                       sge->addr = addr;
+                       sge->length = length;
+                       sge->lkey = lkey;
+                       sent_size += length;
+               } else {
+                       err = -1;
+                       goto stop;
+               }
+               if (sent_size <= txq->max_inline)
+                       send_flags |= IBV_SEND_INLINE;
+               elts_head = elts_head_next;
+               /* Increment sent bytes counter. */
+               txq->stats.obytes += sent_size;
+               /* Set up WR. */
+               wr->sg_list = &elt->sge;
+               wr->num_sge = segs;
+               wr->opcode = IBV_WR_SEND;
+               wr->send_flags = send_flags;
+               *wr_next = wr;
+               wr_next = &wr->next;
+       }
+stop:
+       /* Take a shortcut if nothing must be sent. */
+       if (unlikely(i == 0))
+               return 0;
+       /* Increment sent packets counter. */
+       txq->stats.opackets += i;
+       /* Ring QP doorbell. */
+       *wr_next = NULL;
+       assert(wr_head);
+       err = ibv_post_send(txq->qp, wr_head, &wr_bad);
+       if (unlikely(err)) {
+               uint64_t obytes = 0;
+               uint64_t opackets = 0;
+
+               /* Rewind bad WRs. */
+               while (wr_bad != NULL) {
+                       int j;
+
+                       /* Force completion request if one was lost. */
+                       if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
+                               elts_comp_cd = 1;
+                               --elts_comp;
+                       }
+                       ++opackets;
+                       for (j = 0; j < wr_bad->num_sge; ++j)
+                               obytes += wr_bad->sg_list[j].length;
+                       elts_head = (elts_head ? elts_head : elts_n) - 1;
+                       wr_bad = wr_bad->next;
+               }
+               txq->stats.opackets -= opackets;
+               txq->stats.obytes -= obytes;
+               i -= opackets;
+               DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
+                     " (%" PRIu64 " bytes) rejected: %s",
+                     (void *)txq,
+                     opackets,
+                     obytes,
+                     (err <= -1) ? "Internal error" : strerror(err));
+       }
+       txq->elts_head = elts_head;
+       txq->elts_comp += elts_comp;
+       txq->elts_comp_cd = elts_comp_cd;
+       return i;
+}
+
+/**
+ * DPDK callback for Rx.
+ *
+ * Polls the Rx completion queue for up to pkts_n work completions. For each
+ * successful completion, the received mbuf is handed over to the caller and
+ * a newly allocated one takes its place in the ring. Completions reporting
+ * an error, or for which no replacement mbuf could be allocated, only update
+ * the queue statistics and get their WR reposted with its original buffer.
+ * All processed WRs are reposted through a single ibv_post_recv() call.
+ *
+ * Inability to repost WRs is considered fatal and ends with abort().
+ *
+ * The following function doesn't manage scattered packets.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to Rx queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+       struct rxq *rxq = (struct rxq *)dpdk_rxq;
+       struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
+       const unsigned int elts_n = rxq->elts_n;
+       unsigned int elts_head = rxq->elts_head;
+       /*
+        * The size of a variable length array must be greater than zero,
+        * keep at least one entry even when pkts_n is 0.
+        */
+       struct ibv_wc wcs[pkts_n ? pkts_n : 1];
+       struct ibv_recv_wr *wr_head = NULL;
+       struct ibv_recv_wr **wr_next = &wr_head;
+       struct ibv_recv_wr *wr_bad = NULL;
+       unsigned int i;
+       unsigned int pkts_ret = 0;
+       int ret;
+
+       /* Retrieve at most pkts_n work completions. */
+       ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
+       if (unlikely(ret == 0))
+               return 0;
+       if (unlikely(ret < 0)) {
+               DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
+                     (void *)rxq, ret);
+               return 0;
+       }
+       assert(ret <= (int)pkts_n);
+       /* For each work completion. */
+       for (i = 0; i != (unsigned int)ret; ++i) {
+               struct ibv_wc *wc = &wcs[i];
+               struct rxq_elt *elt = &(*elts)[elts_head];
+               struct ibv_recv_wr *wr = &elt->wr;
+               uint32_t len = wc->byte_len;
+               struct rte_mbuf *seg = elt->buf;
+               struct rte_mbuf *rep;
+
+               /* Sanity checks. */
+               assert(wr->sg_list == &elt->sge);
+               assert(wr->num_sge == 1);
+               assert(elts_head < rxq->elts_n);
+               assert(rxq->elts_head < rxq->elts_n);
+               /*
+                * Fetch initial bytes of packet descriptor into a
+                * cacheline while allocating rep.
+                */
+               rte_mbuf_prefetch_part1(seg);
+               rte_mbuf_prefetch_part2(seg);
+               /*
+                * Link completed WRs together for repost. This happens
+                * before any error check, hence every completed WR is
+                * reposted regardless of its outcome.
+                */
+               *wr_next = wr;
+               wr_next = &wr->next;
+               if (unlikely(wc->status != IBV_WC_SUCCESS)) {
+                       /* Whatever, just repost the offending WR. */
+                       DEBUG("rxq=%p: bad work completion status (%d): %s",
+                             (void *)rxq, wc->status,
+                             ibv_wc_status_str(wc->status));
+                       /* Increment dropped packets counter. */
+                       ++rxq->stats.idropped;
+                       goto repost;
+               }
+               rep = rte_mbuf_raw_alloc(rxq->mp);
+               if (unlikely(rep == NULL)) {
+                       /*
+                        * Unable to allocate a replacement mbuf,
+                        * repost WR with its current buffer, the
+                        * received packet is not returned.
+                        */
+                       DEBUG("rxq=%p: can't allocate a new mbuf",
+                             (void *)rxq);
+                       /* Increase out of memory counters. */
+                       ++rxq->stats.rx_nombuf;
+                       ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
+                       goto repost;
+               }
+               /* Reconfigure sge to use rep instead of seg. */
+               elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
+               assert(elt->sge.lkey == rxq->mr->lkey);
+               elt->buf = rep;
+               /* Update seg information. */
+               seg->data_off = RTE_PKTMBUF_HEADROOM;
+               seg->nb_segs = 1;
+               seg->port = rxq->port_id;
+               seg->next = NULL;
+               seg->pkt_len = len;
+               seg->data_len = len;
+               seg->packet_type = 0;
+               seg->ol_flags = 0;
+               /* Return packet. */
+               *(pkts++) = seg;
+               ++pkts_ret;
+               /* Increase bytes counter. */
+               rxq->stats.ibytes += len;
+repost:
+               /* Move on to the next ring element, wrapping if needed. */
+               if (++elts_head >= elts_n)
+                       elts_head = 0;
+               continue;
+       }
+       /* Defensive check, ret is at least 1 at this point. */
+       if (unlikely(i == 0))
+               return 0;
+       /* Repost WRs. */
+       *wr_next = NULL;
+       assert(wr_head);
+       ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
+       if (unlikely(ret)) {
+               /* Inability to repost WRs is fatal. */
+               DEBUG("%p: recv_burst(): failed (ret=%d)",
+                     (void *)rxq->priv,
+                     ret);
+               abort();
+       }
+       rxq->elts_head = elts_head;
+       /* Increase packets counter. */
+       rxq->stats.ipackets += pkts_ret;
+       return pkts_ret;
+}
+
+/**
+ * Placeholder DPDK callback for Tx.
+ *
+ * Stands in for the real Tx callback while unsafe control operations are
+ * performed on the queue, or after an error. It never sends anything.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to Tx queue structure (ignored).
+ * @param[in] pkts
+ *   Packets to transmit (ignored).
+ * @param pkts_n
+ *   Number of packets in array (ignored).
+ *
+ * @return
+ *   Always 0, as no packet is ever transmitted.
+ */
+uint16_t
+mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+       const uint16_t sent = 0;
+
+       /* Arguments are deliberately unused. */
+       (void)pkts_n;
+       (void)pkts;
+       (void)dpdk_txq;
+       return sent;
+}
+
+/**
+ * Placeholder DPDK callback for Rx.
+ *
+ * Stands in for the real Rx callback while unsafe control operations are
+ * performed on the queue, or after an error. It never returns any packet.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to Rx queue structure (ignored).
+ * @param[out] pkts
+ *   Array to store received packets (left untouched).
+ * @param pkts_n
+ *   Maximum number of packets in array (ignored).
+ *
+ * @return
+ *   Always 0, as no packet is ever received.
+ */
+uint16_t
+mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+       const uint16_t received = 0;
+
+       /* Arguments are deliberately unused. */
+       (void)pkts_n;
+       (void)pkts;
+       (void)dpdk_rxq;
+       return received;
+}
index ea55aed..669c8a4 100644 (file)
@@ -119,4 +119,16 @@ struct txq {
        unsigned int socket; /**< CPU socket ID for allocations. */
 };
 
+/* mlx4_rxtx.c: Tx MP to MR lookup, Rx/Tx burst callbacks and dummies. */
+
+uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
+uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
+                      uint16_t pkts_n);
+uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
+                      uint16_t pkts_n);
+uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
+                              uint16_t pkts_n);
+uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+                              uint16_t pkts_n);
+
 #endif /* MLX4_RXTX_H_ */