4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
42 #pragma GCC diagnostic ignored "-pedantic"
44 #include <infiniband/verbs.h>
46 #pragma GCC diagnostic error "-pedantic"
49 /* DPDK headers don't like -pedantic. */
51 #pragma GCC diagnostic ignored "-pedantic"
54 #include <rte_mempool.h>
55 #include <rte_prefetch.h>
56 #include <rte_common.h>
57 #include <rte_branch_prediction.h>
59 #pragma GCC diagnostic error "-pedantic"
63 #include "mlx5_utils.h"
64 #include "mlx5_rxtx.h"
65 #include "mlx5_defs.h"
68 * Manage TX completions.
70 * When sending a burst, mlx5_tx_burst() posts several WRs.
71 * To improve performance, a completion event is only required once every
72 * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
73 * for other WRs, but this information would not be used anyway.
76 * Pointer to TX queue structure.
79 * 0 on success, -1 on failure.
82 txq_complete(struct txq *txq)
/* Drain pending TX completion events so their ring slots become reusable. */
84 unsigned int elts_comp = txq->elts_comp;
85 unsigned int elts_tail = txq->elts_tail;
86 const unsigned int elts_n = txq->elts_n;
/* Fast exit: no completion events are currently outstanding. */
89 if (unlikely(elts_comp == 0))
92 DEBUG("%p: processing %u work requests completions",
93 (void *)txq, elts_comp);
/* Poll the CQ for at most elts_comp completion events (count only). */
95 wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp);
96 if (unlikely(wcs_n == 0))
98 if (unlikely(wcs_n < 0)) {
99 DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
104 assert(elts_comp <= txq->elts_comp);
106 * Assume WC status is successful as nothing can be done about it
/* Each completion event covers elts_comp_cd_init sends; advance the
 * consumer index by that many entries per event. */
109 elts_tail += wcs_n * txq->elts_comp_cd_init;
/* Wrap the tail index at the ring boundary. */
110 if (elts_tail >= elts_n)
/* Publish the updated ring state back to the queue structure. */
112 txq->elts_tail = elts_tail;
113 txq->elts_comp = elts_comp;
118 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
119 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
120 * remove an entry first.
123 * Pointer to TX queue structure.
125 * Memory Pool for which a Memory Region lkey must be returned.
128 * mr->lkey on success, (uint32_t)-1 on failure.
131 txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
/* Linear scan of the per-queue MR cache; the table is small, so O(n)
 * lookup is cheaper than a hash here. */
136 for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
137 if (unlikely(txq->mp2mr[i].mp == NULL)) {
138 /* Unknown MP, add a new MR for it. */
141 if (txq->mp2mr[i].mp == mp) {
142 assert(txq->mp2mr[i].lkey != (uint32_t)-1);
143 assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
/* Cache hit: return the lkey recorded at registration time. */
144 return txq->mp2mr[i].lkey;
147 /* Add a new entry, register MR first. */
148 DEBUG("%p: discovered new memory pool %p", (void *)txq, (void *)mp);
/* Register the mempool's whole element area as a single MR so every
 * mbuf from this pool can be addressed via one lkey. */
149 mr = ibv_reg_mr(txq->priv->pd,
150 (void *)mp->elt_va_start,
151 (mp->elt_va_end - mp->elt_va_start),
152 (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE));
153 if (unlikely(mr == NULL)) {
154 DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
158 if (unlikely(i == RTE_DIM(txq->mp2mr))) {
159 /* Table is full, remove oldest entry. */
160 DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
/* Deregister the evicted MR, then shift the remaining entries down
 * one slot to free the last position. */
163 claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr));
164 memmove(&txq->mp2mr[0], &txq->mp2mr[1],
165 (sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
167 /* Store the new entry. */
168 txq->mp2mr[i].mp = mp;
169 txq->mp2mr[i].mr = mr;
170 txq->mp2mr[i].lkey = mr->lkey;
/* NOTE(review): "0x%08" followed by PRIu32 prints a *decimal* value after
 * a hex prefix — PRIx32 looks intended; confirm and fix the format. */
171 DEBUG("%p: new MR lkey for MP %p: 0x%08" PRIu32,
172 (void *)txq, (void *)mp, txq->mp2mr[i].lkey)
173 return txq->mp2mr[i].lkey;
177 * DPDK callback for TX.
180 * Generic pointer to TX queue structure.
182 * Packets to transmit.
184 * Number of packets in array.
187 * Number of packets successfully transmitted (<= pkts_n).
190 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
192 struct txq *txq = (struct txq *)dpdk_txq;
193 unsigned int elts_head = txq->elts_head;
194 const unsigned int elts_tail = txq->elts_tail;
195 const unsigned int elts_n = txq->elts_n;
/* Countdown until the next WR must request a completion event. */
196 unsigned int elts_comp_cd = txq->elts_comp_cd;
197 unsigned int elts_comp = 0;
202 assert(elts_comp_cd != 0);
/* Number of ring entries currently free for this burst. */
204 max = (elts_n - (elts_head - elts_tail));
208 assert(max <= elts_n);
209 /* Always leave one free entry in the ring. */
215 for (i = 0; (i != max); ++i) {
216 struct rte_mbuf *buf = pkts[i];
/* Next head index, wrapping at the ring boundary. */
217 unsigned int elts_head_next =
218 (((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
219 struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
220 struct txq_elt *elt = &(*txq->elts)[elts_head];
221 unsigned int segs = NB_SEGS(buf);
222 uint32_t send_flags = 0;
224 /* Clean up old buffer. */
225 if (likely(elt->buf != NULL)) {
226 struct rte_mbuf *tmp = elt->buf;
228 /* Faster than rte_pktmbuf_free(). */
/* Walk the previously-sent chain, freeing one segment at a time. */
230 struct rte_mbuf *next = NEXT(tmp);
232 rte_pktmbuf_free_seg(tmp);
234 } while (tmp != NULL);
236 /* Request TX completion. */
237 if (unlikely(--elts_comp_cd == 0)) {
/* Reload the countdown and mark this WR as signaled. */
238 elts_comp_cd = txq->elts_comp_cd_init;
240 send_flags |= IBV_EXP_QP_BURST_SIGNALED;
/* Fast path: single-segment packet. */
242 if (likely(segs == 1)) {
247 /* Retrieve buffer information. */
248 addr = rte_pktmbuf_mtod(buf, uintptr_t);
249 length = DATA_LEN(buf);
250 /* Retrieve Memory Region key for this memory pool. */
251 lkey = txq_mp2mr(txq, buf->pool);
252 if (unlikely(lkey == (uint32_t)-1)) {
253 /* MR does not exist. */
254 DEBUG("%p: unable to get MP <-> MR"
255 " association", (void *)txq);
256 /* Clean up TX element. */
260 /* Update element. */
/* Prefetch the next element's mbuf so its cleanup is cheap on the
 * following iteration. */
263 rte_prefetch0((volatile void *)
265 RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
266 /* Put packet into send queue. */
267 #if MLX5_PMD_MAX_INLINE > 0
/* Small payloads are copied inline into the WQE, avoiding a DMA read. */
268 if (length <= txq->max_inline)
269 err = txq->if_qp->send_pending_inline
276 err = txq->if_qp->send_pending
285 DEBUG("%p: TX scattered buffers support not"
286 " compiled in", (void *)txq);
289 elts_head = elts_head_next;
292 /* Take a shortcut if nothing must be sent. */
293 if (unlikely(i == 0))
295 /* Ring QP doorbell. */
296 err = txq->if_qp->send_flush(txq->qp);
298 /* A nonzero value is not supposed to be returned.
299 * Nothing can be done about it. */
300 DEBUG("%p: send_flush() failed with error %d",
/* Publish the updated producer state back to the queue structure. */
303 txq->elts_head = elts_head;
304 txq->elts_comp += elts_comp;
305 txq->elts_comp_cd = elts_comp_cd;
310 * DPDK callback for RX.
313 * Generic pointer to RX queue structure.
315 * Array to store received packets.
317 * Maximum number of packets in array.
320 * Number of packets successfully received (<= pkts_n).
323 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
325 struct rxq *rxq = (struct rxq *)dpdk_rxq;
326 struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
327 const unsigned int elts_n = rxq->elts_n;
328 unsigned int elts_head = rxq->elts_head;
/* SGEs collected across the burst for a single bulk repost at the end. */
329 struct ibv_sge sges[pkts_n];
331 unsigned int pkts_ret = 0;
334 for (i = 0; (i != pkts_n); ++i) {
335 struct rxq_elt *elt = &(*elts)[elts_head];
336 struct ibv_recv_wr *wr = &elt->wr;
337 uint64_t wr_id = wr->wr_id;
/* Recover the mbuf pointer from the SGE address minus the offset
 * recorded in the WR id at post time. */
339 struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr -
340 WR_ID(wr_id).offset);
341 struct rte_mbuf *rep;
345 assert(WR_ID(wr_id).id < rxq->elts_n);
346 assert(wr->sg_list == &elt->sge);
347 assert(wr->num_sge == 1);
348 assert(elts_head < rxq->elts_n);
349 assert(rxq->elts_head < rxq->elts_n);
351 * Fetch initial bytes of packet descriptor into a
352 * cacheline while allocating rep.
355 rte_prefetch0(&seg->cacheline1);
/* Fast CQ poll: returns packet length and flags for the next CQE. */
356 ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
358 if (unlikely(ret < 0)) {
362 DEBUG("rxq=%p, poll_length() failed (ret=%d)",
364 /* ibv_poll_cq() must be used in case of failure. */
365 wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
366 if (unlikely(wcs_n == 0))
368 if (unlikely(wcs_n < 0)) {
369 DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
374 if (unlikely(wc.status != IBV_WC_SUCCESS)) {
375 /* Whatever, just repost the offending WR. */
376 DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
377 " completion status (%d): %s",
378 (void *)rxq, wc.wr_id, wc.status,
379 ibv_wc_status_str(wc.status));
380 /* Add SGE to array for repost. */
/* Allocate a replacement mbuf before handing seg to the application. */
389 rep = __rte_mbuf_raw_alloc(rxq->mp);
390 if (unlikely(rep == NULL)) {
392 * Unable to allocate a replacement mbuf,
395 DEBUG("rxq=%p, wr_id=%" PRIu32 ":"
396 " can't allocate a new mbuf",
397 (void *)rxq, WR_ID(wr_id).id);
398 /* Increment out of memory counters. */
399 ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
403 /* Reconfigure sge to use rep instead of seg. */
404 elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
405 assert(elt->sge.lkey == rxq->mr->lkey);
406 WR_ID(wr->wr_id).offset =
407 (((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) -
409 assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id);
411 /* Add SGE to array for repost. */
414 /* Update seg information. */
415 SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
417 PORT(seg) = rxq->port_id;
/* Advance the ring head, wrapping at the ring boundary. */
426 if (++elts_head >= elts_n)
430 if (unlikely(i == 0))
434 DEBUG("%p: reposting %u WRs", (void *)rxq, i);
/* Repost every consumed descriptor in one burst call. */
436 ret = rxq->if_qp->recv_burst(rxq->qp, sges, i);
438 /* Inability to repost WRs is fatal. */
439 DEBUG("%p: recv_burst(): failed (ret=%d)",
444 rxq->elts_head = elts_head;
449 * Dummy DPDK callback for TX.
451 * This function is used to temporarily replace the real callback during
452 * unsafe control operations on the queue, or in case of error.
455 * Generic pointer to TX queue structure.
457 * Packets to transmit.
459 * Number of packets in array.
462 * Number of packets successfully transmitted (<= pkts_n).
/* NOTE(review): body not visible in this chunk — per the contract above it
 * should ignore its arguments and report zero packets sent; confirm. */
465 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
474 * Dummy DPDK callback for RX.
476 * This function is used to temporarily replace the real callback during
477 * unsafe control operations on the queue, or in case of error.
480 * Generic pointer to RX queue structure.
482 * Array to store received packets.
484 * Maximum number of packets in array.
487 * Number of packets successfully received (<= pkts_n).
/* NOTE(review): body not visible in this chunk — per the contract above it
 * should ignore its arguments and report zero packets received; confirm. */
490 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)