/*-
 * Copyright 2017 6WIND S.A.
 * Copyright 2017 Mellanox
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 * * Neither the name of 6WIND S.A. nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * @file
 * Data plane functions for mlx4 driver.
 */

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <string.h>

/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>

#include "mlx4.h"
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"

/**
 * Manage Tx completions.
 *
 * When sending a burst, mlx4_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mlx4_txq_complete(struct txq *txq)
{
	unsigned int elts_comp = txq->elts_comp;
	unsigned int elts_tail = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	struct ibv_wc wcs[elts_comp];
	int wcs_n;

	if (unlikely(elts_comp == 0))
		return 0;
	wcs_n = ibv_poll_cq(txq->cq, elts_comp, wcs);
	if (unlikely(wcs_n == 0))
		return 0;
	if (unlikely(wcs_n < 0)) {
		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
		      (void *)txq, wcs_n);
		return -1;
	}
	elts_comp -= wcs_n;
	assert(elts_comp <= txq->elts_comp);
	/*
	 * Assume WC status is successful as nothing can be done about it
	 * anyway.
	 */
	elts_tail += wcs_n * txq->elts_comp_cd_init;
	if (elts_tail >= elts_n)
		elts_tail -= elts_n;
	txq->elts_tail = elts_tail;
	txq->elts_comp = elts_comp;
	return 0;
}
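
/*
 * Note on the arithmetic above: because only one WR in every
 * txq->elts_comp_cd_init sends is signaled, each polled completion stands
 * for a whole batch. For instance, if elts_comp_cd_init were 64, polling
 * two completions (wcs_n == 2) would advance elts_tail by 128 ring entries.
 */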

/**
 * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
 * the cloned mbuf is allocated is returned instead.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
mlx4_txq_mb2mp(struct rte_mbuf *buf)
{
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		return rte_mbuf_from_indirect(buf)->pool;
	return buf->pool;
}
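
/*
 * Note: for an indirect mbuf, the data actually lives in the direct mbuf it
 * is attached to, so the pool returned above is the one whose MR lkey must
 * be used for the Tx SGE pointing at that data.
 */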

/**
 * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param[in] mp
 *   Memory pool for which a memory region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
uint32_t
mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;
	struct ibv_mr *mr;

	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
			return txq->mp2mr[i].lkey;
		}
	}
	/* Add a new entry, register MR first. */
	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
	      (void *)txq, mp->name, (void *)mp);
	mr = mlx4_mp2mr(txq->priv->pd, mp);
	if (unlikely(mr == NULL)) {
		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
		      (void *)txq);
		return (uint32_t)-1;
	}
	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
		/* Table is full, remove oldest entry. */
		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
		      (void *)txq);
		--i;
		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
	}
	/* Store the new entry. */
	txq->mp2mr[i].mp = mp;
	txq->mp2mr[i].mr = mr;
	txq->mp2mr[i].lkey = mr->lkey;
	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
	return txq->mp2mr[i].lkey;
}
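
/*
 * Note: mp2mr[] behaves as a small cache filled in insertion order. When
 * every slot is taken, the entry at index 0 (the oldest registration) is
 * deregistered and the rest are shifted down, so the new MR always lands in
 * the last slot. For a hypothetical four-entry table holding pools A, B, C
 * and D, registering pool E drops A and leaves B, C, D, E.
 */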

/**
 * DPDK callback for Tx.
 *
 * @param dpdk_txq
 *   Generic pointer to Tx queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	struct ibv_send_wr *wr_head = NULL;
	struct ibv_send_wr **wr_next = &wr_head;
	struct ibv_send_wr *wr_bad = NULL;
	unsigned int elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	unsigned int elts_comp_cd = txq->elts_comp_cd;
	unsigned int elts_comp = 0;
	unsigned int i;
	unsigned int max;
	int err;

	assert(elts_comp_cd != 0);
	mlx4_txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	assert(max <= elts_n);
	/* Always leave one free entry in the ring. */
	--max;
	if (max == 0)
		return 0;
	if (max > pkts_n)
		max = pkts_n;
	for (i = 0; (i != max); ++i) {
		struct rte_mbuf *buf = pkts[i];
		unsigned int elts_head_next =
			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
		struct txq_elt *elt = &(*txq->elts)[elts_head];
		struct ibv_send_wr *wr = &elt->wr;
		unsigned int segs = buf->nb_segs;
		unsigned int sent_size = 0;
		uint32_t send_flags = 0;

		/* Clean up old buffer. */
		if (likely(elt->buf != NULL)) {
			struct rte_mbuf *tmp = elt->buf;

#ifndef NDEBUG
			/* Poisoning. */
			memset(elt, 0x66, sizeof(*elt));
#endif
			/* Faster than rte_pktmbuf_free(). */
			do {
				struct rte_mbuf *next = tmp->next;

				rte_pktmbuf_free_seg(tmp);
				tmp = next;
			} while (tmp != NULL);
		}
		/* Request Tx completion. */
		if (unlikely(--elts_comp_cd == 0)) {
			elts_comp_cd = txq->elts_comp_cd_init;
			++elts_comp;
			send_flags |= IBV_SEND_SIGNALED;
		}
		if (likely(segs == 1)) {
			struct ibv_sge *sge = &elt->sge;
			uintptr_t addr;
			uint32_t length;
			uint32_t lkey;

			/* Retrieve buffer information. */
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			length = buf->data_len;
			/* Retrieve memory region key for this memory pool. */
			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
			if (unlikely(lkey == (uint32_t)-1)) {
				/* MR does not exist. */
				DEBUG("%p: unable to get MP <-> MR"
				      " association", (void *)txq);
				/* Clean up Tx element. */
				elt->buf = NULL;
				goto stop;
			}
			/* Update element. */
			elt->buf = buf;
			rte_prefetch0((volatile void *)(uintptr_t)addr);
			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
			sge->addr = addr;
			sge->length = length;
			sge->lkey = lkey;
			sent_size += length;
		} else {
			/* Multi-segment packets are not supported. */
			goto stop;
		}
		if (sent_size <= txq->max_inline)
			send_flags |= IBV_SEND_INLINE;
		elts_head = elts_head_next;
		/* Increment sent bytes counter. */
		txq->stats.obytes += sent_size;
		/* Set up WR. */
		wr->sg_list = &elt->sge;
		wr->num_sge = segs;
		wr->opcode = IBV_WR_SEND;
		wr->send_flags = send_flags;
		*wr_next = wr;
		wr_next = &wr->next;
	}
stop:
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
	/* Ring QP doorbell. */
	*wr_next = NULL;
	err = ibv_post_send(txq->qp, wr_head, &wr_bad);
	if (unlikely(err)) {
		uint64_t obytes = 0;
		uint64_t opackets = 0;

		/* Rewind bad WRs. */
		while (wr_bad != NULL) {
			int j;

			/* Force completion request if one was lost. */
			if (wr_bad->send_flags & IBV_SEND_SIGNALED) {
				elts_comp_cd = 1;
				--elts_comp;
			}
			++opackets;
			for (j = 0; j < wr_bad->num_sge; ++j)
				obytes += wr_bad->sg_list[j].length;
			elts_head = (elts_head ? elts_head : elts_n) - 1;
			wr_bad = wr_bad->next;
		}
		txq->stats.opackets -= opackets;
		txq->stats.obytes -= obytes;
		i -= opackets;
		DEBUG("%p: ibv_post_send() failed, %" PRIu64 " packets"
		      " (%" PRIu64 " bytes) rejected: %s",
		      (void *)txq, opackets, obytes,
		      (err <= -1) ? "Internal error" : strerror(err));
	}
	txq->elts_head = elts_head;
	txq->elts_comp += elts_comp;
	txq->elts_comp_cd = elts_comp_cd;
	return i;
}
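
/*
 * Illustrative sketch (not part of the driver): applications never call
 * mlx4_tx_burst() directly; it is registered as the device's tx_pkt_burst
 * handler and reached through the generic wrapper, e.g.:
 *
 *	uint16_t sent = rte_eth_tx_burst(port_id, queue_id, pkts, nb_pkts);
 *
 * where pkts holds nb_pkts mbuf pointers, sent <= nb_pkts, and port_id and
 * queue_id are whatever the application configured.
 */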

/**
 * DPDK callback for Rx.
 *
 * The following function doesn't manage scattered packets.
 *
 * @param dpdk_rxq
 *   Generic pointer to Rx queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	struct ibv_wc wcs[pkts_n];
	struct ibv_recv_wr *wr_head = NULL;
	struct ibv_recv_wr **wr_next = &wr_head;
	struct ibv_recv_wr *wr_bad = NULL;
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
	if (unlikely(ret == 0))
		return 0;
	if (unlikely(ret < 0)) {
		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
		      (void *)rxq, ret);
		return 0;
	}
	assert(ret <= (int)pkts_n);
	/* For each work completion. */
	for (i = 0; i != (unsigned int)ret; ++i) {
		struct ibv_wc *wc = &wcs[i];
		struct rxq_elt *elt = &(*elts)[elts_head];
		struct ibv_recv_wr *wr = &elt->wr;
		uint32_t len = wc->byte_len;
		struct rte_mbuf *seg = elt->buf;
		struct rte_mbuf *rep;

		/* Sanity checks. */
		assert(wr->sg_list == &elt->sge);
		assert(wr->num_sge == 1);
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		/*
		 * Fetch initial bytes of packet descriptor into a
		 * cacheline while allocating rep.
		 */
		rte_mbuf_prefetch_part1(seg);
		rte_mbuf_prefetch_part2(seg);
		/* Link completed WRs together for repost. */
		*wr_next = wr;
		wr_next = &wr->next;
		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
			/* Whatever, just repost the offending WR. */
			DEBUG("rxq=%p: bad work completion status (%d): %s",
			      (void *)rxq, wc->status,
			      ibv_wc_status_str(wc->status));
			/* Increment dropped packets counter. */
			++rxq->stats.idropped;
			goto repost;
		}
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			/* Unable to allocate a replacement mbuf, repost WR. */
			DEBUG("rxq=%p: can't allocate a new mbuf",
			      (void *)rxq);
			/* Increase out of memory counters. */
			++rxq->stats.rx_nombuf;
			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
			goto repost;
		}
		/* Reconfigure sge to use rep instead of seg. */
		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
		assert(elt->sge.lkey == rxq->mr->lkey);
		elt->buf = rep;
		/* Update seg information. */
		seg->data_off = RTE_PKTMBUF_HEADROOM;
		seg->nb_segs = 1;
		seg->port = rxq->port_id;
		seg->next = NULL;
		seg->pkt_len = len;
		seg->data_len = len;
		seg->packet_type = 0;
		/* Return packet to the application. */
		*(pkts++) = seg;
		++pkts_ret;
		/* Increase bytes counter. */
		rxq->stats.ibytes += len;
repost:
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	/* Repost handled WRs. */
	*wr_next = NULL;
	ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
	if (unlikely(ret)) {
		/* Inability to repost WRs is fatal. */
		DEBUG("%p: recv_burst(): failed (ret=%d)",
		      (void *)rxq->priv, ret);
		abort();
	}
	rxq->elts_head = elts_head;
	/* Increase packets counter. */
	rxq->stats.ipackets += pkts_ret;
	return pkts_ret;
}
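
/*
 * Note on the Rx path above: every polled completion has its WR linked for
 * reposting regardless of outcome. A buffer is handed to the application
 * only when a replacement mbuf can be allocated for its ring entry; on a
 * bad completion status or an allocation failure the old buffer is simply
 * recycled, so the receive ring always stays fully populated.
 */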

/**
 * Dummy DPDK callback for Tx.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to Tx queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for Rx.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to Rx queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}