/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2016 6WIND S.A.
 * Copyright 2016 Mellanox Technologies, Ltd
 */

#include <rte_eal_memconfig.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_rwlock.h>
#include <rte_bus_pci.h>

#include <mlx5_common_mp.h>
#include <mlx5_common_mr.h>

#include "mlx5.h"
#include "mlx5_mr.h"
#include "mlx5_rxtx.h"

struct mr_find_contig_memsegs_data {
	uintptr_t addr;
	uintptr_t start;
	uintptr_t end;
	const struct rte_memseg_list *msl;
};

struct mr_update_mp_data {
	struct rte_eth_dev *dev;
	struct mlx5_mr_ctrl *mr_ctrl;
	int ret;
};

/**
 * Callback for memory free event. Iterate freed memsegs and check whether
 * each belongs to an existing MR. If found, clear the bit from the bitmap of
 * the MR. As a result, the MR would be fragmented. If it becomes empty, the
 * MR will be freed later by mlx5_mr_garbage_collect(). Even if this callback
 * is called from a secondary process, the garbage collector will be called
 * in the primary process, as the secondary process can't call
 * mlx5_mr_create().
 *
 * The global cache must be rebuilt if there's any change and this event has
 * to be propagated to dataplane threads to flush the local caches.
 *
 * @param sh
 *   Pointer to the Ethernet device shared context.
 * @param addr
 *   Address of freed memory.
 * @param len
 *   Size of freed memory.
 */
static void
mlx5_mr_mem_event_free_cb(struct mlx5_dev_ctx_shared *sh,
			  const void *addr, size_t len)
{
	const struct rte_memseg_list *msl;
	struct mlx5_mr *mr;
	int ms_n;
	int i;
	int rebuild = 0;

	DRV_LOG(DEBUG, "device %s free callback: addr=%p, len=%zu",
		sh->ibdev_name, addr, len);
	msl = rte_mem_virt2memseg_list(addr);
	/* addr and len must be page-aligned. */
	MLX5_ASSERT((uintptr_t)addr ==
		    RTE_ALIGN((uintptr_t)addr, msl->page_sz));
	MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
	ms_n = len / msl->page_sz;
	rte_rwlock_write_lock(&sh->share_cache.rwlock);
	/* Clear bits of freed memsegs from MR. */
	for (i = 0; i < ms_n; ++i) {
		const struct rte_memseg *ms;
		struct mr_cache_entry entry;
		uintptr_t start;
		int ms_idx;
		uint32_t pos;

		/* Find MR having this memseg. */
		start = (uintptr_t)addr + i * msl->page_sz;
		mr = mlx5_mr_lookup_list(&sh->share_cache, &entry, start);
		if (mr == NULL)
			continue;
		MLX5_ASSERT(mr->msl); /* Can't be external memory. */
		ms = rte_mem_virt2memseg((void *)start, msl);
		MLX5_ASSERT(ms != NULL);
		MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
		ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
		pos = ms_idx - mr->ms_base_idx;
		MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos));
		MLX5_ASSERT(pos < mr->ms_bmp_n);
		DRV_LOG(DEBUG, "device %s MR(%p): clear bitmap[%u] for addr %p",
			sh->ibdev_name, (void *)mr, pos, (void *)start);
		rte_bitmap_clear(mr->ms_bmp, pos);
		if (--mr->ms_n == 0) {
			LIST_REMOVE(mr, mr);
			LIST_INSERT_HEAD(&sh->share_cache.mr_free_list, mr, mr);
			DRV_LOG(DEBUG, "device %s remove MR(%p) from list",
				sh->ibdev_name, (void *)mr);
		}
		/*
		 * MR is fragmented or will be freed, the global cache must be
		 * rebuilt.
		 */
		rebuild = 1;
	}
	if (rebuild) {
		mlx5_mr_rebuild_cache(&sh->share_cache);
		/*
		 * Flush local caches by propagating invalidation across cores.
		 * rte_smp_wmb() is enough to synchronize this event. If one of
		 * the freed memsegs is seen by another core, that means the
		 * memseg has been re-allocated by the allocator, which happens
		 * after this free call. Therefore, this store instruction
		 * (incrementing the generation below) is guaranteed to be seen
		 * by other cores before they see the newly allocated memory.
		 */
		++sh->share_cache.dev_gen;
		DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d",
			sh->share_cache.dev_gen);
		rte_smp_wmb();
	}
	rte_rwlock_write_unlock(&sh->share_cache.rwlock);
}
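
/*
 * Note: an illustrative sketch (not a verbatim copy) of how dataplane
 * threads consume the generation bump above. The inline fast path in
 * mlx5_rxtx.h compares the per-queue cached generation against the shared
 * one and drops the local cache on mismatch, roughly:
 *
 *	if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen))
 *		mlx5_mr_flush_local_cache(mr_ctrl);
 */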

/**
 * Callback for memory event. This can be called from both the primary and
 * secondary processes.
 *
 * @param event_type
 *   Memory event type.
 * @param addr
 *   Address of memory.
 * @param len
 *   Size of memory.
 */
void
mlx5_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
		     size_t len, void *arg __rte_unused)
{
	struct mlx5_dev_ctx_shared *sh;
	struct mlx5_dev_list *dev_list = &mlx5_shared_data->mem_event_cb_list;

	/* Must be called from the primary process. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	switch (event_type) {
	case RTE_MEM_EVENT_FREE:
		rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
		/* Iterate all the existing mlx5 devices. */
		LIST_FOREACH(sh, dev_list, mem_event_cb)
			mlx5_mr_mem_event_free_cb(sh, addr, len);
		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
		break;
	case RTE_MEM_EVENT_ALLOC:
	default:
		break;
	}
}
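
/*
 * Note: a sketch of how the callback above is wired up. EAL delivers
 * allocator events to callbacks registered with
 * rte_mem_event_callback_register(); this driver registers once at
 * shared-data initialization, roughly:
 *
 *	rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
 *					mlx5_mr_mem_event_cb, NULL);
 */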

/**
 * Bottom-half of LKey search on Rx.
 *
 * @param rxq
 *   Pointer to Rx queue structure.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
uint32_t
mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr)
{
	struct mlx5_rxq_ctrl *rxq_ctrl =
		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
	struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
	struct mlx5_priv *priv = rxq_ctrl->priv;

	return mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id,
				  &priv->sh->share_cache, mr_ctrl, addr,
				  priv->config.mr_ext_memseg_en);
}
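
/*
 * Note: an illustrative sketch, not a definitive copy of the fast path.
 * Datapath callers are expected to try the per-queue linear cache first
 * and fall back to the bottom-half above only on a miss, along the lines
 * of the inline helpers in mlx5_rxtx.h:
 *
 *	static __rte_always_inline uint32_t
 *	mlx5_rx_addr2mr(struct mlx5_rxq_data *rxq, uintptr_t addr)
 *	{
 *		struct mlx5_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
 *		uint32_t lkey;
 *
 *		// Linear search on the MR cache array first.
 *		lkey = mlx5_mr_lookup_lkey(mr_ctrl->cache, &mr_ctrl->mru,
 *					   MLX5_MR_CACHE_N, addr);
 *		if (likely(lkey != UINT32_MAX))
 *			return lkey;
 *		// Take the slower bottom-half on a miss.
 *		return mlx5_rx_addr2mr_bh(rxq, addr);
 *	}
 */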

/**
 * Bottom-half of LKey search on Tx.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
static uint32_t
mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr)
{
	struct mlx5_txq_ctrl *txq_ctrl =
		container_of(txq, struct mlx5_txq_ctrl, txq);
	struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
	struct mlx5_priv *priv = txq_ctrl->priv;

	return mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id,
				  &priv->sh->share_cache, mr_ctrl, addr,
				  priv->config.mr_ext_memseg_en);
}

/**
 * Bottom-half of LKey search on Tx. If it can't be found in the memseg
 * list, register the mempool of the mbuf as externally allocated memory.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param mb
 *   Pointer to mbuf.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
uint32_t
mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
{
	uintptr_t addr = (uintptr_t)mb->buf_addr;
	uint32_t lkey;

	lkey = mlx5_tx_addr2mr_bh(txq, addr);
	if (lkey == UINT32_MAX && rte_errno == ENXIO) {
		/* Mempool may have externally allocated memory. */
		return mlx5_tx_update_ext_mp(txq, addr, mlx5_mb2mp(mb));
	}
	return lkey;
}
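
/*
 * Note: a hedged sketch of the mlx5_mb2mp() helper used above (defined in
 * the driver headers). It resolves the mempool owning the mbuf, taking
 * care of indirect (cloned) mbufs, roughly:
 *
 *	static inline struct rte_mempool *
 *	mlx5_mb2mp(struct rte_mbuf *buf)
 *	{
 *		// Cloned mbufs borrow the buffer of the original mbuf.
 *		if (unlikely(RTE_MBUF_CLONED(buf)))
 *			return rte_mbuf_from_indirect(buf)->pool;
 *		return buf->pool;
 *	}
 */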

/**
 * Called during rte_mempool_mem_iter() by mlx5_mr_update_ext_mp().
 *
 * The externally allocated chunk is registered and an MR is created for it.
 * The MR object is added to the global list. If the memseg list of an MR
 * object (mr->msl) is null, the MR object can be regarded as externally
 * allocated memory.
 *
 * Once external memory is registered, it should be static. If the memory is
 * freed and the virtual address range has different physical memory mapped
 * again, it may cause a crash on the device due to the wrong translation
 * entry. The PMD can't track the free event of the external memory for now.
 */
static void
mlx5_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque,
			 struct rte_mempool_memhdr *memhdr,
			 unsigned mem_idx __rte_unused)
{
	struct mr_update_mp_data *data = opaque;
	struct rte_eth_dev *dev = data->dev;
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_dev_ctx_shared *sh = priv->sh;
	struct mlx5_mr_ctrl *mr_ctrl = data->mr_ctrl;
	struct mlx5_mr *mr = NULL;
	uintptr_t addr = (uintptr_t)memhdr->addr;
	size_t len = memhdr->len;
	struct mr_cache_entry entry;
	uint32_t lkey;

	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/* If already registered, it should return. */
	rte_rwlock_read_lock(&sh->share_cache.rwlock);
	lkey = mlx5_mr_lookup_cache(&sh->share_cache, &entry, addr);
	rte_rwlock_read_unlock(&sh->share_cache.rwlock);
	if (lkey != UINT32_MAX)
		return;
	DRV_LOG(DEBUG, "port %u register MR for chunk #%d of mempool (%s)",
		dev->data->port_id, mem_idx, mp->name);
	mr = mlx5_create_mr_ext(sh->pd, addr, len, mp->socket_id,
				sh->share_cache.reg_mr_cb);
	if (!mr) {
		DRV_LOG(WARNING,
			"port %u unable to allocate a new MR of"
			" mempool (%s).",
			dev->data->port_id, mp->name);
		data->ret = -1;
		return;
	}
	rte_rwlock_write_lock(&sh->share_cache.rwlock);
	LIST_INSERT_HEAD(&sh->share_cache.mr_list, mr, mr);
	/* Insert to the global cache table. */
	mlx5_mr_insert_cache(&sh->share_cache, mr);
	rte_rwlock_write_unlock(&sh->share_cache.rwlock);
	/* Insert to the local cache table. */
	mlx5_mr_addr2mr_bh(sh->pd, &priv->mp_id, &sh->share_cache,
			   mr_ctrl, addr, priv->config.mr_ext_memseg_en);
}

/**
 * Finds the first ethdev that matches the PCI device.
 * Multiple ethdevs per PCI device exist only with representors.
 * In that case, it is enough to get only one of the ports as they all
 * share the same ibv context.
 *
 * @param pdev
 *   Pointer to the PCI device.
 *
 * @return
 *   Pointer to the ethdev if found, NULL otherwise.
 */
static struct rte_eth_dev *
pci_dev_to_eth_dev(struct rte_pci_device *pdev)
{
	uint16_t port_id;

	port_id = rte_eth_find_next_of(0, &pdev->device);
	if (port_id == RTE_MAX_ETHPORTS)
		return NULL;
	return &rte_eth_devices[port_id];
}

/**
 * DPDK callback to DMA map external memory to a PCI device.
 *
 * @param pdev
 *   Pointer to the PCI device.
 * @param addr
 *   Starting virtual address of memory to be mapped.
 * @param iova
 *   Starting IOVA address of memory to be mapped.
 * @param len
 *   Length of memory segment being mapped.
 *
 * @return
 *   0 on success, negative value on error.
 */
int
mlx5_dma_map(struct rte_pci_device *pdev, void *addr,
	     uint64_t iova __rte_unused, size_t len)
{
	struct rte_eth_dev *dev;
	struct mlx5_mr *mr;
	struct mlx5_priv *priv;
	struct mlx5_dev_ctx_shared *sh;

	dev = pci_dev_to_eth_dev(pdev);
	if (!dev) {
		DRV_LOG(WARNING, "unable to find matching ethdev "
				 "to PCI device %p", (void *)pdev);
		rte_errno = ENODEV;
		return -1;
	}
	priv = dev->data->dev_private;
	sh = priv->sh;
	mr = mlx5_create_mr_ext(sh->pd, (uintptr_t)addr, len, SOCKET_ID_ANY,
				sh->share_cache.reg_mr_cb);
	if (!mr) {
		DRV_LOG(WARNING,
			"port %u unable to dma map", dev->data->port_id);
		rte_errno = EINVAL;
		return -1;
	}
	rte_rwlock_write_lock(&sh->share_cache.rwlock);
	LIST_INSERT_HEAD(&sh->share_cache.mr_list, mr, mr);
	/* Insert to the global cache table. */
	mlx5_mr_insert_cache(&sh->share_cache, mr);
	rte_rwlock_write_unlock(&sh->share_cache.rwlock);
	return 0;
}
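
/*
 * Note: an illustrative, hedged sketch of the application-side flow that
 * ends up in mlx5_dma_map() above; ext_mem, ext_len, pg_sz and iova are
 * hypothetical variables. External memory is first registered with EAL,
 * then DMA-mapped to the device:
 *
 *	rte_extmem_register(ext_mem, ext_len, NULL, 0, pg_sz);
 *	rte_dev_dma_map(&pdev->device, ext_mem, iova, ext_len);
 */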

/**
 * DPDK callback to DMA unmap external memory from a PCI device.
 *
 * @param pdev
 *   Pointer to the PCI device.
 * @param addr
 *   Starting virtual address of memory to be unmapped.
 * @param iova
 *   Starting IOVA address of memory to be unmapped.
 * @param len
 *   Length of memory segment being unmapped.
 *
 * @return
 *   0 on success, negative value on error.
 */
int
mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr,
	       uint64_t iova __rte_unused, size_t len __rte_unused)
{
	struct rte_eth_dev *dev;
	struct mlx5_priv *priv;
	struct mlx5_dev_ctx_shared *sh;
	struct mlx5_mr *mr;
	struct mr_cache_entry entry;

	dev = pci_dev_to_eth_dev(pdev);
	if (!dev) {
		DRV_LOG(WARNING, "unable to find matching ethdev "
				 "to PCI device %p", (void *)pdev);
		rte_errno = ENODEV;
		return -1;
	}
	priv = dev->data->dev_private;
	sh = priv->sh;
	rte_rwlock_read_lock(&sh->share_cache.rwlock);
	mr = mlx5_mr_lookup_list(&sh->share_cache, &entry, (uintptr_t)addr);
	if (!mr) {
		rte_rwlock_read_unlock(&sh->share_cache.rwlock);
		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
				 "to PCI device %p", (uintptr_t)addr,
				 (void *)pdev);
		rte_errno = EINVAL;
		return -1;
	}
	LIST_REMOVE(mr, mr);
	mlx5_mr_free(mr, sh->share_cache.dereg_mr_cb);
	DRV_LOG(DEBUG, "port %u remove MR(%p) from list", dev->data->port_id,
		(void *)mr);
	mlx5_mr_rebuild_cache(&sh->share_cache);
	/*
	 * Flush local caches by propagating invalidation across cores.
	 * rte_smp_wmb() is enough to synchronize this event. If one of
	 * the freed memsegs is seen by another core, that means the
	 * memseg has been re-allocated by the allocator, which happens
	 * after this free call. Therefore, this store instruction
	 * (incrementing the generation below) is guaranteed to be seen
	 * by other cores before they see the newly allocated memory.
	 */
	++sh->share_cache.dev_gen;
	DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d",
		sh->share_cache.dev_gen);
	rte_smp_wmb();
	rte_rwlock_read_unlock(&sh->share_cache.rwlock);
	return 0;
}
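
/*
 * Note: the illustrative application-side counterpart of the map sketch
 * after mlx5_dma_map(), with the same hypothetical variables. The mapping
 * is torn down in reverse order:
 *
 *	rte_dev_dma_unmap(&pdev->device, ext_mem, iova, ext_len);
 *	rte_extmem_unregister(ext_mem, ext_len);
 */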

/**
 * Register MR for entire memory chunks in a Mempool having externally
 * allocated memory and fill in the local cache.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param mp
 *   Pointer to registering Mempool.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mlx5_mr_update_ext_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
		      struct rte_mempool *mp)
{
	struct mr_update_mp_data data = {
		.dev = dev,
		.mr_ctrl = mr_ctrl,
		.ret = 0,
	};

	rte_mempool_mem_iter(mp, mlx5_mr_update_ext_mp_cb, &data);
	return data.ret;
}

/**
 * Register MR for entire memory chunks in a Mempool having externally
 * allocated memory and search the LKey of the address to return.
 *
 * @param txq
 *   Pointer to Tx queue structure.
 * @param addr
 *   Search key.
 * @param mp
 *   Pointer to registering Mempool where addr belongs.
 *
 * @return
 *   LKey for address on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr,
		      struct rte_mempool *mp)
{
	struct mlx5_txq_ctrl *txq_ctrl =
		container_of(txq, struct mlx5_txq_ctrl, txq);
	struct mlx5_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
	struct mlx5_priv *priv = txq_ctrl->priv;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
		DRV_LOG(WARNING,
			"port %u using address (%p) from unregistered mempool"
			" having externally allocated memory"
			" in secondary process, please create mempool"
			" prior to rte_eth_dev_start()",
			PORT_ID(priv), (void *)addr);
		return UINT32_MAX;
	}
	mlx5_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp);
	return mlx5_tx_addr2mr_bh(txq, addr);
}

/* Called during rte_mempool_mem_iter() by mlx5_mr_update_mp(). */
static void
mlx5_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque,
		     struct rte_mempool_memhdr *memhdr,
		     unsigned mem_idx __rte_unused)
{
	struct mr_update_mp_data *data = opaque;
	struct rte_eth_dev *dev = data->dev;
	struct mlx5_priv *priv = dev->data->dev_private;
	uint32_t lkey;

	/* Stop iteration if failed in the previous walk. */
	if (data->ret < 0)
		return;
	/* Register address of the chunk and update local caches. */
	lkey = mlx5_mr_addr2mr_bh(priv->sh->pd, &priv->mp_id,
				  &priv->sh->share_cache, data->mr_ctrl,
				  (uintptr_t)memhdr->addr,
				  priv->config.mr_ext_memseg_en);
	if (lkey == UINT32_MAX)
		data->ret = -1;
}

/**
 * Register entire memory chunks in a Mempool.
 *
 * @param dev
 *   Pointer to Ethernet device.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param mp
 *   Pointer to registering Mempool.
 *
 * @return
 *   0 on success, -1 on failure.
 */
int
mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
		  struct rte_mempool *mp)
{
	struct mr_update_mp_data data = {
		.dev = dev,
		.mr_ctrl = mr_ctrl,
		.ret = 0,
	};
	uint32_t flags = rte_pktmbuf_priv_flags(mp);

	if (flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) {
		/*
		 * The pinned external buffer should be registered for DMA
		 * operations by the application. The mem_list of the pool
		 * contains the list of chunks with mbuf structures w/o
		 * built-in data buffers and DMA does not actually happen
		 * there, so there is no need to create MRs for these chunks.
		 */
		return 0;
	}
	DRV_LOG(DEBUG, "Port %u Rx queue registering mp %s "
		       "having %u chunks.", dev->data->port_id,
		       mp->name, mp->nb_mem_chunks);
	rte_mempool_mem_iter(mp, mlx5_mr_update_mp_cb, &data);
	if (data.ret < 0 && rte_errno == ENXIO) {
		/* Mempool may have externally allocated memory. */
		return mlx5_mr_update_ext_mp(dev, mr_ctrl, mp);
	}
	return data.ret;
}
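
/*
 * Note: an illustrative sketch of the mempool case skipped above. A pool
 * whose mbufs carry pinned external data buffers is created by the
 * application with rte_pktmbuf_pool_create_extbuf(), and the external
 * area itself must be DMA-mapped by the application (see mlx5_dma_map());
 * ext_va, ext_iova and ext_len are hypothetical variables:
 *
 *	struct rte_pktmbuf_extmem ext_mem = {
 *		.buf_ptr = ext_va,
 *		.buf_iova = ext_iova,
 *		.buf_len = ext_len,
 *		.elt_size = 2048,
 *	};
 *	struct rte_mempool *mp = rte_pktmbuf_pool_create_extbuf(
 *		"ext_pool", 4096, 256, 0, ext_mem.elt_size,
 *		rte_socket_id(), &ext_mem, 1);
 */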