+ /*
+ * If the global cache has overflowed since it failed to expand the
+ * B-tree table, it can't have all the existing MRs. Then, the address
+ * has to be searched by traversing the original MR list instead, which
+ * is very slow path. Otherwise, the global cache is all inclusive.
+ */
+ if (!unlikely(sh->mr.cache.overflow)) {
+ lkey = mr_btree_lookup(&sh->mr.cache, &idx, addr);
+ if (lkey != UINT32_MAX)
+ *entry = (*sh->mr.cache.table)[idx];
+ } else {
+ /* Falling back to the slowest path. */
+ mr = mr_lookup_dev_list(sh, entry, addr);
+ if (mr != NULL)
+ lkey = entry->lkey;
+ }
+ assert(lkey == UINT32_MAX || (addr >= entry->start &&
+ addr < entry->end));
+ return lkey;
+}
+
+/**
+ * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free()
+ * can raise memory free event and the callback function will spin on the lock.
+ *
+ * @param mr
+ * Pointer to MR to free.
+ */
+static void
+mr_free(struct mlx5_mr *mr)
+{
+ if (mr == NULL)
+ return;
+ DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr);
+ if (mr->ibv_mr != NULL)
+ claim_zero(mlx5_glue->dereg_mr(mr->ibv_mr));
+ if (mr->ms_bmp != NULL)
+ rte_bitmap_free(mr->ms_bmp);
+ rte_free(mr);
+}
+
+/**
+ * Release resources of detached MR having no online entry.
+ *
+ * @param sh
+ * Pointer to Ethernet device shared context.
+ */
+static void
+mlx5_mr_garbage_collect(struct mlx5_ibv_shared *sh)
+{
+ struct mlx5_mr *mr_next;
+ struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
+
+ /* Must be called from the primary process. */
+ assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ /*
+ * MR can't be freed with holding the lock because rte_free() could call
+ * memory free callback function. This will be a deadlock situation.
+ */
+ rte_rwlock_write_lock(&sh->mr.rwlock);
+ /* Detach the whole free list and release it after unlocking. */
+ free_list = sh->mr.mr_free_list;
+ LIST_INIT(&sh->mr.mr_free_list);
+ rte_rwlock_write_unlock(&sh->mr.rwlock);
+ /* Release resources. */
+ mr_next = LIST_FIRST(&free_list);
+ while (mr_next != NULL) {
+ struct mlx5_mr *mr = mr_next;
+
+ mr_next = LIST_NEXT(mr, mr);
+ mr_free(mr);
+ }
+}
+
+/* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */
+static int
+mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, size_t len, void *arg)
+{
+ struct mr_find_contig_memsegs_data *data = arg;
+
+ if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
+ return 0;
+ /* Found, save it and stop walking. */
+ data->start = ms->addr_64;
+ data->end = ms->addr_64 + len;
+ data->msl = msl;
+ return 1;
+}
+
+/**
+ * Create a new global Memory Region (MR) for a missing virtual address.
+ * This API should be called on a secondary process, then a request is sent to
+ * the primary process in order to create a MR for the address. As the global MR
+ * list is on the shared memory, following LKey lookup should succeed unless the
+ * request fails.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] entry
+ * Pointer to returning MR cache entry, found in the global cache or newly
+ * created. If failed to create one, this will not be updated.
+ * @param addr
+ * Target virtual address to register.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
+ */
+static uint32_t
+mlx5_mr_create_secondary(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry,
+ uintptr_t addr)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret;
+
+ DEBUG("port %u requesting MR creation for address (%p)",
+ dev->data->port_id, (void *)addr);
+ ret = mlx5_mp_req_mr_create(dev, addr);
+ if (ret) {
+ DEBUG("port %u fail to request MR creation for address (%p)",
+ dev->data->port_id, (void *)addr);
+ return UINT32_MAX;
+ }
+ rte_rwlock_read_lock(&priv->sh->mr.rwlock);
+ /* Fill in output data. */
+ mr_lookup_dev(priv->sh, entry, addr);
+ /* Lookup can't fail. */
+ assert(entry->lkey != UINT32_MAX);
+ rte_rwlock_read_unlock(&priv->sh->mr.rwlock);
+ DEBUG("port %u MR CREATED by primary process for %p:\n"
+ " [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
+ dev->data->port_id, (void *)addr,
+ entry->start, entry->end, entry->lkey);
+ return entry->lkey;
+}
+
+/**
+ * Create a new global Memory Region (MR) for a missing virtual address.
+ * Register entire virtually contiguous memory chunk around the address.
+ * This must be called from the primary process.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] entry
+ * Pointer to returning MR cache entry, found in the global cache or newly
+ * created. If failed to create one, this will not be updated.
+ * @param addr
+ * Target virtual address to register.
+ *
+ * @return
+ * Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
+ */
+uint32_t
+mlx5_mr_create_primary(struct rte_eth_dev *dev, struct mlx5_mr_cache *entry,
+ uintptr_t addr)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_ibv_shared *sh = priv->sh;
+ struct mlx5_dev_config *config = &priv->config;
+ const struct rte_memseg_list *msl;
+ const struct rte_memseg *ms;
+ struct mlx5_mr *mr = NULL;
+ size_t len;
+ uint32_t ms_n;
+ uint32_t bmp_size;
+ void *bmp_mem;
+ int ms_idx_shift = -1;
+ unsigned int n;
+ struct mr_find_contig_memsegs_data data = {
+ .addr = addr,
+ };
+ struct mr_find_contig_memsegs_data data_re;
+
+ DRV_LOG(DEBUG, "port %u creating a MR using address (%p)",
+ dev->data->port_id, (void *)addr);
+ /*
+ * Release detached MRs if any. This can't be called with holding either
+ * memory_hotplug_lock or sh->mr.rwlock. MRs on the free list have
+ * been detached by the memory free event but it couldn't be released
+ * inside the callback due to deadlock. As a result, releasing resources
+ * is quite opportunistic.
+ */
+ mlx5_mr_garbage_collect(sh);
+ /*
+ * If enabled, find out a contiguous virtual address chunk in use, to
+ * which the given address belongs, in order to register maximum range.
+ * In the best case where mempools are not dynamically recreated and
+ * '--socket-mem' is specified as an EAL option, it is very likely to
+ * have only one MR(LKey) per a socket and per a hugepage-size even
+ * though the system memory is highly fragmented. As the whole memory
+ * chunk will be pinned by kernel, it can't be reused unless entire
+ * chunk is freed from EAL.
+ *
+ * If disabled, just register one memseg (page). Then, memory
+ * consumption will be minimized but it may drop performance if there
+ * are many MRs to lookup on the datapath.
+ */
+ if (!config->mr_ext_memseg_en) {
+ data.msl = rte_mem_virt2memseg_list((void *)addr);
+ data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz);
+ data.end = data.start + data.msl->page_sz;
+ } else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
+ DRV_LOG(WARNING,
+ "port %u unable to find virtually contiguous"
+ " chunk for address (%p)."
+ " rte_memseg_contig_walk() failed.",
+ dev->data->port_id, (void *)addr);
+ rte_errno = ENXIO;
+ goto err_nolock;
+ }
+alloc_resources:
+ /* Addresses must be page-aligned. */
+ assert(rte_is_aligned((void *)data.start, data.msl->page_sz));
+ assert(rte_is_aligned((void *)data.end, data.msl->page_sz));
+ msl = data.msl;
+ ms = rte_mem_virt2memseg((void *)data.start, msl);
+ len = data.end - data.start;
+ assert(msl->page_sz == ms->hugepage_sz);
+ /* Number of memsegs in the range. */
+ ms_n = len / msl->page_sz;
+ DEBUG("port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
+ " page_sz=0x%" PRIx64 ", ms_n=%u",
+ dev->data->port_id, (void *)addr,
+ data.start, data.end, msl->page_sz, ms_n);
+ /* Size of memory for bitmap. */
+ bmp_size = rte_bitmap_get_memory_footprint(ms_n);
+ mr = rte_zmalloc_socket(NULL,
+ RTE_ALIGN_CEIL(sizeof(*mr),
+ RTE_CACHE_LINE_SIZE) +
+ bmp_size,
+ RTE_CACHE_LINE_SIZE, msl->socket_id);