/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <sys/queue.h>

#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_errno.h>
#include <rte_log.h>
#include <rte_rwlock.h>

#include "eal_memalloc.h"
#include "eal_private.h"
#include "eal_internal_cfg.h"

/*
 * Try to mmap *size bytes of anonymous memory. If successful, return the
 * pointer to the mapped area and keep *size unmodified. Otherwise, retry
 * with a smaller area: decrease *size by page_sz until it reaches 0, and
 * return NULL in that case. Note: on success, the returned address is
 * aligned to the requested page size (unless alignment was skipped as
 * redundant).
 */

#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"

static void *next_baseaddr;
static uint64_t system_page_sz;

void *
eal_get_virtual_area(void *requested_addr, size_t *size,
		size_t page_sz, int flags, int mmap_flags)
{
	bool addr_is_hint, allow_shrink, unmap, no_align;
	uint64_t map_sz;
	void *mapped_addr, *aligned_addr;

	if (system_page_sz == 0)
		system_page_sz = sysconf(_SC_PAGESIZE);

	mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;

	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);

	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;

	if (next_baseaddr == NULL && internal_config.base_virtaddr != 0 &&
			rte_eal_process_type() == RTE_PROC_PRIMARY)
		next_baseaddr = (void *) internal_config.base_virtaddr;

	if (requested_addr == NULL && next_baseaddr != NULL) {
		requested_addr = next_baseaddr;
		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
		addr_is_hint = true;
	}

	/* if requested address is not aligned by page size, or if requested
	 * address is NULL, add page size to requested length as we may get an
	 * address that's aligned by system page size, which can be smaller
	 * than our requested page size. Additionally, we shouldn't try to
	 * align if system page size is the same as requested page size.
	 */
	no_align = (requested_addr != NULL &&
		((uintptr_t)requested_addr & (page_sz - 1))) ||
		page_sz == system_page_sz;

	do {
		map_sz = no_align ? *size : *size + page_sz;
		if (map_sz > SIZE_MAX) {
			RTE_LOG(ERR, EAL, "Map size too big\n");
			rte_errno = E2BIG;
			return NULL;
		}

		mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_READ,
				mmap_flags, -1, 0);
		if (mapped_addr == MAP_FAILED && allow_shrink)
			*size -= page_sz;
	} while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0);

	/* align resulting address - if map failed, we will ignore the value
	 * anyway, so no need to add additional checks.
	 */
	aligned_addr = no_align ? mapped_addr :
			RTE_PTR_ALIGN(mapped_addr, page_sz);

	if (*size == 0) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
			strerror(errno));
		rte_errno = errno;
		return NULL;
	} else if (mapped_addr == MAP_FAILED) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
			strerror(errno));
		/* pass errno up the call chain */
		rte_errno = errno;
		return NULL;
	} else if (requested_addr != NULL && !addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
			requested_addr, aligned_addr);
		munmap(mapped_addr, map_sz);
		rte_errno = EADDRNOTAVAIL;
		return NULL;
	} else if (requested_addr != NULL && addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
			requested_addr, aligned_addr);
		RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n");
	} else if (next_baseaddr != NULL) {
		next_baseaddr = RTE_PTR_ADD(aligned_addr, *size);
	}

	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
		aligned_addr, *size);

	if (unmap) {
		munmap(mapped_addr, map_sz);
	} else if (!no_align) {
		void *map_end, *aligned_end;
		size_t before_len, after_len;

		/* when we reserve space with alignment, we add alignment to
		 * mapping size. On 32-bit, if 1GB alignment was requested,
		 * this would waste 1GB of address space, which is a luxury we
		 * cannot afford. So, if alignment was performed, check if any
		 * unneeded address space can be unmapped back.
		 */

		map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz);
		aligned_end = RTE_PTR_ADD(aligned_addr, *size);

		/* unmap space before aligned mmap address */
		before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
		if (before_len > 0)
			munmap(mapped_addr, before_len);

		/* unmap space after aligned end mmap address */
		after_len = RTE_PTR_DIFF(map_end, aligned_end);
		if (after_len > 0)
			munmap(aligned_end, after_len);
	}

	return aligned_addr;
}
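
/*
 * Illustrative sketch (not part of upstream): asking for a 16M region
 * aligned to 2M pages, reserving and then immediately unmapping it, as a
 * caller that only wants to pre-reserve address space might. The wrapper
 * function is hypothetical; the flags are the EAL_VIRTUAL_AREA_* flags
 * handled above.
 *
 *	static void *
 *	reserve_va_sketch(void)
 *	{
 *		size_t size = 16 << 20;
 *
 *		return eal_get_virtual_area(NULL, &size, 2 << 20,
 *				EAL_VIRTUAL_AREA_UNMAP |
 *				EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
 *	}
 */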

static struct rte_memseg *
virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	const struct rte_fbarray *arr;
	void *start, *end;
	int ms_idx;

	if (msl == NULL)
		return NULL;

	/* a memseg list was specified, check if it's the right one */
	start = msl->base_va;
	end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len);

	if (addr < start || addr >= end)
		return NULL;

	/* now, calculate index */
	arr = &msl->memseg_arr;
	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
	return rte_fbarray_get(arr, ms_idx);
}

static struct rte_memseg_list *
virt2memseg_list(const void *addr)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	int msl_idx;

	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
		void *start, *end;
		msl = &mcfg->memsegs[msl_idx];

		start = msl->base_va;
		end = RTE_PTR_ADD(start,
				(size_t)msl->page_sz * msl->memseg_arr.len);
		if (addr >= start && addr < end)
			break;
	}
	/* if we didn't find our memseg list */
	if (msl_idx == RTE_MAX_MEMSEG_LISTS)
		return NULL;
	return msl;
}

__rte_experimental struct rte_memseg_list *
rte_mem_virt2memseg_list(const void *addr)
{
	return virt2memseg_list(addr);
}

struct virtiova {
	rte_iova_t iova;
	void *virt;
};
static int
find_virt(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}
static int
find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, size_t len, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}

__rte_experimental void *
rte_mem_iova2virt(rte_iova_t iova)
{
	struct virtiova vi;

	memset(&vi, 0, sizeof(vi));

	vi.iova = iova;
	/* for legacy mem, we can get away with scanning VA-contiguous segments,
	 * as we know they are PA-contiguous as well
	 */
	if (internal_config.legacy_mem)
		rte_memseg_contig_walk(find_virt_legacy, &vi);
	else
		rte_memseg_walk(find_virt, &vi);

	return vi.virt;
}

__rte_experimental struct rte_memseg *
rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	return virt2memseg(addr, msl != NULL ? msl :
			rte_mem_virt2memseg_list(addr));
}
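
/*
 * Illustrative sketch (not part of upstream): a VA -> IOVA -> VA round
 * trip using the lookups above. Variable names are hypothetical; the
 * calls are the ones defined in this file.
 *
 *	const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);
 *
 *	if (ms != NULL) {
 *		size_t off = RTE_PTR_DIFF(addr, ms->addr);
 *		rte_iova_t iova = ms->iova + off;
 *
 *		// rte_mem_iova2virt(iova) now returns addr again
 *	}
 */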

static int
physmem_size(const struct rte_memseg_list *msl, void *arg)
{
	uint64_t *total_len = arg;

	*total_len += msl->memseg_arr.count * msl->page_sz;

	return 0;
}

/* get the total size of memory */
uint64_t
rte_eal_get_physmem_size(void)
{
	uint64_t total_len = 0;

	rte_memseg_list_walk(physmem_size, &total_len);

	return total_len;
}
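
/*
 * Illustrative note (not part of upstream): the walk above sums, per
 * memseg list, the number of allocated segments times the page size.
 * E.g. a list with 512 used 2M segments contributes 1 GiB:
 *
 *	512 * (2 << 20) == 1 << 30
 */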

static int
dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx, ms_idx;
	FILE *f = arg;

	msl_idx = msl - mcfg->memsegs;
	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
	if (ms_idx < 0)
		return -1;

	fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
			"virt:%p, socket_id:%"PRId32", "
			"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
			"nrank:%"PRIx32"\n",
			msl_idx, ms_idx,
			ms->iova,
			ms->len,
			ms->addr,
			ms->socket_id,
			ms->hugepage_sz,
			ms->nchannel,
			ms->nrank);

	return 0;
}

/*
 * Defining here because declared in rte_memory.h, but the actual implementation
 * is in eal_common_memalloc.c, like all other memalloc internals.
 */
int __rte_experimental
rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb,
		void *arg)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_event_callback_register(name, clb, arg);
}

int __rte_experimental
rte_mem_event_callback_unregister(const char *name, void *arg)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Unregistering mem event callbacks not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_event_callback_unregister(name, arg);
}

int __rte_experimental
rte_mem_alloc_validator_register(const char *name,
		rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
			limit);
}

int __rte_experimental
rte_mem_alloc_validator_unregister(const char *name, int socket_id)
{
	/* FreeBSD boots with legacy mem enabled by default */
	if (internal_config.legacy_mem) {
		RTE_LOG(DEBUG, EAL, "Unregistering mem alloc validators not supported\n");
		rte_errno = ENOTSUP;
		return -1;
	}
	return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
}
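
/*
 * Illustrative sketch (not part of upstream): a minimal mem event
 * callback and its registration. The callback body and the "my-cb" name
 * are hypothetical; the event types and the callback signature are the
 * ones declared in rte_memory.h.
 *
 *	static void
 *	mem_event_cb(enum rte_mem_event event_type, const void *addr,
 *			size_t len, void *arg __rte_unused)
 *	{
 *		if (event_type == RTE_MEM_EVENT_ALLOC)
 *			RTE_LOG(DEBUG, EAL, "alloc: %p, 0x%zx\n", addr, len);
 *		else
 *			RTE_LOG(DEBUG, EAL, "free: %p, 0x%zx\n", addr, len);
 *	}
 *
 *	rte_mem_event_callback_register("my-cb", mem_event_cb, NULL);
 */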

/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
{
	rte_memseg_walk(dump_memseg, f);
}

/* return the number of memory channels */
unsigned rte_memory_get_nchannel(void)
{
	return rte_eal_get_configuration()->mem_config->nchannel;
}

/* return the number of memory ranks */
unsigned rte_memory_get_nrank(void)
{
	return rte_eal_get_configuration()->mem_config->nrank;
}

static int
rte_eal_memdevice_init(void)
{
	struct rte_config *config;

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		return 0;

	config = rte_eal_get_configuration();
	config->mem_config->nchannel = internal_config.force_nchannel;
	config->mem_config->nrank = internal_config.force_nrank;

	return 0;
}

/* Lock page in physical memory and prevent from swapping. */
int
rte_mem_lock_page(const void *virt)
{
	unsigned long virtual = (unsigned long)virt;
	int page_size = getpagesize();
	/* align down to page boundary; mlock operates on whole pages */
	unsigned long aligned = (virtual & ~(page_size - 1));
	return mlock((void *)aligned, page_size);
}
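
/*
 * Illustrative sketch (not part of upstream): pinning the page holding
 * an arbitrary buffer. The buffer name is hypothetical; any address
 * inside the page works, since the function aligns it down.
 *
 *	char buf[64];
 *
 *	if (rte_mem_lock_page(buf) != 0)
 *		RTE_LOG(WARNING, EAL, "could not lock page: %s\n",
 *			strerror(errno));
 */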

int __rte_experimental
rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			int n_segs;
			size_t len;

			ms = rte_fbarray_get(arr, ms_idx);

			/* find how many more segments there are, starting with
			 * this one.
			 */
			n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
			len = n_segs * msl->page_sz;

			ret = func(msl, ms, len, arg);
			if (ret)
				return ret;

			ms_idx = rte_fbarray_find_next_used(arr,
					ms_idx + n_segs);
		}
	}
	return 0;
}
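
/*
 * Illustrative note (not part of upstream): rte_fbarray_find_contig_used()
 * returns the number of contiguously used slots starting at the given
 * index. So if slots 4, 5 and 6 are used and slot 7 is free, the loop
 * above reports one VA-contiguous chunk of 3 * page_sz bytes to the
 * callback, then resumes the search at index 7.
 */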

int __rte_experimental
rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
	ret = rte_memseg_contig_walk_thread_unsafe(func, arg);
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

	return ret;
}

int __rte_experimental
rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			ms = rte_fbarray_get(arr, ms_idx);
			ret = func(msl, ms, arg);
			if (ret)
				return ret;
			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
		}
	}
	return 0;
}

int __rte_experimental
rte_memseg_walk(rte_memseg_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
	ret = rte_memseg_walk_thread_unsafe(func, arg);
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

	return ret;
}
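
/*
 * Illustrative sketch (not part of upstream): counting segments with
 * rte_memseg_walk(). Returning 0 from the callback continues the walk;
 * any non-zero value stops it and is propagated to the caller, as the
 * loops above show. The callback and counter names are hypothetical.
 *
 *	static int
 *	count_cb(const struct rte_memseg_list *msl __rte_unused,
 *			const struct rte_memseg *ms __rte_unused, void *arg)
 *	{
 *		int *count = arg;
 *		(*count)++;
 *		return 0;
 *	}
 *
 *	int count = 0;
 *	rte_memseg_walk(count_cb, &count);
 */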

int __rte_experimental
rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		if (msl->base_va == NULL)
			continue;

		ret = func(msl, arg);
		if (ret)
			return ret;
	}
	return 0;
}

int __rte_experimental
rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int ret = 0;

	/* do not allow allocations/frees/init while we iterate */
	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
	ret = rte_memseg_list_walk_thread_unsafe(func, arg);
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);

	return ret;
}

/* init memory subsystem */
int
rte_eal_memory_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int retval;

	RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

	if (!mcfg)
		return -1;

	/* lock mem hotplug here, to prevent races while we init */
	rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);

	if (rte_eal_memseg_init() < 0)
		goto fail;

	if (eal_memalloc_init() < 0)
		goto fail;

	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
			rte_eal_hugepage_init() :
			rte_eal_hugepage_attach();
	if (retval < 0)
		goto fail;

	if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
		goto fail;

	return 0;
fail:
	rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
	return -1;
}