/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <errno.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/queue.h>

#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_errno.h>
#include <rte_lcore.h>
#include <rte_log.h>

#include "eal_private.h"
#include "eal_internal_cfg.h"
/*
 * Try to mmap *size bytes of anonymous memory. On success, return the
 * pointer to the mmap'd area and keep *size unmodified. Otherwise, if
 * shrinking is allowed, retry with a smaller area: decrease *size by
 * page_sz until it reaches 0, and return NULL in that case. Note: this
 * function returns an address that is a multiple of the requested page
 * size.
 */

#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"

static uint64_t baseaddr_offset;
static uint64_t system_page_sz;
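
/*
 * For illustration: MEMSEG_LIST_FMT expands to names such as
 * "memseg-2048k-0-0" (2 MB pages, socket 0, first list of this type) or
 * "memseg-1048576k-1-2" (1 GB pages, socket 1, third list).
 */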
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
		size_t page_sz, int flags, int mmap_flags)
{
	bool addr_is_hint, allow_shrink, unmap, no_align;
	uint64_t map_sz;
	void *mapped_addr, *aligned_addr;

	if (system_page_sz == 0)
		system_page_sz = sysconf(_SC_PAGESIZE);

	mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;

	RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);

	addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
	allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
	unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;

	if (requested_addr == NULL && internal_config.base_virtaddr != 0) {
		requested_addr = (void *) (internal_config.base_virtaddr +
				(size_t)baseaddr_offset);
		requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
		addr_is_hint = true;
	}

	/* if requested address is not aligned by page size, or if requested
	 * address is NULL, add page size to requested length as we may get an
	 * address that's aligned by system page size, which can be smaller
	 * than our requested page size. additionally, we shouldn't try to
	 * align if system page size is the same as requested page size.
	 */
	no_align = (requested_addr != NULL &&
		((uintptr_t)requested_addr & (page_sz - 1)) == 0) ||
		page_sz == system_page_sz;

	do {
		map_sz = no_align ? *size : *size + page_sz;

		mapped_addr = mmap(requested_addr, map_sz, PROT_READ,
				mmap_flags, -1, 0);
		if (mapped_addr == MAP_FAILED && allow_shrink)
			*size -= page_sz;
	} while (allow_shrink && mapped_addr == MAP_FAILED && *size > 0);

	/* align resulting address - if map failed, we will ignore the value
	 * anyway, so no need to add additional checks.
	 */
	aligned_addr = no_align ? mapped_addr :
			RTE_PTR_ALIGN(mapped_addr, page_sz);

	if (*size == 0) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
			strerror(errno));
		rte_errno = errno;
		return NULL;
	} else if (mapped_addr == MAP_FAILED) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
			strerror(errno));
		/* pass errno up the call chain */
		rte_errno = errno;
		return NULL;
	} else if (requested_addr != NULL && !addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
			requested_addr, aligned_addr);
		munmap(mapped_addr, map_sz);
		rte_errno = EADDRNOTAVAIL;
		return NULL;
	} else if (requested_addr != NULL && addr_is_hint &&
			aligned_addr != requested_addr) {
		RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
			requested_addr, aligned_addr);
		RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory into secondary processes\n");
	}

	if (unmap)
		munmap(mapped_addr, map_sz);

	RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
		aligned_addr, *size);

	baseaddr_offset += *size;

	return aligned_addr;
}
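
/*
 * Illustrative sketch only (the "example_" name is ours, not part of the
 * EAL): reserve an unbacked, page-aligned virtual area at any address,
 * allowing the request to shrink if the full size cannot be mapped.
 */
static void * __rte_unused
example_reserve_va(size_t page_sz, size_t n_pages)
{
	size_t size = page_sz * n_pages;

	/* on failure this returns NULL and leaves the cause in rte_errno */
	return eal_get_virtual_area(NULL, &size, page_sz,
			EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
}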
static uint64_t
get_mem_amount(uint64_t page_sz, uint64_t max_mem)
{
	uint64_t area_sz, max_pages;

	/* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
	max_pages = RTE_MAX_MEMSEG_PER_LIST;
	max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);

	area_sz = RTE_MIN(page_sz * max_pages, max_mem);

	/* make sure the list isn't smaller than the page size */
	area_sz = RTE_MAX(area_sz, page_sz);

	return RTE_ALIGN(area_sz, page_sz);
}
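
/*
 * Worked example, assuming default config values (RTE_MAX_MEMSEG_PER_LIST
 * of 8192 and RTE_MAX_MEM_MB_PER_LIST of 32768): for 2 MB pages, one list
 * covers RTE_MIN(2 MB * 8192, 32 GB) = 16 GB, i.e. the page count binds;
 * for 1 GB pages it covers RTE_MIN(1 GB * 8192, 32 GB) = 32 GB, i.e. the
 * byte cap binds and the list holds 32 segments.
 */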
static int
alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
		uint64_t max_mem, int socket_id, int type_msl_idx)
{
	char name[RTE_FBARRAY_NAME_LEN];
	uint64_t mem_amount;
	int max_segs;

	mem_amount = get_mem_amount(page_sz, max_mem);
	max_segs = mem_amount / page_sz;

	snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
		 type_msl_idx);
	if (rte_fbarray_init(&msl->memseg_arr, name, max_segs,
			sizeof(struct rte_memseg))) {
		RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
			rte_strerror(rte_errno));
		return -1;
	}

	msl->page_sz = page_sz;
	msl->socket_id = socket_id;
	msl->base_va = NULL;

	RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
			(size_t)page_sz >> 10, socket_id);

	return 0;
}
static int
alloc_va_space(struct rte_memseg_list *msl)
{
	uint64_t page_sz, mem_sz;
	void *addr;
	int flags = 0;

#ifdef RTE_ARCH_PPC_64
	flags |= MAP_HUGETLB;
#endif

	page_sz = msl->page_sz;
	mem_sz = page_sz * msl->memseg_arr.len;

	addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
	if (addr == NULL) {
		if (rte_errno == EADDRNOTAVAIL)
			RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
				(unsigned long long)mem_sz, msl->base_va);
		else
			RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
		return -1;
	}
	msl->base_va = addr;

	return 0;
}
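
/*
 * Illustrative sketch only (the "example_" name is ours): create a single
 * memseg list for up to 512 pages of 2 MB on socket 0, then reserve VA
 * space for it - the same two-step pattern the init paths below follow.
 */
static int __rte_unused
example_setup_msl(struct rte_memseg_list *msl)
{
	uint64_t page_sz = RTE_PGSIZE_2M;

	/* type_msl_idx 0: first list of this page size/socket combination */
	if (alloc_memseg_list(msl, page_sz, page_sz * 512, 0, 0))
		return -1;
	return alloc_va_space(msl);
}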
static int __rte_unused
memseg_primary_init_32(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int active_sockets, hpi_idx, msl_idx = 0;
	unsigned int socket_id, i;
	struct rte_memseg_list *msl;
	uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
	uint64_t max_mem;

	/* no-huge does not need this at all */
	if (internal_config.no_hugetlbfs)
		return 0;

	/* this is a giant hack, but desperate times call for desperate
	 * measures. in legacy 32-bit mode, we cannot preallocate VA space,
	 * because having upwards of 2 gigabytes of VA space already mapped
	 * will interfere with our ability to map and sort hugepages.
	 *
	 * therefore, in legacy 32-bit mode, we will be initializing memseg
	 * lists much later - in eal_memory.c, right after we unmap all the
	 * unneeded pages. this will not affect secondary processes, as those
	 * should be able to mmap the space without (too many) problems.
	 */
	if (internal_config.legacy_mem)
		return 0;

	/* 32-bit mode is a very special case. we cannot know in advance where
	 * the user will want to allocate their memory, so we have to do some
	 * heuristics.
	 */
	active_sockets = 0;
	total_requested_mem = 0;
	if (internal_config.force_sockets)
		for (i = 0; i < rte_socket_count(); i++) {
			uint64_t mem;

			socket_id = rte_socket_id_by_idx(i);
			mem = internal_config.socket_mem[socket_id];

			if (mem == 0)
				continue;

			active_sockets++;
			total_requested_mem += mem;
		}
	else
		total_requested_mem = internal_config.memory;

	max_mem = (uint64_t) RTE_MAX_MEM_MB_PER_TYPE << 20;
	if (total_requested_mem > max_mem) {
		RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n",
				(unsigned int)(max_mem >> 20));
		return -1;
	}
	total_extra_mem = max_mem - total_requested_mem;
	extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
			total_extra_mem / active_sockets;

	/* the allocation logic is a little bit convoluted, but here's how it
	 * works, in a nutshell:
	 * - if user hasn't specified on which sockets to allocate memory via
	 *   --socket-mem, we allocate all of our memory on master core socket.
	 * - if user has specified sockets to allocate memory on, there may be
	 *   some "unused" memory left (e.g. if user has specified --socket-mem
	 *   such that not all memory adds up to 2 gigabytes), so add it to all
	 *   sockets that are in use equally.
	 *
	 * page sizes are sorted by size in descending order, so we can safely
	 * assume that we dispense with bigger page sizes first.
	 */
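
	/*
	 * Worked example (assuming the cap works out to 2 GB): with
	 * "--socket-mem=512,256", total_requested_mem is 768 MB, so
	 * total_extra_mem is 1280 MB and each of the two active sockets
	 * gets 640 MB on top of its request. With no --socket-mem, all
	 * 2 GB end up on the master lcore's socket.
	 */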
	/* create memseg lists */
	for (i = 0; i < rte_socket_count(); i++) {
		int hp_sizes = (int) internal_config.num_hugepage_sizes;
		uint64_t max_socket_mem, cur_socket_mem;
		unsigned int master_lcore_socket;
		struct rte_config *cfg = rte_eal_get_configuration();
		bool skip;

		socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
		if (socket_id > 0)
			break;
#endif

		/* if we didn't specifically request memory on this socket */
		skip = active_sockets != 0 &&
				internal_config.socket_mem[socket_id] == 0;
		/* ...or if we didn't specifically request memory on *any*
		 * socket, and this is not master lcore
		 */
		master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore);
		skip |= active_sockets == 0 && socket_id != master_lcore_socket;

		if (skip) {
			RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n",
					socket_id);
			continue;
		}

		/* max amount of memory on this socket */
		max_socket_mem = (active_sockets != 0 ?
					internal_config.socket_mem[socket_id] :
					internal_config.memory) +
					extra_mem_per_socket;
		cur_socket_mem = 0;

		for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
			uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
			uint64_t hugepage_sz;
			struct hugepage_info *hpi;
			int type_msl_idx, max_segs, total_segs = 0;

			hpi = &internal_config.hugepage_info[hpi_idx];
			hugepage_sz = hpi->hugepage_sz;

			max_segs = RTE_MAX_MEMSEG_PER_TYPE;
			max_pagesz_mem = max_socket_mem - cur_socket_mem;

			/* make it multiple of page size */
			max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
					hugepage_sz);

			RTE_LOG(DEBUG, EAL, "Attempting to preallocate "
					"%" PRIu64 "M on socket %i\n",
					max_pagesz_mem >> 20, socket_id);

			type_msl_idx = 0;
			while (cur_pagesz_mem < max_pagesz_mem &&
					total_segs < max_segs) {
				if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
					RTE_LOG(ERR, EAL,
						"No more space in memseg lists, please increase %s\n",
						RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
					return -1;
				}

				msl = &mcfg->memsegs[msl_idx++];

				if (alloc_memseg_list(msl, hugepage_sz,
						max_pagesz_mem, socket_id,
						type_msl_idx))
					return -1;

				total_segs += msl->memseg_arr.len;
				cur_pagesz_mem = total_segs * hugepage_sz;
				type_msl_idx++;

				if (alloc_va_space(msl)) {
					RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
					return -1;
				}
			}
			cur_socket_mem += cur_pagesz_mem;
		}
	}

	return 0;
}
static int __rte_unused
memseg_primary_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, socket_id, hpi_idx, msl_idx = 0;
	struct rte_memseg_list *msl;
	uint64_t max_mem, total_mem;

	/* no-huge does not need this at all */
	if (internal_config.no_hugetlbfs)
		return 0;

	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
	total_mem = 0;

	/* create memseg lists */
	for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
			hpi_idx++) {
		struct hugepage_info *hpi;
		uint64_t hugepage_sz;

		hpi = &internal_config.hugepage_info[hpi_idx];
		hugepage_sz = hpi->hugepage_sz;

		for (i = 0; i < (int) rte_socket_count(); i++) {
			uint64_t max_type_mem, total_type_mem = 0;
			int type_msl_idx, max_segs, total_segs = 0;

			socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
			if (socket_id > 0)
				break;
#endif

			max_type_mem = RTE_MIN(max_mem - total_mem,
				(uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
			max_segs = RTE_MAX_MEMSEG_PER_TYPE;

			type_msl_idx = 0;
			while (total_type_mem < max_type_mem &&
					total_segs < max_segs) {
				uint64_t cur_max_mem;
				if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
					RTE_LOG(ERR, EAL,
						"No more space in memseg lists, please increase %s\n",
						RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
					return -1;
				}

				msl = &mcfg->memsegs[msl_idx++];

				cur_max_mem = max_type_mem - total_type_mem;
				if (alloc_memseg_list(msl, hugepage_sz,
						cur_max_mem, socket_id,
						type_msl_idx))
					return -1;

				total_segs += msl->memseg_arr.len;
				total_type_mem = total_segs * hugepage_sz;
				type_msl_idx++;

				if (alloc_va_space(msl)) {
					RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
					return -1;
				}
			}
			total_mem += total_type_mem;
		}
	}

	return 0;
}
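
/*
 * Illustrative layout (depends on build-time config and the system): with
 * 2 MB and 1 GB hugepages on two sockets, the loop above creates one group
 * of lists per (page size, socket) pair, e.g. fbarrays named
 * "memseg-2048k-0-0", "memseg-2048k-1-0", "memseg-1048576k-0-0" and so on,
 * each backed by its own reserved VA range.
 */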
static int
memseg_secondary_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx = 0;
	struct rte_memseg_list *msl;

	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {

		msl = &mcfg->memsegs[msl_idx];

		/* skip empty memseg lists */
		if (msl->memseg_arr.len == 0)
			continue;

		if (rte_fbarray_attach(&msl->memseg_arr)) {
			RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
			return -1;
		}

		/* preallocate VA space */
		if (alloc_va_space(msl)) {
			RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
			return -1;
		}
	}

	return 0;
}
static struct rte_memseg *
virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	const struct rte_fbarray *arr;
	void *start, *end;
	int ms_idx;

	/* a memseg list was specified, check if it's the right one */
	start = msl->base_va;
	end = RTE_PTR_ADD(start, (size_t)msl->page_sz * msl->memseg_arr.len);

	if (addr < start || addr >= end)
		return NULL;

	/* now, calculate index */
	arr = &msl->memseg_arr;
	ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
	return rte_fbarray_get(arr, ms_idx);
}
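
/*
 * Worked example: in a list based at 0x100000000 with 2 MB pages, the
 * address 0x100600000 is offset 0x600000 into the list, so ms_idx is
 * 0x600000 / 0x200000 = 3, i.e. the fourth memseg in the fbarray.
 */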
static struct rte_memseg_list *
virt2memseg_list(const void *addr)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	int msl_idx;

	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
		void *start, *end;

		msl = &mcfg->memsegs[msl_idx];

		start = msl->base_va;
		end = RTE_PTR_ADD(start,
				(size_t)msl->page_sz * msl->memseg_arr.len);
		if (addr >= start && addr < end)
			break;
	}
	/* if we didn't find our memseg list */
	if (msl_idx == RTE_MAX_MEMSEG_LISTS)
		return NULL;
	return msl;
}

__rte_experimental struct rte_memseg_list *
rte_mem_virt2memseg_list(const void *addr)
{
	return virt2memseg_list(addr);
}
struct virtiova {
	rte_iova_t iova;
	void *virt;
};

static int
find_virt(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}

static int
find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms, size_t len, void *arg)
{
	struct virtiova *vi = arg;
	if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
		size_t offset = vi->iova - ms->iova;
		vi->virt = RTE_PTR_ADD(ms->addr, offset);
		/* stop the walk */
		return 1;
	}
	return 0;
}

__rte_experimental void *
rte_mem_iova2virt(rte_iova_t iova)
{
	struct virtiova vi;

	memset(&vi, 0, sizeof(vi));
	vi.iova = iova;

	/* for legacy mem, we can get away with scanning VA-contiguous segments,
	 * as we know they are PA-contiguous as well
	 */
	if (internal_config.legacy_mem)
		rte_memseg_contig_walk(find_virt_legacy, &vi);
	else
		rte_memseg_walk(find_virt, &vi);

	return vi.virt;
}
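
/*
 * For illustration: for any off < ms->len, rte_mem_iova2virt(ms->iova + off)
 * returns RTE_PTR_ADD(ms->addr, off); in legacy mode the same holds across a
 * whole VA-contiguous (and therefore PA-contiguous) chunk.
 */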
__rte_experimental struct rte_memseg *
rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
{
	return virt2memseg(addr, msl != NULL ? msl :
			rte_mem_virt2memseg_list(addr));
}
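
/*
 * Illustrative sketch only (the "example_" name is ours): translate a
 * virtual address back to its IO address by first locating the memseg it
 * falls into. Passing a NULL msl lets the API find the right list itself.
 */
static rte_iova_t __rte_unused
example_virt2iova(const void *addr)
{
	const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);

	if (ms == NULL)
		return RTE_BAD_IOVA;
	return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
}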
static int
physmem_size(const struct rte_memseg_list *msl, void *arg)
{
	uint64_t *total_len = arg;

	*total_len += msl->memseg_arr.count * msl->page_sz;

	return 0;
}

/* get the total size of memory */
uint64_t
rte_eal_get_physmem_size(void)
{
	uint64_t total_len = 0;

	rte_memseg_list_walk(physmem_size, &total_len);

	return total_len;
}
static int
dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int msl_idx, ms_idx;
	FILE *f = arg;

	msl_idx = msl - mcfg->memsegs;
	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
	if (ms_idx < 0)
		return -1;

	fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
			"virt:%p, socket_id:%"PRId32", "
			"hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
			"nrank:%"PRIx32"\n",
			msl_idx, ms_idx,
			ms->iova, ms->len, ms->addr, ms->socket_id,
			ms->hugepage_sz, ms->nchannel, ms->nrank);

	return 0;
}

/* Dump the physical memory layout on console */
void
rte_dump_physmem_layout(FILE *f)
{
	rte_memseg_walk(dump_memseg, f);
}
/* return the number of memory channels */
unsigned rte_memory_get_nchannel(void)
{
	return rte_eal_get_configuration()->mem_config->nchannel;
}

/* return the number of memory ranks */
unsigned rte_memory_get_nrank(void)
{
	return rte_eal_get_configuration()->mem_config->nrank;
}

static int
rte_eal_memdevice_init(void)
{
	struct rte_config *config;

	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
		return 0;

	config = rte_eal_get_configuration();
	config->mem_config->nchannel = internal_config.force_nchannel;
	config->mem_config->nrank = internal_config.force_nrank;

	return 0;
}
/* Lock a page in physical memory and prevent it from being swapped. */
int
rte_mem_lock_page(const void *virt)
{
	unsigned long virtual = (unsigned long)virt;
	int page_size = getpagesize();
	unsigned long aligned = (virtual & ~(page_size - 1));
	return mlock((void *)aligned, page_size);
}
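
/*
 * e.g. with 4 KB pages, virt == 0x7fffb0001234 locks the single page at
 * 0x7fffb0001000: the address is masked with ~(page_size - 1).
 */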
int __rte_experimental
rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			int n_segs;
			size_t len;

			ms = rte_fbarray_get(arr, ms_idx);

			/* find how many more segments there are, starting with
			 * this one.
			 */
			n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
			len = n_segs * msl->page_sz;

			ret = func(msl, ms, len, arg);
			if (ret < 0)
				return -1;
			else if (ret > 0)
				return 1;
			ms_idx = rte_fbarray_find_next_used(arr,
					ms_idx + n_segs);
		}
	}
	return 0;
}
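
/*
 * Illustrative callback (the "example_" names are ours): with
 * rte_memseg_contig_walk(example_count_chunks, &stats), count the
 * VA-contiguous chunks and their cumulative length.
 */
struct example_chunk_stats {
	unsigned int n_chunks;
	size_t total_len;
};

static int __rte_unused
example_count_chunks(const struct rte_memseg_list *msl __rte_unused,
		const struct rte_memseg *ms __rte_unused, size_t len,
		void *arg)
{
	struct example_chunk_stats *stats = arg;

	stats->n_chunks++;
	stats->total_len += len;
	return 0; /* zero means "continue the walk" */
}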
int __rte_experimental
rte_memseg_walk(rte_memseg_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ms_idx, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];
		const struct rte_memseg *ms;
		struct rte_fbarray *arr;

		if (msl->memseg_arr.count == 0)
			continue;

		arr = &msl->memseg_arr;

		ms_idx = rte_fbarray_find_next_used(arr, 0);
		while (ms_idx >= 0) {
			ms = rte_fbarray_get(arr, ms_idx);
			ret = func(msl, ms, arg);
			if (ret < 0)
				return -1;
			else if (ret > 0)
				return 1;
			ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
		}
	}
	return 0;
}
int __rte_experimental
rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i, ret = 0;

	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		if (msl->base_va == NULL)
			continue;

		ret = func(msl, arg);
		if (ret < 0)
			return -1;
		if (ret > 0)
			return 1;
	}
	return 0;
}
/* init memory subsystem */
int
rte_eal_memory_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int retval;

	RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");

	if (mcfg == NULL)
		return -1;

	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
#ifndef RTE_ARCH_64
			memseg_primary_init_32() :
#else
			memseg_primary_init() :
#endif
			memseg_secondary_init();
	if (retval < 0)
		return -1;

	retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
			rte_eal_hugepage_init() :
			rte_eal_hugepage_attach();
	if (retval < 0)
		return -1;

	if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
		return -1;

	return 0;
}