TAILQ_HEAD_INITIALIZER(msl_entry_list);
static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;
+/** local copy of a memory map, used to synchronize memory hotplug in MP */
+static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
+
static sigjmp_buf huge_jmpenv;
static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
{
RTE_LOG(DEBUG, EAL,
"Restoring previous memory policy: %d\n", *oldpolicy);
- if (oldpolicy == MPOL_DEFAULT) {
+ if (*oldpolicy == MPOL_DEFAULT) {
numa_set_localalloc();
} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
oldmask->size + 1) < 0) {
return st.st_blocks == 0;
}
+/* we cannot use rte_memseg_list_walk() here because a write lock is already
+ * held whenever we enter any function in this file. copying the same
+ * iteration code everywhere is not ideal either, so use a lock-free copy of
+ * the memseg list walk here.
+ */
+static int
+memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, ret = 0;
+
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->base_va == NULL)
+ continue;
+
+ ret = func(msl, arg);
+ if (ret < 0)
+ return -1;
+ if (ret > 0)
+ return 1;
+ }
+ return 0;
+}
+
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
unsigned int list_idx, unsigned int seg_idx)
if (internal_config.single_file_segments) {
map_offset = seg_idx * alloc_sz;
ret = resize_hugefile(fd, map_offset, alloc_sz, true);
- if (ret < 1)
+ if (ret < 0)
goto resized;
} else {
map_offset = 0;
*/
void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
- /* for non-single file segments, we can close fd here */
- if (!internal_config.single_file_segments)
- close(fd);
if (va == MAP_FAILED) {
RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
}
if (va != addr) {
RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
- goto mapped;
+ munmap(va, alloc_sz);
+ goto resized;
}
rte_iova_t iova = rte_mem_virt2iova(addr);
(unsigned int)(alloc_sz >> 20));
goto mapped;
}
+ /* for non-single file segments, we can close fd here */
+ if (!internal_config.single_file_segments)
+ close(fd);
+
*(int *)addr = *(int *)addr;
ms->addr = addr;
resize_hugefile(fd, map_offset, alloc_sz, false);
if (is_zero_length(fd)) {
struct msl_entry *te = get_msl_entry_by_idx(list_idx);
- if (te != NULL && te->fd >= 0) {
- close(te->fd);
+ /* te->fd is equivalent to fd */
+ if (te != NULL && te->fd >= 0)
te->fd = -1;
- }
/* ignore errors, can't make it any worse */
unlink(path);
+ close(fd);
}
+ /* if we're not removing the file, fd stays in the tailq */
} else {
close(fd);
unlink(path);
*/
if (is_zero_length(fd)) {
struct msl_entry *te = get_msl_entry_by_idx(list_idx);
- if (te != NULL && te->fd >= 0) {
- close(te->fd);
+ /* te->fd is equivalent to fd */
+ if (te != NULL && te->fd >= 0)
te->fd = -1;
- }
unlink(path);
+ close(fd);
}
+ /* if we're not removing the file, fd stays in the tailq */
ret = 0;
} else {
/* if we're able to take out a write lock, we're the last one
}
out:
wa->segs_allocated = i;
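+ /* if we allocated anything, the memseg list has changed - bump its
+ * version so that secondaries know to resync
+ */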
+ if (i > 0)
+ cur_msl->version++;
return 1;
}
/* msl is const */
found_msl = &mcfg->memsegs[msl_idx];
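+ /* the list is about to change - mark it so secondaries resync it */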
+ found_msl->version++;
+
rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
+
if (free_seg(wa->ms, wa->hi, msl_idx, seg_idx))
return -1;
wa.socket = socket;
wa.segs_allocated = 0;
- ret = rte_memseg_list_walk(alloc_seg_walk, &wa);
+ ret = memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
if (ret == 0) {
RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
__func__);
struct free_walk_param wa;
int i, walk_res;
+ /* if this page is marked as unfreeable, fail */
+ if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
+ RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
+ ret = -1;
+ continue;
+ }
+
memset(&wa, 0, sizeof(wa));
for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
wa.ms = cur;
wa.hi = hi;
- walk_res = rte_memseg_list_walk(free_seg_walk, &wa);
+ walk_res = memseg_list_walk_thread_unsafe(free_seg_walk, &wa);
if (walk_res == 1)
continue;
if (walk_res == 0)
return eal_memalloc_free_seg_bulk(&ms, 1);
}
+
+static int
+sync_chunk(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx, bool used, int start, int end)
+{
+ struct rte_fbarray *l_arr, *p_arr;
+ int i, ret, chunk_len, diff_len;
+
+ l_arr = &local_msl->memseg_arr;
+ p_arr = &primary_msl->memseg_arr;
+
+ /* we need to aggregate allocations/deallocations into bigger chunks,
+ * as we don't want to spam the user with per-page callbacks.
+ *
+ * to avoid any potential issues, we also want to trigger
+ * deallocation callbacks *before* we actually deallocate
+ * memory, so that the user application can wrap up its use of
+ * the memory before it goes away.
+ */
+
+ chunk_len = end - start;
+
+ /* find how many contiguous pages we can map/unmap for this chunk */
+ diff_len = used ?
+ rte_fbarray_find_contig_free(l_arr, start) :
+ rte_fbarray_find_contig_used(l_arr, start);
+
+ /* has to be at least one page */
+ if (diff_len < 1)
+ return -1;
+
+ diff_len = RTE_MIN(chunk_len, diff_len);
+
+ /* if we are freeing memory, notify the application */
+ if (!used) {
+ struct rte_memseg *ms;
+ void *start_va;
+ size_t len, page_sz;
+
+ ms = rte_fbarray_get(l_arr, start);
+ start_va = ms->addr;
+ page_sz = (size_t)primary_msl->page_sz;
+ len = page_sz * diff_len;
+
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
+ start_va, len);
+ }
+
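+ /* sync each affected segment: map or unmap it in the local
+ * (secondary) memseg list
+ */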
+ for (i = 0; i < diff_len; i++) {
+ struct rte_memseg *p_ms, *l_ms;
+ int seg_idx = start + i;
+
+ l_ms = rte_fbarray_get(l_arr, seg_idx);
+ p_ms = rte_fbarray_get(p_arr, seg_idx);
+
+ if (l_ms == NULL || p_ms == NULL)
+ return -1;
+
+ if (used) {
+ ret = alloc_seg(l_ms, p_ms->addr,
+ p_ms->socket_id, hi,
+ msl_idx, seg_idx);
+ if (ret < 0)
+ return -1;
+ rte_fbarray_set_used(l_arr, seg_idx);
+ } else {
+ ret = free_seg(l_ms, hi, msl_idx, seg_idx);
+ rte_fbarray_set_free(l_arr, seg_idx);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ /* if we just allocated memory, notify the application */
+ if (used) {
+ struct rte_memseg *ms;
+ void *start_va;
+ size_t len, page_sz;
+
+ ms = rte_fbarray_get(l_arr, start);
+ start_va = ms->addr;
+ page_sz = (size_t)primary_msl->page_sz;
+ len = page_sz * diff_len;
+
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
+ start_va, len);
+ }
+
+ /* calculate how much we can advance until next chunk */
+ diff_len = used ?
+ rte_fbarray_find_contig_used(l_arr, start) :
+ rte_fbarray_find_contig_free(l_arr, start);
+ ret = RTE_MIN(chunk_len, diff_len);
+
+ return ret;
+}
+
+static int
+sync_status(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx, bool used)
+{
+ struct rte_fbarray *l_arr, *p_arr;
+ int p_idx, l_chunk_len, p_chunk_len, ret;
+ int start, end;
+
+ /* this is a little bit tricky, but the basic idea is - walk both lists
+ * and spot any places where there are discrepancies. walking both lists
+ * and noting discrepancies in a single pass is a hard problem, so we do
+ * it in two passes - first we spot any places where allocated segments
+ * mismatch (i.e. ensure that everything that's allocated in the primary
+ * is also allocated in the secondary), and then we repeat the process
+ * by looking at free segments instead.
+ *
+ * we also need to aggregate changes into chunks, as we have to call
+ * callbacks per allocation, not per page.
+ */
+ l_arr = &local_msl->memseg_arr;
+ p_arr = &primary_msl->memseg_arr;
+
+ if (used)
+ p_idx = rte_fbarray_find_next_used(p_arr, 0);
+ else
+ p_idx = rte_fbarray_find_next_free(p_arr, 0);
+
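+ /* walk all chunks of the requested type (used or free) in the
+ * primary list
+ */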
+ while (p_idx >= 0) {
+ int next_chunk_search_idx;
+
+ if (used) {
+ p_chunk_len = rte_fbarray_find_contig_used(p_arr,
+ p_idx);
+ l_chunk_len = rte_fbarray_find_contig_used(l_arr,
+ p_idx);
+ } else {
+ p_chunk_len = rte_fbarray_find_contig_free(p_arr,
+ p_idx);
+ l_chunk_len = rte_fbarray_find_contig_free(l_arr,
+ p_idx);
+ }
+ /* best case scenario - no differences (or the local chunk is bigger,
+ * which will be fixed during the next iteration) - look for the next
+ * chunk
+ */
+ if (l_chunk_len >= p_chunk_len) {
+ next_chunk_search_idx = p_idx + p_chunk_len;
+ goto next_chunk;
+ }
+
+ /* if both chunks start at the same point, skip parts we know
+ * are identical, and sync the rest. each call to sync_chunk
+ * will only sync contiguous segments, so we need to call this
+ * until we are sure there are no more differences in this
+ * chunk.
+ */
+ start = p_idx + l_chunk_len;
+ end = p_idx + p_chunk_len;
+ do {
+ ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
+ used, start, end);
+ start += ret;
+ } while (start < end && ret >= 0);
+ /* if ret is negative, something went wrong */
+ if (ret < 0)
+ return -1;
+
+ next_chunk_search_idx = p_idx + p_chunk_len;
+next_chunk:
+ /* skip to end of this chunk */
+ if (used) {
+ p_idx = rte_fbarray_find_next_used(p_arr,
+ next_chunk_search_idx);
+ } else {
+ p_idx = rte_fbarray_find_next_free(p_arr,
+ next_chunk_search_idx);
+ }
+ }
+ return 0;
+}
+
+static int
+sync_existing(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx)
+{
+ int ret;
+
+ /* ensure all allocated space is the same in both lists */
+ ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
+ if (ret < 0)
+ return -1;
+
+ /* ensure all unallocated space is the same in both lists */
+ ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
+ if (ret < 0)
+ return -1;
+
+ /* update version number */
+ local_msl->version = primary_msl->version;
+
+ return 0;
+}
+
+static int
+sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *primary_msl, *local_msl;
+ struct hugepage_info *hi = NULL;
+ unsigned int i;
+ int msl_idx;
+ bool new_msl = false;
+
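+ /* find this list's index so that we can look up our local copy of it */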
+ msl_idx = msl - mcfg->memsegs;
+ primary_msl = &mcfg->memsegs[msl_idx];
+ local_msl = &local_memsegs[msl_idx];
+
+ /* check if secondary has this memseg list set up */
+ if (local_msl->base_va == NULL) {
+ char name[PATH_MAX];
+ int ret;
+ new_msl = true;
+
+ /* create distinct fbarrays for each secondary */
+ snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
+ primary_msl->memseg_arr.name, getpid());
+
+ ret = rte_fbarray_init(&local_msl->memseg_arr, name,
+ primary_msl->memseg_arr.len,
+ primary_msl->memseg_arr.elt_sz);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
+ return -1;
+ }
+
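+ /* the local list tracks the same VA region as the primary's */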
+ local_msl->base_va = primary_msl->base_va;
+ }
+
+ for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+ uint64_t cur_sz =
+ internal_config.hugepage_info[i].hugepage_sz;
+ uint64_t msl_sz = primary_msl->page_sz;
+ if (msl_sz == cur_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ return -1;
+ }
+
+ /* if versions don't match, or if we have just created a local copy
+ * of this memseg list, synchronize everything
+ */
+ if ((new_msl || local_msl->version != primary_msl->version) &&
+ sync_existing(primary_msl, local_msl, hi, msl_idx))
+ return -1;
+ return 0;
+}
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+ /* nothing to be done in primary */
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return 0;
+
+ if (memseg_list_walk_thread_unsafe(sync_walk, NULL))
+ return -1;
+ return 0;
+}