X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=lib%2Flibrte_eal%2Flinuxapp%2Feal%2Feal_memalloc.c;h=1f553dd2a6c7c81ac0bb81005600ac0c944d3e48;hb=b4953225cea42bb4da370e012a6be41c54c80967;hp=11ef742ad0f347e03f3ed7c332f00e3e511d6cf3;hpb=a5ff05d60fc59c90b2cfd65613eee39af9c0e33e;p=dpdk.git diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c index 11ef742ad0..1f553dd2a6 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c +++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c @@ -27,6 +27,7 @@ #include #include #endif +#include #include #include @@ -39,6 +40,34 @@ #include "eal_internal_cfg.h" #include "eal_memalloc.h" +/* + * not all kernel version support fallocate on hugetlbfs, so fall back to + * ftruncate and disallow deallocation if fallocate is not supported. + */ +static int fallocate_supported = -1; /* unknown */ + +/* + * If each page is in a separate file, we can close fd's since we need each fd + * only once. However, in single file segments mode, we can get away with using + * a single fd for entire segments, but we need to store them somewhere. Each + * fd is different within each process, so we'll store them in a local tailq. + */ +struct msl_entry { + TAILQ_ENTRY(msl_entry) next; + unsigned int msl_idx; + int fd; +}; + +/** Double linked list of memseg list fd's. */ +TAILQ_HEAD(msl_entry_list, msl_entry); + +static struct msl_entry_list msl_entry_list = + TAILQ_HEAD_INITIALIZER(msl_entry_list); +static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER; + +/** local copy of a memory map, used to synchronize memory hotplug in MP */ +static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS]; + static sigjmp_buf huge_jmpenv; static void __rte_unused huge_sigbus_handler(int signo __rte_unused) @@ -129,18 +158,126 @@ resotre_numa(int *oldpolicy, struct bitmask *oldmask) } #endif +static struct msl_entry * +get_msl_entry_by_idx(unsigned int list_idx) +{ + struct msl_entry *te; + + rte_spinlock_lock(&tailq_lock); + + TAILQ_FOREACH(te, &msl_entry_list, next) { + if (te->msl_idx == list_idx) + break; + } + if (te == NULL) { + /* doesn't exist, so create it and set fd to -1 */ + + te = malloc(sizeof(*te)); + if (te == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n", + __func__); + goto unlock; + } + te->msl_idx = list_idx; + te->fd = -1; + TAILQ_INSERT_TAIL(&msl_entry_list, te, next); + } +unlock: + rte_spinlock_unlock(&tailq_lock); + return te; +} + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +get_file_size(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +/* + * uses fstat to check if file size on disk is zero (regular fstat won't show + * true file size due to how fallocate works) + */ +static bool +is_zero_length(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return false; + return st.st_blocks == 0; +} + +/* we cannot use rte_memseg_list_walk() here because we will be holding a + * write lock whenever we enter every function in this file, however copying + * the same iteration code everywhere is not ideal as well. so, use a lockless + * copy of memseg list walk here. 
+ */ +static int +memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int i, ret = 0; + + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + + if (msl->base_va == NULL) + continue; + + ret = func(msl, arg); + if (ret < 0) + return -1; + if (ret > 0) + return 1; + } + return 0; +} + static int get_seg_fd(char *path, int buflen, struct hugepage_info *hi, unsigned int list_idx, unsigned int seg_idx) { int fd; - eal_get_hugefile_path(path, buflen, hi->hugedir, - list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); - fd = open(path, O_CREAT | O_RDWR, 0600); - if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, - strerror(errno)); - return -1; + + if (internal_config.single_file_segments) { + /* + * try to find a tailq entry, for this memseg list, or create + * one if it doesn't exist. + */ + struct msl_entry *te = get_msl_entry_by_idx(list_idx); + if (te == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n", + __func__); + return -1; + } else if (te->fd < 0) { + /* create a hugepage file */ + eal_get_hugefile_path(path, buflen, hi->hugedir, + list_idx); + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + te->fd = fd; + } else { + fd = te->fd; + } + } else { + /* one file per page, just create it */ + eal_get_hugefile_path(path, buflen, hi->hugedir, + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, + strerror(errno)); + return -1; + } } return fd; } @@ -172,6 +309,94 @@ static int lock(int fd, uint64_t offset, uint64_t len, int type) return 1; } +static int +resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, + bool grow) +{ + bool again = false; + do { + if (fallocate_supported == 0) { + /* we cannot deallocate memory if fallocate() is not + * supported, but locks are still needed to prevent + * primary process' initialization from clearing out + * huge pages used by this process. + */ + + if (!grow) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n", + __func__); + return -1; + } + uint64_t new_size = fa_offset + page_sz; + uint64_t cur_size = get_file_size(fd); + + /* fallocate isn't supported, fall back to ftruncate */ + if (new_size > cur_size && + ftruncate(fd, new_size) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* not being able to take out a read lock is an error */ + if (lock(fd, fa_offset, page_sz, F_RDLCK) != 1) + return -1; + } else { + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret; + + /* if fallocate() is supported, we need to take out a + * read lock on allocate (to prevent other processes + * from deallocating this page), and take out a write + * lock on deallocate (to ensure nobody else is using + * this page). + * + * we can't use flock() for this, as we actually need to + * lock part of the file, not the entire file. 
+ */ + + if (!grow) { + ret = lock(fd, fa_offset, page_sz, F_WRLCK); + + if (ret < 0) + return -1; + else if (ret == 0) + /* failed to lock, not an error */ + return 0; + } + if (fallocate(fd, flags, fa_offset, page_sz) < 0) { + if (fallocate_supported == -1 && + errno == ENOTSUP) { + RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n", + __func__); + again = true; + fallocate_supported = 0; + } else { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + } else { + fallocate_supported = 1; + + if (grow) { + /* if can't read lock, it's an error */ + if (lock(fd, fa_offset, page_sz, + F_RDLCK) != 1) + return -1; + } else { + /* if can't unlock, it's an error */ + if (lock(fd, fa_offset, page_sz, + F_UNLCK) != 1) + return -1; + } + } + } + } while (again); + return 0; +} + static int alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, struct hugepage_info *hi, unsigned int list_idx, @@ -191,34 +416,40 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, return -1; alloc_sz = hi->hugepage_sz; - - map_offset = 0; - if (ftruncate(fd, alloc_sz) < 0) { - RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", - __func__, strerror(errno)); - goto resized; - } - /* we've allocated a page - take out a read lock. we're using fcntl() - * locks rather than flock() here because doing that gives us one huge - * advantage - fcntl() locks are per-process, not per-file descriptor, - * which means that we don't have to keep the original fd's around to - * keep a lock on the file. - * - * this is useful, because when it comes to unmapping pages, we will - * have to take out a write lock (to figure out if another process still - * has this page mapped), and to do itwith flock() we'll have to use - * original fd, as lock is associated with that particular fd. with - * fcntl(), this is not necessary - we can open a new fd and use fcntl() - * on that. - */ - ret = lock(fd, map_offset, alloc_sz, F_RDLCK); - - /* this should not fail */ - if (ret != 1) { - RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n", - __func__, - strerror(errno)); - goto resized; + if (internal_config.single_file_segments) { + map_offset = seg_idx * alloc_sz; + ret = resize_hugefile(fd, map_offset, alloc_sz, true); + if (ret < 1) + goto resized; + } else { + map_offset = 0; + if (ftruncate(fd, alloc_sz) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + goto resized; + } + /* we've allocated a page - take out a read lock. we're using + * fcntl() locks rather than flock() here because doing that + * gives us one huge advantage - fcntl() locks are per-process, + * not per-file descriptor, which means that we don't have to + * keep the original fd's around to keep a lock on the file. + * + * this is useful, because when it comes to unmapping pages, we + * will have to take out a write lock (to figure out if another + * process still has this page mapped), and to do itwith flock() + * we'll have to use original fd, as lock is associated with + * that particular fd. with fcntl(), this is not necessary - we + * can open a new fd and use fcntl() on that. 
+ */ + ret = lock(fd, map_offset, alloc_sz, F_RDLCK); + + /* this should not fail */ + if (ret != 1) { + RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n", + __func__, + strerror(errno)); + goto resized; + } } /* @@ -227,7 +458,9 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, */ void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset); - close(fd); + /* for non-single file segments, we can close fd here */ + if (!internal_config.single_file_segments) + close(fd); if (va == MAP_FAILED) { RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, @@ -284,8 +517,21 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, mapped: munmap(addr, alloc_sz); resized: - close(fd); - unlink(path); + if (internal_config.single_file_segments) { + resize_hugefile(fd, map_offset, alloc_sz, false); + if (is_zero_length(fd)) { + struct msl_entry *te = get_msl_entry_by_idx(list_idx); + if (te != NULL && te->fd >= 0) { + close(te->fd); + te->fd = -1; + } + /* ignore errors, can't make it any worse */ + unlink(path); + } + } else { + close(fd); + unlink(path); + } return -1; } @@ -293,6 +539,7 @@ static int free_seg(struct rte_memseg *ms, struct hugepage_info *hi, unsigned int list_idx, unsigned int seg_idx) { + uint64_t map_offset; char path[PATH_MAX]; int fd, ret; @@ -310,21 +557,39 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, if (fd < 0) return -1; - /* if we're able to take out a write lock, we're the last one - * holding onto this page. - */ - - ret = lock(fd, 0, ms->len, F_WRLCK); - if (ret >= 0) { - /* no one else is using this page */ - if (ret == 1) + if (internal_config.single_file_segments) { + map_offset = seg_idx * ms->len; + if (resize_hugefile(fd, map_offset, ms->len, false)) + return -1; + /* if file is zero-length, we've already shrunk it, so it's + * safe to remove. + */ + if (is_zero_length(fd)) { + struct msl_entry *te = get_msl_entry_by_idx(list_idx); + if (te != NULL && te->fd >= 0) { + close(te->fd); + te->fd = -1; + } unlink(path); - ret = lock(fd, 0, ms->len, F_UNLCK); - if (ret != 1) - RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n", - __func__, path); + } + ret = 0; + } else { + /* if we're able to take out a write lock, we're the last one + * holding onto this page. 
+ */ + + ret = lock(fd, 0, ms->len, F_WRLCK); + if (ret >= 0) { + /* no one else is using this page */ + if (ret == 1) + unlink(path); + ret = lock(fd, 0, ms->len, F_UNLCK); + if (ret != 1) + RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n", + __func__, path); + } + close(fd); } - close(fd); memset(ms, 0, sizeof(*ms)); @@ -412,6 +677,8 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg) } out: wa->segs_allocated = i; + if (i > 0) + cur_msl->version++; return 1; } @@ -441,7 +708,10 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg) /* msl is const */ found_msl = &mcfg->memsegs[msl_idx]; + found_msl->version++; + rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx); + if (free_seg(wa->ms, wa->hi, msl_idx, seg_idx)) return -1; @@ -496,7 +766,7 @@ eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, wa.socket = socket; wa.segs_allocated = 0; - ret = rte_memseg_list_walk(alloc_seg_walk, &wa); + ret = memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); if (ret == 0) { RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n", __func__); @@ -537,6 +807,13 @@ eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) struct free_walk_param wa; int i, walk_res; + /* if this page is marked as unfreeable, fail */ + if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { + RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n"); + ret = -1; + continue; + } + memset(&wa, 0, sizeof(wa)); for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info); @@ -554,7 +831,7 @@ eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) wa.ms = cur; wa.hi = hi; - walk_res = rte_memseg_list_walk(free_seg_walk, &wa); + walk_res = memseg_list_walk_thread_unsafe(free_seg_walk, &wa); if (walk_res == 1) continue; if (walk_res == 0) @@ -573,3 +850,275 @@ eal_memalloc_free_seg(struct rte_memseg *ms) return eal_memalloc_free_seg_bulk(&ms, 1); } + +static int +sync_chunk(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used, int start, int end) +{ + struct rte_fbarray *l_arr, *p_arr; + int i, ret, chunk_len, diff_len; + + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + /* we need to aggregate allocations/deallocations into bigger chunks, + * as we don't want to spam the user with per-page callbacks. + * + * to avoid any potential issues, we also want to trigger + * deallocation callbacks *before* we actually deallocate + * memory, so that the user application could wrap up its use + * before it goes away. + */ + + chunk_len = end - start; + + /* find how many contiguous pages we can map/unmap for this chunk */ + diff_len = used ? 
+ rte_fbarray_find_contig_free(l_arr, start) : + rte_fbarray_find_contig_used(l_arr, start); + + /* has to be at least one page */ + if (diff_len < 1) + return -1; + + diff_len = RTE_MIN(chunk_len, diff_len); + + /* if we are freeing memory, notify the application */ + if (!used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + start_va, len); + } + + for (i = 0; i < diff_len; i++) { + struct rte_memseg *p_ms, *l_ms; + int seg_idx = start + i; + + l_ms = rte_fbarray_get(l_arr, seg_idx); + p_ms = rte_fbarray_get(p_arr, seg_idx); + + if (l_ms == NULL || p_ms == NULL) + return -1; + + if (used) { + ret = alloc_seg(l_ms, p_ms->addr, + p_ms->socket_id, hi, + msl_idx, seg_idx); + if (ret < 0) + return -1; + rte_fbarray_set_used(l_arr, seg_idx); + } else { + ret = free_seg(l_ms, hi, msl_idx, seg_idx); + rte_fbarray_set_free(l_arr, seg_idx); + if (ret < 0) + return -1; + } + } + + /* if we just allocated memory, notify the application */ + if (used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + start_va, len); + } + + /* calculate how much we can advance until next chunk */ + diff_len = used ? + rte_fbarray_find_contig_used(l_arr, start) : + rte_fbarray_find_contig_free(l_arr, start); + ret = RTE_MIN(chunk_len, diff_len); + + return ret; +} + +static int +sync_status(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used) +{ + struct rte_fbarray *l_arr, *p_arr; + int p_idx, l_chunk_len, p_chunk_len, ret; + int start, end; + + /* this is a little bit tricky, but the basic idea is - walk both lists + * and spot any places where there are discrepancies. walking both lists + * and noting discrepancies in a single go is a hard problem, so we do + * it in two passes - first we spot any places where allocated segments + * mismatch (i.e. ensure that everything that's allocated in the primary + * is also allocated in the secondary), and then we do it by looking at + * free segments instead. + * + * we also need to aggregate changes into chunks, as we have to call + * callbacks per allocation, not per page. + */ + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + if (used) + p_idx = rte_fbarray_find_next_used(p_arr, 0); + else + p_idx = rte_fbarray_find_next_free(p_arr, 0); + + while (p_idx >= 0) { + int next_chunk_search_idx; + + if (used) { + p_chunk_len = rte_fbarray_find_contig_used(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_used(l_arr, + p_idx); + } else { + p_chunk_len = rte_fbarray_find_contig_free(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_free(l_arr, + p_idx); + } + /* best case scenario - no differences (or bigger, which will be + * fixed during next iteration), look for next chunk + */ + if (l_chunk_len >= p_chunk_len) { + next_chunk_search_idx = p_idx + p_chunk_len; + goto next_chunk; + } + + /* if both chunks start at the same point, skip parts we know + * are identical, and sync the rest. each call to sync_chunk + * will only sync contiguous segments, so we need to call this + * until we are sure there are no more differences in this + * chunk. 
+ */ + start = p_idx + l_chunk_len; + end = p_idx + p_chunk_len; + do { + ret = sync_chunk(primary_msl, local_msl, hi, msl_idx, + used, start, end); + start += ret; + } while (start < end && ret >= 0); + /* if ret is negative, something went wrong */ + if (ret < 0) + return -1; + + next_chunk_search_idx = p_idx + p_chunk_len; +next_chunk: + /* skip to end of this chunk */ + if (used) { + p_idx = rte_fbarray_find_next_used(p_arr, + next_chunk_search_idx); + } else { + p_idx = rte_fbarray_find_next_free(p_arr, + next_chunk_search_idx); + } + } + return 0; +} + +static int +sync_existing(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx) +{ + int ret; + + /* ensure all allocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, true); + if (ret < 0) + return -1; + + /* ensure all unallocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, false); + if (ret < 0) + return -1; + + /* update version number */ + local_msl->version = primary_msl->version; + + return 0; +} + +static int +sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + struct hugepage_info *hi = NULL; + unsigned int i; + int msl_idx; + bool new_msl = false; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + /* check if secondary has this memseg list set up */ + if (local_msl->base_va == NULL) { + char name[PATH_MAX]; + int ret; + new_msl = true; + + /* create distinct fbarrays for each secondary */ + snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i", + primary_msl->memseg_arr.name, getpid()); + + ret = rte_fbarray_init(&local_msl->memseg_arr, name, + primary_msl->memseg_arr.len, + primary_msl->memseg_arr.elt_sz); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n"); + return -1; + } + + local_msl->base_va = primary_msl->base_va; + } + + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + uint64_t cur_sz = + internal_config.hugepage_info[i].hugepage_sz; + uint64_t msl_sz = primary_msl->page_sz; + if (msl_sz == cur_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + return -1; + } + + /* if versions don't match or if we have just allocated a new + * memseg list, synchronize everything + */ + if ((new_msl || local_msl->version != primary_msl->version) && + sync_existing(primary_msl, local_msl, hi, msl_idx)) + return -1; + return 0; +} + + +int +eal_memalloc_sync_with_primary(void) +{ + /* nothing to be done in primary */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + return 0; + + if (memseg_list_walk_thread_unsafe(sync_walk, NULL)) + return -1; + return 0; +}
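
The patch above leans on a few OS-level mechanisms that are easier to see in isolation; the short sketches below are editorial illustrations under stated assumptions, not part of the commit. First, in single-file-segments mode there is one backing file per memseg list rather than one per page, so the fd has to be reused across calls within a process; the patch caches those fds in a tailq keyed by list index and guarded by a spinlock. The same pattern, sketched with the standard <sys/queue.h> macros and a pthread mutex instead of DPDK's rte_spinlock (all names here are illustrative):

/* Sketch of a per-list fd cache in the style of the patch's msl_entry tailq,
 * using <sys/queue.h> and a pthread mutex instead of rte_spinlock. Entries
 * are created lazily with fd = -1; the caller opens the file on first use.
 */
#include <pthread.h>
#include <stdlib.h>
#include <sys/queue.h>

struct fd_entry {
	TAILQ_ENTRY(fd_entry) next;
	unsigned int list_idx;
	int fd;
};

static TAILQ_HEAD(fd_list, fd_entry) fd_cache =
		TAILQ_HEAD_INITIALIZER(fd_cache);
static pthread_mutex_t fd_cache_lock = PTHREAD_MUTEX_INITIALIZER;

static struct fd_entry *
get_fd_entry(unsigned int list_idx)
{
	struct fd_entry *e;

	pthread_mutex_lock(&fd_cache_lock);
	TAILQ_FOREACH(e, &fd_cache, next) {
		if (e->list_idx == list_idx)
			break;
	}
	if (e == NULL) {
		/* first lookup for this list: create an empty entry */
		e = malloc(sizeof(*e));
		if (e != NULL) {
			e->list_idx = list_idx;
			e->fd = -1;
			TAILQ_INSERT_TAIL(&fd_cache, e, next);
		}
	}
	pthread_mutex_unlock(&fd_cache_lock);
	return e;
}

The cache is per-process on purpose: fds differ between primary and secondary processes, which is why the patch keeps this state local rather than in shared memory.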
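
Second, the patch's lock() helper uses fcntl() record locks rather than flock(), because fcntl() locks cover a byte range and are owned by the process rather than by a particular file descriptor, so a freshly opened fd can be used to query or release a lock taken earlier. A minimal standalone sketch of that pattern (the return convention mirrors the patch's helper, but this is an illustration, not the DPDK code itself):

/* Byte-range locking with fcntl(): returns 1 on success, 0 if another
 * process holds a conflicting lock on the region, -1 on unexpected error.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

static int
lock_region(int fd, uint64_t offset, uint64_t len, int type)
{
	struct flock fl = {
		.l_type = type,          /* F_RDLCK, F_WRLCK or F_UNLCK */
		.l_whence = SEEK_SET,
		.l_start = (off_t)offset,
		.l_len = (off_t)len,
	};

	if (fcntl(fd, F_SETLK, &fl) == -1) {
		/* a conflicting lock is held by someone else */
		if (errno == EAGAIN || errno == EACCES)
			return 0;
		perror("fcntl");
		return -1;
	}
	return 1;
}

In the allocation path a read lock on the page's region keeps the primary's cleanup from reclaiming it; in the free path, successfully taking a write lock on the same region is the signal that no other process still maps the page.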
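
Third, resize_hugefile() grows or shrinks the single backing file: growth can always fall back to ftruncate(), but giving pages back to the system needs fallocate() with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, which not every kernel supports on hugetlbfs. A reduced sketch of that grow/shrink decision, with the fcntl locking from the patch stripped out for brevity (function and variable names are made up):

/* Grow with fallocate()/ftruncate(), shrink by punching a hole; remembers
 * whether fallocate() works so the fallback is only taken once.
 * Locking is omitted here.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>        /* fallocate(), FALLOC_FL_* */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static int have_fallocate = -1;	/* -1 = unknown, 0 = no, 1 = yes */

static int
resize_file(int fd, uint64_t offset, uint64_t page_sz, bool grow)
{
	if (have_fallocate == 0) {
		/* without fallocate() we can only ever grow the file */
		if (!grow)
			return -1;
		struct stat st;
		if (fstat(fd, &st) < 0)
			return -1;
		uint64_t new_size = offset + page_sz;
		if (new_size > (uint64_t)st.st_size &&
				ftruncate(fd, new_size) < 0)
			return -1;
		return 0;
	}

	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;

	if (fallocate(fd, flags, offset, page_sz) < 0) {
		if (have_fallocate == -1 && errno == ENOTSUP) {
			/* remember the result and retry via ftruncate */
			have_fallocate = 0;
			return resize_file(fd, offset, page_sz, grow);
		}
		perror("fallocate");
		return -1;
	}
	have_fallocate = 1;
	return 0;
}

This mirrors the patch's behaviour: when fallocate() is unavailable, allocation still works via ftruncate(), but deallocation is refused, which is why the real code also logs that hugepage deallocation will be disabled.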
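
Finally, because the shrink path uses FALLOC_FL_KEEP_SIZE, st_size never decreases, so the patch's is_zero_length() checks st_blocks instead to decide when the backing file holds no data and can be unlinked. A small compile-and-run illustration of that distinction (the temp file path is arbitrary, and the result assumes a filesystem that supports hole punching):

/* After punching a hole over the whole file with FALLOC_FL_KEEP_SIZE,
 * st_size is unchanged but st_blocks drops to 0.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/tmp/punch_demo";
	int fd = open(path, O_CREAT | O_RDWR | O_TRUNC, 0600);
	if (fd < 0)
		return 1;

	/* allocate 1 MiB of real blocks, then punch them all out */
	if (fallocate(fd, 0, 0, 1 << 20) == 0 &&
			fallocate(fd, FALLOC_FL_PUNCH_HOLE |
					FALLOC_FL_KEEP_SIZE, 0, 1 << 20) == 0) {
		struct stat st;
		fstat(fd, &st);
		printf("st_size=%lld st_blocks=%lld\n",
				(long long)st.st_size,
				(long long)st.st_blocks);
		/* typically prints st_size=1048576 st_blocks=0 */
	}
	close(fd);
	unlink(path);
	return 0;
}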