X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=lib%2Flibrte_eal%2Flinuxapp%2Feal%2Feal_memalloc.c;h=9c41046fcb80775cb394d074e23c618482d4ce3a;hb=2a04139f66b457d069445a65fd11722d91463bcb;hp=45ea0adaabab71dadbad570a4971c6b9755d65a8;hpb=582bed1e1d1ddfd51bc7d61b95b0b8b55e47c6f9;p=dpdk.git

diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 45ea0adaab..9c41046fcb 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -27,6 +27,7 @@
 #include <numa.h>
 #include <numaif.h>
 #endif
+#include <linux/falloc.h>
 
 #include <rte_common.h>
 #include <rte_log.h>
@@ -39,6 +40,31 @@
 #include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
 
+/*
+ * not all kernel versions support fallocate() on hugetlbfs, so fall back to
+ * ftruncate() and disallow deallocation if fallocate() is not supported.
+ */
+static int fallocate_supported = -1; /* unknown */
+
+/*
+ * If each page is in a separate file, we can close the fd's, since each fd is
+ * needed only once. However, in single file segments mode, one fd is shared by
+ * an entire memseg list, so it has to be stored somewhere. Each fd is
+ * different within each process, so we'll store them in a local tailq.
+ */
+struct msl_entry {
+	TAILQ_ENTRY(msl_entry) next;
+	unsigned int msl_idx;
+	int fd;
+};
+
+/** Doubly-linked list of memseg list fd's. */
+TAILQ_HEAD(msl_entry_list, msl_entry);
+
+static struct msl_entry_list msl_entry_list =
+	TAILQ_HEAD_INITIALIZER(msl_entry_list);
+static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;
+
 static sigjmp_buf huge_jmpenv;
 
 static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
@@ -129,18 +155,100 @@ resotre_numa(int *oldpolicy, struct bitmask *oldmask)
 }
 #endif
 
+static struct msl_entry *
+get_msl_entry_by_idx(unsigned int list_idx)
+{
+	struct msl_entry *te;
+
+	rte_spinlock_lock(&tailq_lock);
+
+	TAILQ_FOREACH(te, &msl_entry_list, next) {
+		if (te->msl_idx == list_idx)
+			break;
+	}
+	if (te == NULL) {
+		/* doesn't exist, so create it and set fd to -1 */
+
+		te = malloc(sizeof(*te));
+		if (te == NULL) {
+			RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+				__func__);
+			goto unlock;
+		}
+		te->msl_idx = list_idx;
+		te->fd = -1;
+		TAILQ_INSERT_TAIL(&msl_entry_list, te, next);
+	}
+unlock:
+	rte_spinlock_unlock(&tailq_lock);
+	return te;
+}
+
+/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+get_file_size(int fd)
+{
+	struct stat st;
+	if (fstat(fd, &st) < 0)
+		return 0;
+	return st.st_size;
+}
+
+/*
+ * uses fstat to check if a file's size on disk is zero (st_size won't show
+ * the true size once fallocate() has punched holes, so check st_blocks)
+ */
+static bool
+is_zero_length(int fd)
+{
+	struct stat st;
+	if (fstat(fd, &st) < 0)
+		return false;
+	return st.st_blocks == 0;
+}
+
 static int
 get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
 		unsigned int list_idx, unsigned int seg_idx)
 {
 	int fd;
-	eal_get_hugefile_path(path, buflen, hi->hugedir,
-			list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
-	fd = open(path, O_CREAT | O_RDWR, 0600);
-	if (fd < 0) {
-		RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
-				strerror(errno));
-		return -1;
+
+	if (internal_config.single_file_segments) {
+		/*
+		 * try to find a tailq entry for this memseg list, or create
+		 * one if it doesn't exist.
+		 */
+		struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+		if (te == NULL) {
+			RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+				__func__);
+			return -1;
+		} else if (te->fd < 0) {
+			/* create a hugepage file */
+			eal_get_hugefile_path(path, buflen, hi->hugedir,
+					list_idx);
+			fd = open(path, O_CREAT | O_RDWR, 0600);
+			if (fd < 0) {
+				RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+					__func__, strerror(errno));
+				return -1;
+			}
+			te->fd = fd;
+		} else {
+			fd = te->fd;
+		}
+	} else {
+		/* one file per page, just create it */
+		eal_get_hugefile_path(path, buflen, hi->hugedir,
+				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+		fd = open(path, O_CREAT | O_RDWR, 0600);
+		if (fd < 0) {
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+					strerror(errno));
+			return -1;
+		}
 	}
 	return fd;
 }
@@ -172,6 +280,94 @@ static int lock(int fd, uint64_t offset, uint64_t len, int type)
 	return 1;
 }
 
+static int
+resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz,
+		bool grow)
+{
+	bool again = false;
+	do {
+		if (fallocate_supported == 0) {
+			/* we cannot deallocate memory if fallocate() is not
+			 * supported, but locks are still needed to prevent
+			 * primary process' initialization from clearing out
+			 * huge pages used by this process.
+			 */
+
+			if (!grow) {
+				RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
+					__func__);
+				return -1;
+			}
+			uint64_t new_size = fa_offset + page_sz;
+			uint64_t cur_size = get_file_size(fd);
+
+			/* fallocate isn't supported, fall back to ftruncate */
+			if (new_size > cur_size &&
+					ftruncate(fd, new_size) < 0) {
+				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+					__func__, strerror(errno));
+				return -1;
+			}
+			/* a failed read lock is an error; else we are done */
+			return lock(fd, fa_offset, page_sz, F_RDLCK) == 1 ?
+					0 : -1;
+		} else {
+			int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+					FALLOC_FL_KEEP_SIZE;
+			int ret;
+
+			/* if fallocate() is supported, we need to take out a
+			 * read lock on allocate (to prevent other processes
+			 * from deallocating this page), and take out a write
+			 * lock on deallocate (to ensure nobody else is using
+			 * this page).
+			 *
+			 * we can't use flock() for this, as we actually need to
+			 * lock part of the file, not the entire file.
+			 */
+
+			if (!grow) {
+				ret = lock(fd, fa_offset, page_sz, F_WRLCK);
+
+				if (ret < 0)
+					return -1;
+				else if (ret == 0)
+					/* failed to lock, not an error */
+					return 0;
+			}
+			if (fallocate(fd, flags, fa_offset, page_sz) < 0) {
+				if (fallocate_supported == -1 &&
+						errno == ENOTSUP) {
+					RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
+						__func__);
+					again = true;
+					fallocate_supported = 0;
+				} else {
+					RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+						__func__,
+						strerror(errno));
+					return -1;
+				}
+			} else {
+				fallocate_supported = 1;
+
+				if (grow) {
+					/* if can't read lock, it's an error */
+					if (lock(fd, fa_offset, page_sz,
+							F_RDLCK) != 1)
+						return -1;
+				} else {
+					/* if can't unlock, it's an error */
+					if (lock(fd, fa_offset, page_sz,
+							F_UNLCK) != 1)
+						return -1;
+				}
+			}
+		}
+	} while (again);
+	return 0;
+}
+
 static int
 alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 		struct hugepage_info *hi, unsigned int list_idx,
@@ -191,34 +387,40 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 		return -1;
 
 	alloc_sz = hi->hugepage_sz;
-
-	map_offset = 0;
-	if (ftruncate(fd, alloc_sz) < 0) {
-		RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
-			__func__, strerror(errno));
-		goto resized;
-	}
-	/* we've allocated a page - take out a read lock. we're using fcntl()
-	 * locks rather than flock() here because doing that gives us one huge
-	 * advantage - fcntl() locks are per-process, not per-file descriptor,
-	 * which means that we don't have to keep the original fd's around to
-	 * keep a lock on the file.
-	 *
-	 * this is useful, because when it comes to unmapping pages, we will
-	 * have to take out a write lock (to figure out if another process still
-	 * has this page mapped), and to do itwith flock() we'll have to use
-	 * original fd, as lock is associated with that particular fd. with
-	 * fcntl(), this is not necessary - we can open a new fd and use fcntl()
-	 * on that.
-	 */
-	ret = lock(fd, map_offset, alloc_sz, F_RDLCK);
-
-	/* this should not fail */
-	if (ret != 1) {
-		RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
-			__func__,
-			strerror(errno));
-		goto resized;
+	if (internal_config.single_file_segments) {
+		map_offset = seg_idx * alloc_sz;
+		ret = resize_hugefile(fd, map_offset, alloc_sz, true);
+		if (ret < 0)
+			goto resized;
+	} else {
+		map_offset = 0;
+		if (ftruncate(fd, alloc_sz) < 0) {
+			RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+				__func__, strerror(errno));
+			goto resized;
+		}
+		/* we've allocated a page - take out a read lock. we're using
+		 * fcntl() locks rather than flock() here because doing that
+		 * gives us one huge advantage - fcntl() locks are per-process,
+		 * not per-file descriptor, which means that we don't have to
+		 * keep the original fd's around to keep a lock on the file.
+		 *
+		 * this is useful, because when it comes to unmapping pages, we
+		 * will have to take out a write lock (to figure out if another
+		 * process still has this page mapped), and to do it with
+		 * flock() we'd have to use the original fd, as the lock is
+		 * associated with that particular fd. with fcntl(), this is not
+		 * necessary - we can open a new fd and use fcntl() on that.
+		 */
+		ret = lock(fd, map_offset, alloc_sz, F_RDLCK);
+
+		/* this should not fail */
+		if (ret != 1) {
+			RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
+				__func__,
+				strerror(errno));
+			goto resized;
+		}
 	}
 
 	/*
@@ -227,7 +429,9 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 	 */
 	void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
 			MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
-	close(fd);
+	/* for non-single file segments, we can close fd here */
+	if (!internal_config.single_file_segments)
+		close(fd);
 
 	if (va == MAP_FAILED) {
 		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
@@ -284,11 +488,85 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 mapped:
 	munmap(addr, alloc_sz);
 resized:
-	close(fd);
-	unlink(path);
+	if (internal_config.single_file_segments) {
+		resize_hugefile(fd, map_offset, alloc_sz, false);
+		if (is_zero_length(fd)) {
+			struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+			if (te != NULL && te->fd >= 0) {
+				close(te->fd);
+				te->fd = -1;
+			}
+			/* ignore errors, can't make it any worse */
+			unlink(path);
+		}
+	} else {
+		close(fd);
+		unlink(path);
+	}
 	return -1;
 }
 
+static int
+free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
+		unsigned int list_idx, unsigned int seg_idx)
+{
+	uint64_t map_offset;
+	char path[PATH_MAX];
+	int fd, ret;
+
+	/* erase page data */
+	memset(ms->addr, 0, ms->len);
+
+	if (mmap(ms->addr, ms->len, PROT_READ,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
+				MAP_FAILED) {
+		RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
+		return -1;
+	}
+
+	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+	if (fd < 0)
+		return -1;
+
+	if (internal_config.single_file_segments) {
+		map_offset = seg_idx * ms->len;
+		if (resize_hugefile(fd, map_offset, ms->len, false))
+			return -1;
+		/* if file is zero-length, we've already shrunk it, so it's
+		 * safe to remove.
+		 */
+		if (is_zero_length(fd)) {
+			struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+			if (te != NULL && te->fd >= 0) {
+				close(te->fd);
+				te->fd = -1;
+			}
+			unlink(path);
+		}
+		ret = 0;
+	} else {
+		/* if we're able to take out a write lock, we're the last one
+		 * holding onto this page.
+		 */
+
+		ret = lock(fd, 0, ms->len, F_WRLCK);
+		if (ret >= 0) {
+			/* no one else is using this page */
+			if (ret == 1)
+				unlink(path);
+			ret = lock(fd, 0, ms->len, F_UNLCK);
+			if (ret != 1)
+				RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n",
+					__func__, path);
+		}
+		close(fd);
+	}
+
+	memset(ms, 0, sizeof(*ms));
+
+	return ret;
+}
+
 struct alloc_walk_param {
 	struct hugepage_info *hi;
 	struct rte_memseg **ms;
@@ -305,7 +583,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	struct alloc_walk_param *wa = arg;
 	struct rte_memseg_list *cur_msl;
 	size_t page_sz;
-	int cur_idx;
+	int cur_idx, start_idx, j;
 	unsigned int msl_idx, need, i;
 
 	if (msl->page_sz != wa->page_sz)
@@ -324,6 +602,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
 	if (cur_idx < 0)
 		return 0;
+	start_idx = cur_idx;
 
 	for (i = 0; i < need; i++, cur_idx++) {
 		struct rte_memseg *cur;
@@ -341,6 +620,25 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 			/* if exact number wasn't requested, stop */
 			if (!wa->exact)
 				goto out;
+
+			/* clean up */
+			for (j = start_idx; j < cur_idx; j++) {
+				struct rte_memseg *tmp;
+				struct rte_fbarray *arr =
+						&cur_msl->memseg_arr;
+
+				tmp = rte_fbarray_get(arr, j);
+				/* j is already an index into this msl */
+				if (free_seg(tmp, wa->hi, msl_idx, j)) {
+					RTE_LOG(ERR, EAL, "Cannot free page\n");
+					continue;
+				}
+
+				rte_fbarray_set_free(arr, j);
+			}
+			/* clear the list */
+			if (wa->ms)
+				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
 			return -1;
 		}
 		if (wa->ms)
@@ -351,7 +649,39 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 out:
 	wa->segs_allocated = i;
 	return 1;
+}
+
+struct free_walk_param {
+	struct hugepage_info *hi;
+	struct rte_memseg *ms;
+};
+static int
+free_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg_list *found_msl;
+	struct free_walk_param *wa = arg;
+	uintptr_t start_addr, end_addr;
+	int msl_idx, seg_idx;
+
+	start_addr = (uintptr_t) msl->base_va;
+	end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
+
+	if ((uintptr_t)wa->ms->addr < start_addr ||
+			(uintptr_t)wa->ms->addr >= end_addr)
+		return 0;
+
+	msl_idx = msl - mcfg->memsegs;
+	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
+	/* msl is const, so look up a writable copy in the mem config */
+	found_msl = &mcfg->memsegs[msl_idx];
+
+	rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
+	if (free_seg(wa->ms, wa->hi, msl_idx, seg_idx))
+		return -1;
+
+	return 1;
 }
 
 int
@@ -427,3 +757,55 @@ eal_memalloc_alloc_seg(size_t page_sz, int socket)
 	/* return pointer to newly allocated memseg */
 	return ms;
 }
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
+{
+	int seg, ret = 0;
+
+	/* dynamic free not supported in legacy mode */
+	if (internal_config.legacy_mem)
+		return -1;
+
+	for (seg = 0; seg < n_segs; seg++) {
+		struct rte_memseg *cur = ms[seg];
+		struct hugepage_info *hi = NULL;
+		struct free_walk_param wa;
+		int i, walk_res;
+
+		memset(&wa, 0, sizeof(wa));
+
+		for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
+				i++) {
+			hi = &internal_config.hugepage_info[i];
+			if (cur->hugepage_sz == hi->hugepage_sz)
+				break;
+		}
+		if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
+			RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+			ret = -1;
+			continue;
+		}
+
+		wa.ms = cur;
+		wa.hi = hi;
+
+		walk_res = rte_memseg_list_walk(free_seg_walk, &wa);
+		if (walk_res == 1)
+			continue;
+		if (walk_res == 0)
+			RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+		ret = -1;
+	}
+	return ret;
+}
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms)
+{
+	/* dynamic free not supported in legacy mode */
+	if (internal_config.legacy_mem)
+		return -1;
+
+	return eal_memalloc_free_seg_bulk(&ms, 1);
+}
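
Two properties of fcntl() record locks do the heavy lifting in alloc_seg(), free_seg() and resize_hugefile() above: a lock covers a byte range rather than the whole file, and it belongs to the process rather than to one file descriptor. The standalone sketch below (not part of the patch; the path and "page" size are made up for illustration) demonstrates both properties:

/*
 * Minimal sketch of the fcntl() record-lock behaviour the patch relies on.
 * Hypothetical file path; a 4K "page" stands in for a hugepage.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int
range_lock(int fd, short type, off_t off, off_t len)
{
	struct flock fl = {
		.l_type = type,		/* F_RDLCK, F_WRLCK or F_UNLCK */
		.l_whence = SEEK_SET,
		.l_start = off,
		.l_len = len,
	};
	/* F_SETLK is non-blocking: failure with EAGAIN/EACCES means another
	 * process holds a conflicting lock on this byte range.
	 */
	return fcntl(fd, F_SETLK, &fl);
}

int
main(void)
{
	const off_t page_sz = 4096;	/* stand-in for a hugepage size */
	int fd1, fd2;

	fd1 = open("/tmp/lock_demo", O_CREAT | O_RDWR, 0600);
	if (fd1 < 0 || ftruncate(fd1, 2 * page_sz) < 0)
		return EXIT_FAILURE;

	/* property 1: lock only one "page" of the file, not the whole file -
	 * this is why flock(), which is whole-file only, is unusable here
	 */
	if (range_lock(fd1, F_RDLCK, 0, page_sz) < 0)
		return EXIT_FAILURE;

	/* property 2: the lock belongs to the process, not to fd1 - a second
	 * fd on the same file can release the same range, so the original fd
	 * need not be kept around. (The flip side, worth remembering: closing
	 * any fd for the file drops all of the process's locks on it.)
	 */
	fd2 = open("/tmp/lock_demo", O_RDWR);
	if (fd2 < 0 || range_lock(fd2, F_UNLCK, 0, page_sz) < 0)
		return EXIT_FAILURE;

	printf("range locked via fd1, unlocked via fd2\n");
	close(fd2);
	close(fd1);
	unlink("/tmp/lock_demo");
	return EXIT_SUCCESS;
}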
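resize_hugefile() and is_zero_length() in turn depend on fallocate() semantics: punching a hole with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE releases the backing blocks without changing st_size, so only st_blocks reveals whether a single-file segment still backs any page. Another standalone sketch under the same caveats - illustrative path, and a regular file standing in for a hugetlbfs file (hugetlbfs itself only gained fallocate() support in newer kernels, around Linux 4.3, which is why the ftruncate() fallback exists):

/*
 * Minimal sketch of why is_zero_length() checks st_blocks, not st_size.
 * Hypothetical file path; _GNU_SOURCE exposes fallocate() and the
 * FALLOC_FL_* flags via <fcntl.h> on glibc.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(void)
{
	const off_t page_sz = 4096;	/* stand-in for a hugepage size */
	struct stat st;
	int fd = open("/tmp/punch_demo", O_CREAT | O_RDWR, 0600);

	if (fd < 0)
		return EXIT_FAILURE;

	/* grow: allocate blocks for one "page", as resize_hugefile() does */
	if (fallocate(fd, 0, 0, page_sz) < 0) {
		/* same condition the patch tests before disabling dealloc */
		if (errno == ENOTSUP)
			fprintf(stderr, "fallocate() not supported, would fall back to ftruncate()\n");
		goto out;
	}

	/* shrink: punch the range back out, keeping the nominal file size */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			0, page_sz) < 0)
		goto out;

	/* st_size still reads page_sz, but st_blocks (512-byte units) is 0
	 * again - exactly the condition is_zero_length() uses to decide the
	 * segment file can be unlinked.
	 */
	if (fstat(fd, &st) == 0)
		printf("st_size=%lld st_blocks=%lld\n",
			(long long)st.st_size, (long long)st.st_blocks);
out:
	close(fd);
	unlink("/tmp/punch_demo");
	return EXIT_SUCCESS;
}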