#include <numa.h>
#include <numaif.h>
#endif
+#include <linux/falloc.h>
#include <rte_common.h>
#include <rte_log.h>
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
+/*
+ * not all kernel version support fallocate on hugetlbfs, so fall back to
+ * ftruncate and disallow deallocation if fallocate is not supported.
+ */
+static int fallocate_supported = -1; /* unknown */
+
+/*
+ * If each page is in a separate file, we can close fd's since we need each fd
+ * only once. However, in single file segments mode, we can get away with using
+ * a single fd for entire segments, but we need to store them somewhere. Each
+ * fd is different within each process, so we'll store them in a local tailq.
+ */
+struct msl_entry {
+ TAILQ_ENTRY(msl_entry) next;
+ unsigned int msl_idx;
+ int fd;
+};
+
+/** Double linked list of memseg list fd's. */
+TAILQ_HEAD(msl_entry_list, msl_entry);
+
+static struct msl_entry_list msl_entry_list =
+ TAILQ_HEAD_INITIALIZER(msl_entry_list);
+static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;
+
static sigjmp_buf huge_jmpenv;
static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
}
#endif
+static struct msl_entry *
+get_msl_entry_by_idx(unsigned int list_idx)
+{
+ struct msl_entry *te;
+
+ rte_spinlock_lock(&tailq_lock);
+
+ TAILQ_FOREACH(te, &msl_entry_list, next) {
+ if (te->msl_idx == list_idx)
+ break;
+ }
+ if (te == NULL) {
+ /* doesn't exist, so create it and set fd to -1 */
+
+ te = malloc(sizeof(*te));
+ if (te == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+ __func__);
+ goto unlock;
+ }
+ te->msl_idx = list_idx;
+ te->fd = -1;
+ TAILQ_INSERT_TAIL(&msl_entry_list, te, next);
+ }
+unlock:
+ rte_spinlock_unlock(&tailq_lock);
+ return te;
+}
+
+/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+get_file_size(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return 0;
+ return st.st_size;
+}
+
+/*
+ * uses fstat to check if file size on disk is zero (regular fstat won't show
+ * true file size due to how fallocate works)
+ */
+static bool
+is_zero_length(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return false;
+ return st.st_blocks == 0;
+}
+
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
unsigned int list_idx, unsigned int seg_idx)
{
int fd;
- eal_get_hugefile_path(path, buflen, hi->hugedir,
- list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
- fd = open(path, O_CREAT | O_RDWR, 0600);
- if (fd < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
- strerror(errno));
- return -1;
+
+ if (internal_config.single_file_segments) {
+ /*
+ * try to find a tailq entry, for this memseg list, or create
+ * one if it doesn't exist.
+ */
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+ __func__);
+ return -1;
+ } else if (te->fd < 0) {
+ /* create a hugepage file */
+ eal_get_hugefile_path(path, buflen, hi->hugedir,
+ list_idx);
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ te->fd = fd;
+ } else {
+ fd = te->fd;
+ }
+ } else {
+ /* one file per page, just create it */
+ eal_get_hugefile_path(path, buflen, hi->hugedir,
+ list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+ strerror(errno));
+ return -1;
+ }
}
return fd;
}
return 1;
}
+static int
+resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz,
+ bool grow)
+{
+ bool again = false;
+ do {
+ if (fallocate_supported == 0) {
+ /* we cannot deallocate memory if fallocate() is not
+ * supported, but locks are still needed to prevent
+ * primary process' initialization from clearing out
+ * huge pages used by this process.
+ */
+
+ if (!grow) {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
+ __func__);
+ return -1;
+ }
+ uint64_t new_size = fa_offset + page_sz;
+ uint64_t cur_size = get_file_size(fd);
+
+ /* fallocate isn't supported, fall back to ftruncate */
+ if (new_size > cur_size &&
+ ftruncate(fd, new_size) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ /* not being able to take out a read lock is an error */
+ if (lock(fd, fa_offset, page_sz, F_RDLCK) != 1)
+ return -1;
+ } else {
+ int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_KEEP_SIZE;
+ int ret;
+
+ /* if fallocate() is supported, we need to take out a
+ * read lock on allocate (to prevent other processes
+ * from deallocating this page), and take out a write
+ * lock on deallocate (to ensure nobody else is using
+ * this page).
+ *
+ * we can't use flock() for this, as we actually need to
+ * lock part of the file, not the entire file.
+ */
+
+ if (!grow) {
+ ret = lock(fd, fa_offset, page_sz, F_WRLCK);
+
+ if (ret < 0)
+ return -1;
+ else if (ret == 0)
+ /* failed to lock, not an error */
+ return 0;
+ }
+ if (fallocate(fd, flags, fa_offset, page_sz) < 0) {
+ if (fallocate_supported == -1 &&
+ errno == ENOTSUP) {
+ RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
+ __func__);
+ again = true;
+ fallocate_supported = 0;
+ } else {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+ __func__,
+ strerror(errno));
+ return -1;
+ }
+ } else {
+ fallocate_supported = 1;
+
+ if (grow) {
+ /* if can't read lock, it's an error */
+ if (lock(fd, fa_offset, page_sz,
+ F_RDLCK) != 1)
+ return -1;
+ } else {
+ /* if can't unlock, it's an error */
+ if (lock(fd, fa_offset, page_sz,
+ F_UNLCK) != 1)
+ return -1;
+ }
+ }
+ }
+ } while (again);
+ return 0;
+}
+
static int
alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
struct hugepage_info *hi, unsigned int list_idx,
return -1;
alloc_sz = hi->hugepage_sz;
-
- map_offset = 0;
- if (ftruncate(fd, alloc_sz) < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
- __func__, strerror(errno));
- goto resized;
- }
- /* we've allocated a page - take out a read lock. we're using fcntl()
- * locks rather than flock() here because doing that gives us one huge
- * advantage - fcntl() locks are per-process, not per-file descriptor,
- * which means that we don't have to keep the original fd's around to
- * keep a lock on the file.
- *
- * this is useful, because when it comes to unmapping pages, we will
- * have to take out a write lock (to figure out if another process still
- * has this page mapped), and to do itwith flock() we'll have to use
- * original fd, as lock is associated with that particular fd. with
- * fcntl(), this is not necessary - we can open a new fd and use fcntl()
- * on that.
- */
- ret = lock(fd, map_offset, alloc_sz, F_RDLCK);
-
- /* this should not fail */
- if (ret != 1) {
- RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
- __func__,
- strerror(errno));
- goto resized;
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * alloc_sz;
+ ret = resize_hugefile(fd, map_offset, alloc_sz, true);
+ if (ret < 1)
+ goto resized;
+ } else {
+ map_offset = 0;
+ if (ftruncate(fd, alloc_sz) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ goto resized;
+ }
+ /* we've allocated a page - take out a read lock. we're using
+ * fcntl() locks rather than flock() here because doing that
+ * gives us one huge advantage - fcntl() locks are per-process,
+ * not per-file descriptor, which means that we don't have to
+ * keep the original fd's around to keep a lock on the file.
+ *
+ * this is useful, because when it comes to unmapping pages, we
+ * will have to take out a write lock (to figure out if another
+ * process still has this page mapped), and to do itwith flock()
+ * we'll have to use original fd, as lock is associated with
+ * that particular fd. with fcntl(), this is not necessary - we
+ * can open a new fd and use fcntl() on that.
+ */
+ ret = lock(fd, map_offset, alloc_sz, F_RDLCK);
+
+ /* this should not fail */
+ if (ret != 1) {
+ RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
+ __func__,
+ strerror(errno));
+ goto resized;
+ }
}
/*
*/
void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
- close(fd);
+ /* for non-single file segments, we can close fd here */
+ if (!internal_config.single_file_segments)
+ close(fd);
if (va == MAP_FAILED) {
RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
mapped:
munmap(addr, alloc_sz);
resized:
- close(fd);
- unlink(path);
+ if (internal_config.single_file_segments) {
+ resize_hugefile(fd, map_offset, alloc_sz, false);
+ if (is_zero_length(fd)) {
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te != NULL && te->fd >= 0) {
+ close(te->fd);
+ te->fd = -1;
+ }
+ /* ignore errors, can't make it any worse */
+ unlink(path);
+ }
+ } else {
+ close(fd);
+ unlink(path);
+ }
return -1;
}
+static int
+free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
+ unsigned int list_idx, unsigned int seg_idx)
+{
+ uint64_t map_offset;
+ char path[PATH_MAX];
+ int fd, ret;
+
+ /* erase page data */
+ memset(ms->addr, 0, ms->len);
+
+ if (mmap(ms->addr, ms->len, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
+ MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
+ return -1;
+ }
+
+ fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (fd < 0)
+ return -1;
+
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * ms->len;
+ if (resize_hugefile(fd, map_offset, ms->len, false))
+ return -1;
+ /* if file is zero-length, we've already shrunk it, so it's
+ * safe to remove.
+ */
+ if (is_zero_length(fd)) {
+ struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+ if (te != NULL && te->fd >= 0) {
+ close(te->fd);
+ te->fd = -1;
+ }
+ unlink(path);
+ }
+ ret = 0;
+ } else {
+ /* if we're able to take out a write lock, we're the last one
+ * holding onto this page.
+ */
+
+ ret = lock(fd, 0, ms->len, F_WRLCK);
+ if (ret >= 0) {
+ /* no one else is using this page */
+ if (ret == 1)
+ unlink(path);
+ ret = lock(fd, 0, ms->len, F_UNLCK);
+ if (ret != 1)
+ RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n",
+ __func__, path);
+ }
+ close(fd);
+ }
+
+ memset(ms, 0, sizeof(*ms));
+
+ return ret;
+}
+
struct alloc_walk_param {
struct hugepage_info *hi;
struct rte_memseg **ms;
struct alloc_walk_param *wa = arg;
struct rte_memseg_list *cur_msl;
size_t page_sz;
- int cur_idx;
+ int cur_idx, start_idx, j;
unsigned int msl_idx, need, i;
if (msl->page_sz != wa->page_sz)
cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
if (cur_idx < 0)
return 0;
+ start_idx = cur_idx;
for (i = 0; i < need; i++, cur_idx++) {
struct rte_memseg *cur;
/* if exact number wasn't requested, stop */
if (!wa->exact)
goto out;
+
+ /* clean up */
+ for (j = start_idx; j < cur_idx; j++) {
+ struct rte_memseg *tmp;
+ struct rte_fbarray *arr =
+ &cur_msl->memseg_arr;
+
+ tmp = rte_fbarray_get(arr, j);
+ if (free_seg(tmp, wa->hi, msl_idx,
+ start_idx + j)) {
+ RTE_LOG(ERR, EAL, "Cannot free page\n");
+ continue;
+ }
+
+ rte_fbarray_set_free(arr, j);
+ }
+ /* clear the list */
+ if (wa->ms)
+ memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
return -1;
}
if (wa->ms)
out:
wa->segs_allocated = i;
return 1;
+}
+
+struct free_walk_param {
+ struct hugepage_info *hi;
+ struct rte_memseg *ms;
+};
+static int
+free_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *found_msl;
+ struct free_walk_param *wa = arg;
+ uintptr_t start_addr, end_addr;
+ int msl_idx, seg_idx;
+
+ start_addr = (uintptr_t) msl->base_va;
+ end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
+
+ if ((uintptr_t)wa->ms->addr < start_addr ||
+ (uintptr_t)wa->ms->addr >= end_addr)
+ return 0;
+
+ msl_idx = msl - mcfg->memsegs;
+ seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
+ /* msl is const */
+ found_msl = &mcfg->memsegs[msl_idx];
+
+ rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
+ if (free_seg(wa->ms, wa->hi, msl_idx, seg_idx))
+ return -1;
+
+ return 1;
}
int
/* return pointer to newly allocated memseg */
return ms;
}
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
+{
+ int seg, ret = 0;
+
+ /* dynamic free not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ for (seg = 0; seg < n_segs; seg++) {
+ struct rte_memseg *cur = ms[seg];
+ struct hugepage_info *hi = NULL;
+ struct free_walk_param wa;
+ int i, walk_res;
+
+ memset(&wa, 0, sizeof(wa));
+
+ for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
+ i++) {
+ hi = &internal_config.hugepage_info[i];
+ if (cur->hugepage_sz == hi->hugepage_sz)
+ break;
+ }
+ if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ ret = -1;
+ continue;
+ }
+
+ wa.ms = cur;
+ wa.hi = hi;
+
+ walk_res = rte_memseg_list_walk(free_seg_walk, &wa);
+ if (walk_res == 1)
+ continue;
+ if (walk_res == 0)
+ RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+ ret = -1;
+ }
+ return ret;
+}
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms)
+{
+ /* dynamic free not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ return eal_memalloc_free_seg_bulk(&ms, 1);
+}