eal: add single file segments option
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 45ea0ad..9c41046 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -27,6 +27,7 @@
 #include <numa.h>
 #include <numaif.h>
 #endif
+#include <linux/falloc.h>
 
 #include <rte_common.h>
 #include <rte_log.h>
 #include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
 
+/*
+ * not all kernel versions support fallocate on hugetlbfs, so fall back to
+ * ftruncate and disallow deallocation if fallocate is not supported.
+ */
+static int fallocate_supported = -1; /* unknown */
+
+/*
+ * If each page is in a separate file, we can close fd's, since each fd is
+ * needed only once. However, in single file segments mode, one fd is reused
+ * for the entire memseg list, so it has to be stored somewhere. Each fd is
+ * different within each process, so we'll store them in a local tailq.
+ */
+struct msl_entry {
+       TAILQ_ENTRY(msl_entry) next;
+       unsigned int msl_idx;
+       int fd;
+};
+
+/** Doubly linked list of memseg list fd's. */
+TAILQ_HEAD(msl_entry_list, msl_entry);
+
+static struct msl_entry_list msl_entry_list =
+               TAILQ_HEAD_INITIALIZER(msl_entry_list);
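+/* lock protecting concurrent access to msl_entry_list */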
+static rte_spinlock_t tailq_lock = RTE_SPINLOCK_INITIALIZER;
+
 static sigjmp_buf huge_jmpenv;
 
 static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
@@ -129,18 +155,100 @@ resotre_numa(int *oldpolicy, struct bitmask *oldmask)
 }
 #endif
 
+static struct msl_entry *
+get_msl_entry_by_idx(unsigned int list_idx)
+{
+       struct msl_entry *te;
+
+       rte_spinlock_lock(&tailq_lock);
+
+       TAILQ_FOREACH(te, &msl_entry_list, next) {
+               if (te->msl_idx == list_idx)
+                       break;
+       }
+       if (te == NULL) {
+               /* doesn't exist, so create it and set fd to -1 */
+
+               te = malloc(sizeof(*te));
+               if (te == NULL) {
+                       RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+                               __func__);
+                       goto unlock;
+               }
+               te->msl_idx = list_idx;
+               te->fd = -1;
+               TAILQ_INSERT_TAIL(&msl_entry_list, te, next);
+       }
+unlock:
+       rte_spinlock_unlock(&tailq_lock);
+       return te;
+}
+
+/*
+ * uses fstat to report the current logical size (st_size) of a file
+ */
+static off_t
+get_file_size(int fd)
+{
+       struct stat st;
+       if (fstat(fd, &st) < 0)
+               return 0;
+       return st.st_size;
+}
+
+/*
+ * uses fstat to check if the space a file occupies on disk is zero - st_size
+ * won't reflect holes punched with FALLOC_FL_KEEP_SIZE, so check st_blocks
+ * instead
+ */
+static bool
+is_zero_length(int fd)
+{
+       struct stat st;
+       if (fstat(fd, &st) < 0)
+               return false;
+       return st.st_blocks == 0;
+}
+
 static int
 get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
                unsigned int list_idx, unsigned int seg_idx)
 {
        int fd;
-       eal_get_hugefile_path(path, buflen, hi->hugedir,
-                       list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
-       fd = open(path, O_CREAT | O_RDWR, 0600);
-       if (fd < 0) {
-               RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
-                               strerror(errno));
-               return -1;
+
+       if (internal_config.single_file_segments) {
+               /*
+                * try to find a tailq entry for this memseg list, or create
+                * one if it doesn't exist.
+                */
+               struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+               if (te == NULL) {
+                       RTE_LOG(ERR, EAL, "%s(): cannot allocate tailq entry for memseg list\n",
+                               __func__);
+                       return -1;
+               } else if (te->fd < 0) {
+                       /* create a hugepage file */
+                       eal_get_hugefile_path(path, buflen, hi->hugedir,
+                                       list_idx);
+                       fd = open(path, O_CREAT | O_RDWR, 0600);
+                       if (fd < 0) {
+                               RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+                                       __func__, strerror(errno));
+                               return -1;
+                       }
+                       te->fd = fd;
+               } else {
+                       fd = te->fd;
+               }
+       } else {
+               /* one file per page, just create it */
+               eal_get_hugefile_path(path, buflen, hi->hugedir,
+                               list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+               fd = open(path, O_CREAT | O_RDWR, 0600);
+               if (fd < 0) {
+                       RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+                                       strerror(errno));
+                       return -1;
+               }
        }
        return fd;
 }
@@ -172,6 +280,94 @@ static int lock(int fd, uint64_t offset, uint64_t len, int type)
        return 1;
 }
 
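+/*
+ * grow or shrink a single file segment's backing file at the given offset.
+ * returns 0 on success (including a shrink that couldn't take out the write
+ * lock, which is not an error), or -1 on failure.
+ */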
+static int
+resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz,
+               bool grow)
+{
+       bool again = false;
+       do {
+               if (fallocate_supported == 0) {
+                       /* we cannot deallocate memory if fallocate() is not
+                        * supported, but locks are still needed to prevent
+                        * primary process' initialization from clearing out
+                        * huge pages used by this process.
+                        */
+
+                       if (!grow) {
+                               RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
+                                       __func__);
+                               return -1;
+                       }
+                       uint64_t new_size = fa_offset + page_sz;
+                       uint64_t cur_size = get_file_size(fd);
+
+                       /* fallocate isn't supported, fall back to ftruncate */
+                       if (new_size > cur_size &&
+                                       ftruncate(fd, new_size) < 0) {
+                               RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+                                       __func__, strerror(errno));
+                               return -1;
+                       }
+                       /* not being able to take out a read lock is an error */
+                       if (lock(fd, fa_offset, page_sz, F_RDLCK) != 1)
+                               return -1;
+               } else {
+                       int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+                                       FALLOC_FL_KEEP_SIZE;
+                       int ret;
+
+                       /* if fallocate() is supported, we need to take out a
+                        * read lock on allocate (to prevent other processes
+                        * from deallocating this page), and take out a write
+                        * lock on deallocate (to ensure nobody else is using
+                        * this page).
+                        *
+                        * we can't use flock() for this, as we actually need to
+                        * lock part of the file, not the entire file.
+                        */
+
+                       if (!grow) {
+                               ret = lock(fd, fa_offset, page_sz, F_WRLCK);
+
+                               if (ret < 0)
+                                       return -1;
+                               else if (ret == 0)
+                                       /* failed to lock, not an error */
+                                       return 0;
+                       }
+                       if (fallocate(fd, flags, fa_offset, page_sz) < 0) {
+                               if (fallocate_supported == -1 &&
+                                               errno == ENOTSUP) {
+                                       RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
+                                               __func__);
+                                       again = true;
+                                       fallocate_supported = 0;
+                               } else {
+                                       RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+                                               __func__,
+                                               strerror(errno));
+                                       return -1;
+                               }
+                       } else {
+                               fallocate_supported = 1;
+
+                               if (grow) {
+                                       /* if can't read lock, it's an error */
+                                       if (lock(fd, fa_offset, page_sz,
+                                                       F_RDLCK) != 1)
+                                               return -1;
+                               } else {
+                                       /* if can't unlock, it's an error */
+                                       if (lock(fd, fa_offset, page_sz,
+                                                       F_UNLCK) != 1)
+                                               return -1;
+                               }
+                       }
+               }
+       } while (again);
+       return 0;
+}
+
 static int
 alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
                struct hugepage_info *hi, unsigned int list_idx,
@@ -191,34 +387,40 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
                return -1;
 
        alloc_sz = hi->hugepage_sz;
-
-       map_offset = 0;
-       if (ftruncate(fd, alloc_sz) < 0) {
-               RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
-                       __func__, strerror(errno));
-               goto resized;
-       }
-       /* we've allocated a page - take out a read lock. we're using fcntl()
-        * locks rather than flock() here because doing that gives us one huge
-        * advantage - fcntl() locks are per-process, not per-file descriptor,
-        * which means that we don't have to keep the original fd's around to
-        * keep a lock on the file.
-        *
-        * this is useful, because when it comes to unmapping pages, we will
-        * have to take out a write lock (to figure out if another process still
-        * has this page mapped), and to do itwith flock() we'll have to use
-        * original fd, as lock is associated with that particular fd. with
-        * fcntl(), this is not necessary - we can open a new fd and use fcntl()
-        * on that.
-        */
-       ret = lock(fd, map_offset, alloc_sz, F_RDLCK);
-
-       /* this should not fail */
-       if (ret != 1) {
-               RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
-                       __func__,
-                       strerror(errno));
-               goto resized;
+       if (internal_config.single_file_segments) {
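+               /* in this mode, each segment occupies its own page-sized
+                * slice of the list's single backing file
+                */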
+               map_offset = seg_idx * alloc_sz;
+               ret = resize_hugefile(fd, map_offset, alloc_sz, true);
+               if (ret < 0)
+                       goto resized;
+       } else {
+               map_offset = 0;
+               if (ftruncate(fd, alloc_sz) < 0) {
+                       RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+                               __func__, strerror(errno));
+                       goto resized;
+               }
+               /* we've allocated a page - take out a read lock. we're using
+                * fcntl() locks rather than flock() here because doing that
+                * gives us one huge advantage - fcntl() locks are per-process,
+                * not per-file descriptor, which means that we don't have to
+                * keep the original fd's around to keep a lock on the file.
+                *
+                * this is useful, because when it comes to unmapping pages, we
+                * will have to take out a write lock (to figure out if another
+                * process still has this page mapped), and to do it with
+                * flock() we'd have to use the original fd, as the lock is
+                * associated with that particular fd. with fcntl(), this is not
+                * necessary - we can open a new fd and use fcntl() on that.
+                */
+               ret = lock(fd, map_offset, alloc_sz, F_RDLCK);
+
+               /* this should not fail */
+               if (ret != 1) {
+                       RTE_LOG(ERR, EAL, "%s(): error locking file: %s\n",
+                               __func__,
+                               strerror(errno));
+                       goto resized;
+               }
        }
 
        /*
@@ -227,7 +429,9 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
         */
        void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
                        MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
-       close(fd);
+       /* when not using single file segments, we can close fd here */
+       if (!internal_config.single_file_segments)
+               close(fd);
 
        if (va == MAP_FAILED) {
                RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
@@ -284,11 +488,85 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 mapped:
        munmap(addr, alloc_sz);
 resized:
-       close(fd);
-       unlink(path);
+       if (internal_config.single_file_segments) {
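+               /* shrink the file back down, and remove it if it's now empty */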
+               resize_hugefile(fd, map_offset, alloc_sz, false);
+               if (is_zero_length(fd)) {
+                       struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+                       if (te != NULL && te->fd >= 0) {
+                               close(te->fd);
+                               te->fd = -1;
+                       }
+                       /* ignore errors, can't make it any worse */
+                       unlink(path);
+               }
+       } else {
+               close(fd);
+               unlink(path);
+       }
        return -1;
 }
 
+static int
+free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
+               unsigned int list_idx, unsigned int seg_idx)
+{
+       uint64_t map_offset;
+       char path[PATH_MAX];
+       int fd, ret;
+
+       /* erase page data */
+       memset(ms->addr, 0, ms->len);
+
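+       /* map anonymous memory over the segment so the VA range stays
+        * reserved for future allocations instead of going back to the OS
+        */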
+       if (mmap(ms->addr, ms->len, PROT_READ,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
+                               MAP_FAILED) {
+               RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
+               return -1;
+       }
+
+       fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+       if (fd < 0)
+               return -1;
+
+       if (internal_config.single_file_segments) {
+               map_offset = seg_idx * ms->len;
+               if (resize_hugefile(fd, map_offset, ms->len, false))
+                       return -1;
+               /* if file is zero-length, we've already shrunk it, so it's
+                * safe to remove.
+                */
+               if (is_zero_length(fd)) {
+                       struct msl_entry *te = get_msl_entry_by_idx(list_idx);
+                       if (te != NULL && te->fd >= 0) {
+                               close(te->fd);
+                               te->fd = -1;
+                       }
+                       unlink(path);
+               }
+               ret = 0;
+       } else {
+               /* if we're able to take out a write lock, we're the last one
+                * holding onto this page.
+                */
+
+               ret = lock(fd, 0, ms->len, F_WRLCK);
+               if (ret >= 0) {
+                       /* no one else is using this page */
+                       if (ret == 1)
+                               unlink(path);
+                       ret = lock(fd, 0, ms->len, F_UNLCK);
+                       if (ret != 1)
+                               RTE_LOG(ERR, EAL, "%s(): unable to unlock file %s\n",
+                                       __func__, path);
+               }
+               close(fd);
+       }
+
+       memset(ms, 0, sizeof(*ms));
+
+       return ret;
+}
+
 struct alloc_walk_param {
        struct hugepage_info *hi;
        struct rte_memseg **ms;
@@ -305,7 +583,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
        struct alloc_walk_param *wa = arg;
        struct rte_memseg_list *cur_msl;
        size_t page_sz;
-       int cur_idx;
+       int cur_idx, start_idx, j;
        unsigned int msl_idx, need, i;
 
        if (msl->page_sz != wa->page_sz)
@@ -324,6 +602,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
        cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
        if (cur_idx < 0)
                return 0;
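+       /* remember the first segment index so we can roll back on failure */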
+       start_idx = cur_idx;
 
        for (i = 0; i < need; i++, cur_idx++) {
                struct rte_memseg *cur;
@@ -341,6 +620,25 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
                        /* if exact number wasn't requested, stop */
                        if (!wa->exact)
                                goto out;
+
+                       /* clean up */
+                       for (j = start_idx; j < cur_idx; j++) {
+                               struct rte_memseg *tmp;
+                               struct rte_fbarray *arr =
+                                               &cur_msl->memseg_arr;
+
+                               tmp = rte_fbarray_get(arr, j);
+                               if (free_seg(tmp, wa->hi, msl_idx, j)) {
+                                       RTE_LOG(ERR, EAL, "Cannot free page\n");
+                                       continue;
+                               }
+
+                               rte_fbarray_set_free(arr, j);
+                       }
+                       /* clear the list */
+                       if (wa->ms)
+                               memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
                        return -1;
                }
                if (wa->ms)
@@ -351,7 +649,39 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 out:
        wa->segs_allocated = i;
        return 1;
+}
+
+struct free_walk_param {
+       struct hugepage_info *hi;
+       struct rte_memseg *ms;
+};
+static int
+free_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct rte_memseg_list *found_msl;
+       struct free_walk_param *wa = arg;
+       uintptr_t start_addr, end_addr;
+       int msl_idx, seg_idx;
+
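+       /* compute the VA range spanned by this memseg list */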
+       start_addr = (uintptr_t) msl->base_va;
+       end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
+
+       if ((uintptr_t)wa->ms->addr < start_addr ||
+                       (uintptr_t)wa->ms->addr >= end_addr)
+               return 0;
+
+       msl_idx = msl - mcfg->memsegs;
+       seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
 
+       /* msl is const */
+       found_msl = &mcfg->memsegs[msl_idx];
+
+       rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
+       if (free_seg(wa->ms, wa->hi, msl_idx, seg_idx))
+               return -1;
+
+       return 1;
 }
 
 int
@@ -427,3 +757,55 @@ eal_memalloc_alloc_seg(size_t page_sz, int socket)
        /* return pointer to newly allocated memseg */
        return ms;
 }
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
+{
+       int seg, ret = 0;
+
+       /* dynamic free not supported in legacy mode */
+       if (internal_config.legacy_mem)
+               return -1;
+
+       for (seg = 0; seg < n_segs; seg++) {
+               struct rte_memseg *cur = ms[seg];
+               struct hugepage_info *hi = NULL;
+               struct free_walk_param wa;
+               int i, walk_res;
+
+               memset(&wa, 0, sizeof(wa));
+
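+               /* find the hugepage size matching this segment */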
+               for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
+                               i++) {
+                       hi = &internal_config.hugepage_info[i];
+                       if (cur->hugepage_sz == hi->hugepage_sz)
+                               break;
+               }
+               if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
+                       RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+                       ret = -1;
+                       continue;
+               }
+
+               wa.ms = cur;
+               wa.hi = hi;
+
+               walk_res = rte_memseg_list_walk(free_seg_walk, &wa);
+               if (walk_res == 1)
+                       continue;
+               if (walk_res == 0)
+                       RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+               ret = -1;
+       }
+       return ret;
+}
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms)
+{
+       /* dynamic free not supported in legacy mode */
+       if (internal_config.legacy_mem)
+               return -1;
+
+       return eal_memalloc_free_seg_bulk(&ms, 1);
+}