mem: support --in-memory mode
author: Anatoly Burakov <anatoly.burakov@intel.com>
Fri, 13 Jul 2018 12:48:04 +0000 (13:48 +0100)
committer: Thomas Monjalon <thomas@monjalon.net>
Fri, 13 Jul 2018 13:35:43 +0000 (15:35 +0200)
Implement the final piece of the in-memory mode puzzle - enable running
DPDK entirely in memory, without creating any files.

To do it, use mmap with MAP_HUGETLB and size flags to enable DPDK to work
without hugetlbfs mountpoints. In order to enable this, a few things needed
to be changed.

First of all, we need to allow empty hugetlbfs mountpoints in
hugepage_info, and handle them correctly (by not trying to create any
files or lock any directories).

Next, we need to reorder the mapping sequence, because the page is not
really allocated until the page fault, and we cannot get its IOVA
address before we trigger the page fault.

Finally, decide at compile time whether we are going to be supporting
anonymous hugepages or not, because we cannot check for it at runtime.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
lib/librte_eal/linuxapp/eal/eal_memalloc.c
lib/librte_eal/linuxapp/eal/eal_memory.c

index 7f8e2fd..3a7d4b2 100644 (file)
@@ -18,6 +18,8 @@
 #include <sys/queue.h>
 #include <sys/stat.h>
 
+#include <linux/mman.h> /* for hugetlb-related flags */
+
 #include <rte_memory.h>
 #include <rte_eal.h>
 #include <rte_launch.h>
@@ -313,11 +315,49 @@ compare_hpi(const void *a, const void *b)
        return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
 }
 
+static void
+calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
+{
+       uint64_t total_pages = 0;
+       unsigned int i;
+
+       /*
+        * fill hpi->num_pages[]: first, try to put all hugepages into their
+        * relevant sockets, but if that first attempt finds no pages, fall
+        * back to collecting all pages in one socket and sorting them later
+        */
+       total_pages = 0;
+       /* the per-socket pass is skipped entirely for legacy init */
+       if (!internal_config.legacy_mem)
+               for (i = 0; i < rte_socket_count(); i++) {
+                       int socket = rte_socket_id_by_idx(i);
+                       unsigned int num_pages =
+                                       get_num_hugepages_on_node(
+                                               dirent->d_name, socket);
+                       hpi->num_pages[socket] = num_pages;
+                       total_pages += num_pages;
+               }
+       /*
+        * no pages were counted per socket (or the pass was skipped), so
+        * fall back to the old way and store the count in num_pages[0]
+        */
+       if (total_pages == 0) {
+               hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_64
+               /* for 32-bit systems, limit number of hugepages to
+                * 1GB per page size; only applied on this fallback path */
+               hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+                               RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
+       }
+}
+
 static int
 hugepage_info_init(void)
 {      const char dirent_start_text[] = "hugepages-";
        const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
-       unsigned int i, total_pages, num_sizes = 0;
+       unsigned int i, num_sizes = 0;
        DIR *dir;
        struct dirent *dirent;
 
@@ -355,6 +395,22 @@ hugepage_info_init(void)
                                        "%" PRIu64 " reserved, but no mounted "
                                        "hugetlbfs found for that size\n",
                                        num_pages, hpi->hugepage_sz);
+                       /* if we have kernel support for reserving hugepages
+                        * through mmap, and we're in in-memory mode, treat this
+                        * page size as valid. we cannot be in legacy mode at
+                        * this point because we've checked this earlier in the
+                        * init process.
+                        */
+#ifdef MAP_HUGE_SHIFT
+                       if (internal_config.in_memory) {
+                               RTE_LOG(DEBUG, EAL, "In-memory mode enabled, "
+                                       "hugepages of size %" PRIu64 " bytes "
+                                       "will be allocated anonymously\n",
+                                       hpi->hugepage_sz);
+                               calc_num_pages(hpi, dirent);
+                               num_sizes++;
+                       }
+#endif
                        continue;
                }
 
@@ -371,35 +427,7 @@ hugepage_info_init(void)
                if (clear_hugedir(hpi->hugedir) == -1)
                        break;
 
-               /*
-                * first, try to put all hugepages into relevant sockets, but
-                * if first attempts fails, fall back to collecting all pages
-                * in one socket and sorting them later
-                */
-               total_pages = 0;
-               /* we also don't want to do this for legacy init */
-               if (!internal_config.legacy_mem)
-                       for (i = 0; i < rte_socket_count(); i++) {
-                               int socket = rte_socket_id_by_idx(i);
-                               unsigned int num_pages =
-                                               get_num_hugepages_on_node(
-                                                       dirent->d_name, socket);
-                               hpi->num_pages[socket] = num_pages;
-                               total_pages += num_pages;
-                       }
-               /*
-                * we failed to sort memory from the get go, so fall
-                * back to old way
-                */
-               if (total_pages == 0)
-                       hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
-
-#ifndef RTE_ARCH_64
-               /* for 32-bit systems, limit number of hugepages to
-                * 1GB per page size */
-               hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
-                                           RTE_PGSIZE_1G / hpi->hugepage_sz);
-#endif
+               calc_num_pages(hpi, dirent);
 
                num_sizes++;
        }
@@ -423,8 +451,7 @@ hugepage_info_init(void)
 
                for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
                        num_pages += hpi->num_pages[j];
-               if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0 &&
-                               num_pages > 0)
+               if (num_pages > 0)
                        return 0;
        }
 
index d610923..79443c5 100644 (file)
@@ -28,6 +28,7 @@
 #include <numaif.h>
 #endif
 #include <linux/falloc.h>
+#include <linux/mman.h> /* for hugetlb-related mmap flags */
 
 #include <rte_common.h>
 #include <rte_log.h>
 #include "eal_memalloc.h"
 #include "eal_private.h"
 
+const int anonymous_hugepages_supported =
+#ifdef MAP_HUGE_SHIFT
+               1; /* MAP_HUGE_SHIFT is defined by the headers */
+#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
+#else
+               0; /* MAP_HUGE_SHIFT is missing at compile time */
+#define RTE_MAP_HUGE_SHIFT 26 /* placeholder so users still compile */
+#endif
+
 /*
  * not all kernel version support fallocate on hugetlbfs, so fall back to
  * ftruncate and disallow deallocation if fallocate is not supported.
@@ -461,6 +471,8 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
        int cur_socket_id = 0;
 #endif
        uint64_t map_offset;
+       rte_iova_t iova;
+       void *va;
        char path[PATH_MAX];
        int ret = 0;
        int fd;
@@ -468,42 +480,65 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
        int flags;
        void *new_addr;
 
-       /* takes out a read lock on segment or segment list */
-       fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
-       if (fd < 0) {
-               RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
-               return -1;
-       }
-
        alloc_sz = hi->hugepage_sz;
-       if (internal_config.single_file_segments) {
-               map_offset = seg_idx * alloc_sz;
-               ret = resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
-                               alloc_sz, true);
-               if (ret < 0)
-                       goto resized;
-       } else {
+       if (internal_config.in_memory && anonymous_hugepages_supported) {
+               int log2, flags;
+
+               log2 = rte_log2_u32(alloc_sz);
+               /* as per mmap() manpage, all page sizes are log2 of page size
+                * shifted by MAP_HUGE_SHIFT
+                */
+               flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED |
+                               MAP_PRIVATE | MAP_ANONYMOUS;
+               fd = -1;
+               va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0);
+
+               /* single-file segments codepath will never be active because
+                * in-memory mode is incompatible with it and it's stopped at
+                * EAL initialization stage, however the compiler doesn't know
+                * that and complains about map_offset being used uninitialized
+                * on failure codepaths while having in-memory mode enabled. so,
+                * assign a value here.
+                */
                map_offset = 0;
-               if (ftruncate(fd, alloc_sz) < 0) {
-                       RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
-                               __func__, strerror(errno));
-                       goto resized;
+       } else {
+               /* takes out a read lock on segment or segment list */
+               fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+               if (fd < 0) {
+                       RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
+                       return -1;
                }
-               if (internal_config.hugepage_unlink) {
-                       if (unlink(path)) {
-                               RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
+
+               if (internal_config.single_file_segments) {
+                       map_offset = seg_idx * alloc_sz;
+                       ret = resize_hugefile(fd, path, list_idx, seg_idx,
+                                       map_offset, alloc_sz, true);
+                       if (ret < 0)
+                               goto resized;
+               } else {
+                       map_offset = 0;
+                       if (ftruncate(fd, alloc_sz) < 0) {
+                               RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
                                        __func__, strerror(errno));
                                goto resized;
                        }
+                       if (internal_config.hugepage_unlink) {
+                               if (unlink(path)) {
+                                       RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
+                                               __func__, strerror(errno));
+                                       goto resized;
+                               }
+                       }
                }
-       }
 
-       /*
-        * map the segment, and populate page tables, the kernel fills this
-        * segment with zeros if it's a new page.
-        */
-       void *va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
-                       MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, map_offset);
+               /*
+                * map the segment, and populate page tables, the kernel fills
+                * this segment with zeros if it's a new page.
+                */
+               va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
+                               MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd,
+                               map_offset);
+       }
 
        if (va == MAP_FAILED) {
                RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
@@ -519,24 +554,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
                goto resized;
        }
 
-       rte_iova_t iova = rte_mem_virt2iova(addr);
-       if (iova == RTE_BAD_PHYS_ADDR) {
-               RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
-                       __func__);
-               goto mapped;
-       }
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-       move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
-
-       if (cur_socket_id != socket_id) {
-               RTE_LOG(DEBUG, EAL,
-                               "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
-                       __func__, socket_id, cur_socket_id);
-               goto mapped;
-       }
-#endif
-
        /* In linux, hugetlb limitations, like cgroup, are
         * enforced at fault time instead of mmap(), even
         * with the option of MAP_POPULATE. Kernel will send
@@ -549,9 +566,6 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
                        (unsigned int)(alloc_sz >> 20));
                goto mapped;
        }
-       /* for non-single file segments, we can close fd here */
-       if (!internal_config.single_file_segments)
-               close(fd);
 
        /* we need to trigger a write to the page to enforce page fault and
         * ensure that page is accessible to us, but we can't overwrite value
@@ -560,6 +574,28 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
         */
        *(volatile int *)addr = *(volatile int *)addr;
 
+       iova = rte_mem_virt2iova(addr);
+       if (iova == RTE_BAD_PHYS_ADDR) {
+               RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
+                       __func__);
+               goto mapped;
+       }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
+
+       if (cur_socket_id != socket_id) {
+               RTE_LOG(DEBUG, EAL,
+                               "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
+                       __func__, socket_id, cur_socket_id);
+               goto mapped;
+       }
+#endif
+       /* for non-single file segments that aren't in-memory, we can close fd
+        * here */
+       if (!internal_config.single_file_segments && !internal_config.in_memory)
+               close(fd);
+
        ms->addr = addr;
        ms->hugepage_sz = alloc_sz;
        ms->len = alloc_sz;
@@ -588,6 +624,7 @@ unmapped:
                RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
        }
 resized:
+       /* in-memory mode will never be single-file-segments mode */
        if (internal_config.single_file_segments) {
                resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
                                alloc_sz, false);
@@ -595,6 +632,7 @@ resized:
        } else {
                /* only remove file if we can take out a write lock */
                if (internal_config.hugepage_unlink == 0 &&
+                               internal_config.in_memory == 0 &&
                                lock(fd, LOCK_EX) == 1)
                        unlink(path);
                close(fd);
@@ -705,7 +743,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
         * during init, we already hold a write lock, so don't try to take out
         * another one.
         */
-       if (wa->hi->lock_descriptor == -1) {
+       if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
                dir_fd = open(wa->hi->hugedir, O_RDONLY);
                if (dir_fd < 0) {
                        RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
@@ -809,7 +847,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
         * during init, we already hold a write lock, so don't try to take out
         * another one.
         */
-       if (wa->hi->lock_descriptor == -1) {
+       if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
                dir_fd = open(wa->hi->hugedir, O_RDONLY);
                if (dir_fd < 0) {
                        RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
index ddfa8b1..dbf1949 100644 (file)
@@ -1088,8 +1088,7 @@ get_socket_mem_size(int socket)
 
        for (i = 0; i < internal_config.num_hugepage_sizes; i++){
                struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-               if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0)
-                       size += hpi->hugepage_sz * hpi->num_pages[socket];
+               size += hpi->hugepage_sz * hpi->num_pages[socket];
        }
 
        return size;