/* this function is only called from eal_hugepage_info_init which itself
* is only called from a primary process */
static uint32_t
-get_num_hugepages(const char *subdir, size_t sz)
+get_num_hugepages(const char *subdir, size_t sz, unsigned int reusable_pages)
{
unsigned long resv_pages, num_pages, over_pages, surplus_pages;
const char *nr_hp_file = "free_hugepages";
else
over_pages = 0;
- if (num_pages == 0 && over_pages == 0)
+ if (num_pages == 0 && over_pages == 0 && reusable_pages == 0)
RTE_LOG(WARNING, EAL, "No available %zu kB hugepages reported\n",
sz >> 10);
if (num_pages < over_pages) /* overflow */
num_pages = UINT32_MAX;
+ num_pages += reusable_pages;
+ if (num_pages < reusable_pages) /* overflow */
+ num_pages = UINT32_MAX;
+
/* we want to return a uint32_t and more than this looks suspicious
* anyway ... */
if (num_pages > UINT32_MAX)
return -1;
}
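/*
 * A standalone sketch of the saturating-add idiom used above: after an
 * unsigned addition, a sum smaller than either operand signals wraparound,
 * so the result is clamped to the maximum. Helper name is illustrative,
 * not part of the patch.
 */
#include <stdint.h>

static uint32_t
saturating_add_u32(uint32_t a, uint32_t b)
{
	uint32_t sum = a + b;

	if (sum < b) /* wrapped around */
		sum = UINT32_MAX;
	return sum;
}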
+struct walk_hugedir_data {
+ int dir_fd;
+ int file_fd;
+ const char *file_name;
+ void *user_data;
+};
+
+typedef void (walk_hugedir_t)(const struct walk_hugedir_data *whd);
+
/*
- * Clear the hugepage directory of whatever hugepage files
- * there are. Checks if the file is locked (i.e.
- * if it's in use by another DPDK process).
+ * Search the hugepage directory for whatever hugepage files there are.
+ * Check if the file is in use by another DPDK process.
+ * If not, execute a callback on it.
*/
static int
-clear_hugedir(const char * hugedir)
+walk_hugedir(const char *hugedir, walk_hugedir_t *cb, void *user_data)
{
DIR *dir;
struct dirent *dirent;
int dir_fd, fd, lck_result;
const char filter[] = "*map_*"; /* matches hugepage files */
- /* open directory */
dir = opendir(hugedir);
if (!dir) {
RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n",
goto error;
}
- while(dirent != NULL){
+ while (dirent != NULL) {
/* skip files that don't match the hugepage pattern */
if (fnmatch(filter, dirent->d_name, 0) > 0) {
dirent = readdir(dir);
/* non-blocking lock */
lck_result = flock(fd, LOCK_EX | LOCK_NB);
- /* if lock succeeds, remove the file */
+ /* if lock succeeds, execute callback */
if (lck_result != -1)
- unlinkat(dir_fd, dirent->d_name, 0);
+ cb(&(struct walk_hugedir_data){
+ .dir_fd = dir_fd,
+ .file_fd = fd,
+ .file_name = dirent->d_name,
+ .user_data = user_data,
+ });
+
close(fd);
dirent = readdir(dir);
}
if (dir)
closedir(dir);
- RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n",
+ RTE_LOG(ERR, EAL, "Error while walking hugepage dir: %s\n",
strerror(errno));
return -1;
}
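/*
 * walk_hugedir() relies on advisory locking: every process that maps a
 * hugepage file keeps a shared lock (LOCK_SH) on it, so a successful
 * non-blocking exclusive lock proves the file is unused. A minimal probe
 * along those lines (hypothetical helper, not part of the patch):
 */
#include <stdbool.h>
#include <sys/file.h>

static bool
hugefile_is_unused(int fd)
{
	/* LOCK_NB: fail with EWOULDBLOCK instead of waiting */
	if (flock(fd, LOCK_EX | LOCK_NB) == -1)
		return false; /* locked by another process (or error) */
	flock(fd, LOCK_UN); /* drop the probe lock */
	return true;
}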
+static void
+clear_hugedir_cb(const struct walk_hugedir_data *whd)
+{
+ unlinkat(whd->dir_fd, whd->file_name, 0);
+}
+
+/* Remove hugepage files not used by other DPDK processes from a directory. */
+static int
+clear_hugedir(const char *hugedir)
+{
+ return walk_hugedir(hugedir, clear_hugedir_cb, NULL);
+}
+
+static void
+inspect_hugedir_cb(const struct walk_hugedir_data *whd)
+{
+ uint64_t *total_size = whd->user_data;
+ struct stat st;
+
+ if (fstat(whd->file_fd, &st) < 0)
+ RTE_LOG(DEBUG, EAL, "%s(): stat(\"%s\") failed: %s\n",
+ __func__, whd->file_name, strerror(errno));
+ else
+ (*total_size) += st.st_size;
+}
+
+/*
+ * Count the total size in bytes of all files in the directory
+ * not mapped by other DPDK processes.
+ */
+static int
+inspect_hugedir(const char *hugedir, uint64_t *total_size)
+{
+ return walk_hugedir(hugedir, inspect_hugedir_cb, total_size);
+}
+
static int
compare_hpi(const void *a, const void *b)
{
}
static void
-calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
+calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent,
+ unsigned int reusable_pages)
{
uint64_t total_pages = 0;
unsigned int i;
* in one socket and sorting them later
*/
total_pages = 0;
- /* we also don't want to do this for legacy init */
- if (!internal_conf->legacy_mem)
+
+ /*
+ * We also don't want to do this for legacy init.
+ * When there are hugepage files to reuse it is unknown
+ * what NUMA node the pages are on.
+ * The node could be determined by mapping the files,
+ * but mapping is precisely what hugepage file reuse is trying to avoid.
+ */
+ if (!internal_conf->legacy_mem && reusable_pages == 0)
for (i = 0; i < rte_socket_count(); i++) {
int socket = rte_socket_id_by_idx(i);
unsigned int num_pages =
*/
if (total_pages == 0) {
hpi->num_pages[0] = get_num_hugepages(dirent->d_name,
- hpi->hugepage_sz);
+ hpi->hugepage_sz, reusable_pages);
#ifndef RTE_ARCH_64
/* for 32-bit systems, limit number of hugepages to
{ const char dirent_start_text[] = "hugepages-";
const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
unsigned int i, num_sizes = 0;
+ uint64_t reusable_bytes;
+ unsigned int reusable_pages;
DIR *dir;
struct dirent *dirent;
struct internal_config *internal_conf =
uint32_t num_pages;
num_pages = get_num_hugepages(dirent->d_name,
- hpi->hugepage_sz);
+ hpi->hugepage_sz, 0);
if (num_pages > 0)
RTE_LOG(NOTICE, EAL,
"%" PRIu32 " hugepages of size "
"hugepages of size %" PRIu64 " bytes "
"will be allocated anonymously\n",
hpi->hugepage_sz);
- calc_num_pages(hpi, dirent);
+ calc_num_pages(hpi, dirent, 0);
num_sizes++;
}
#endif
"Failed to lock hugepage directory!\n");
break;
}
- /* clear out the hugepages dir from unused pages */
- if (clear_hugedir(hpi->hugedir) == -1)
- break;
- calc_num_pages(hpi, dirent);
+ /*
+ * Check for existing hugepage files and either remove them
+ * or count how many of them can be reused.
+ */
+ reusable_pages = 0;
+ if (!internal_conf->hugepage_file.unlink_existing) {
+ reusable_bytes = 0;
+ if (inspect_hugedir(hpi->hugedir,
+ &reusable_bytes) < 0)
+ break;
+ RTE_ASSERT(reusable_bytes % hpi->hugepage_sz == 0);
+ reusable_pages = reusable_bytes / hpi->hugepage_sz;
+ } else if (clear_hugedir(hpi->hugedir) < 0) {
+ break;
+ }
+ calc_num_pages(hpi, dirent, reusable_pages);
num_sizes++;
}
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
- unsigned int list_idx, unsigned int seg_idx)
+ unsigned int list_idx, unsigned int seg_idx,
+ bool *dirty)
{
int fd;
+ int *out_fd;
+ struct stat st;
+ int ret;
const struct internal_config *internal_conf =
eal_get_internal_configuration();
+ if (dirty != NULL)
+ *dirty = false;
+
/* for in-memory mode, we only make it here when we're sure we support
* memfd, and this is a special case.
*/
return get_seg_memfd(hi, list_idx, seg_idx);
if (internal_conf->single_file_segments) {
- /* create a hugepage file path */
+ out_fd = &fd_list[list_idx].memseg_list_fd;
eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
-
- fd = fd_list[list_idx].memseg_list_fd;
-
- if (fd < 0) {
- fd = open(path, O_CREAT | O_RDWR, 0600);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "%s(): open '%s' failed: %s\n",
- __func__, path, strerror(errno));
- return -1;
- }
- /* take out a read lock and keep it indefinitely */
- if (lock(fd, LOCK_SH) < 0) {
- RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
- __func__, strerror(errno));
- close(fd);
- return -1;
- }
- fd_list[list_idx].memseg_list_fd = fd;
- }
} else {
- /* create a hugepage file path */
+ out_fd = &fd_list[list_idx].fds[seg_idx];
eal_get_hugefile_path(path, buflen, hi->hugedir,
list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+ }
+ fd = *out_fd;
+ if (fd >= 0)
+ return fd;
- fd = fd_list[list_idx].fds[seg_idx];
-
- if (fd < 0) {
- /* A primary process is the only one creating these
- * files. If there is a leftover that was not cleaned
- * by clear_hugedir(), we must *now* make sure to drop
- * the file or we will remap old stuff while the rest
- * of the code is built on the assumption that a new
- * page is clean.
- */
- if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
- unlink(path) == -1 &&
- errno != ENOENT) {
- RTE_LOG(DEBUG, EAL, "%s(): could not remove '%s': %s\n",
- __func__, path, strerror(errno));
- return -1;
- }
+ /*
+ * There is no TOCTOU between stat() and unlink()/open()
+ * because the hugepage directory is locked.
+ */
+ ret = stat(path, &st);
+ if (ret < 0 && errno != ENOENT) {
+ RTE_LOG(DEBUG, EAL, "%s(): stat() for '%s' failed: %s\n",
+ __func__, path, strerror(errno));
+ return -1;
+ }
+ if (!internal_conf->hugepage_file.unlink_existing && ret == 0 &&
+ dirty != NULL)
+ *dirty = true;
- fd = open(path, O_CREAT | O_RDWR, 0600);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "%s(): open '%s' failed: %s\n",
- __func__, path, strerror(errno));
- return -1;
- }
- /* take out a read lock */
- if (lock(fd, LOCK_SH) < 0) {
- RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
- __func__, strerror(errno));
- close(fd);
- return -1;
- }
- fd_list[list_idx].fds[seg_idx] = fd;
+ /*
+ * The kernel clears a hugepage only when it is mapped
+ * from a particular file for the first time.
+ * If the file already exists, the old content will be mapped.
+ * If the memory manager assumes all mapped pages to be clean,
+ * the file must be removed and created anew.
+ * Otherwise, the primary caller must be notified
+ * that mapped pages will be dirty
+ * (secondary callers receive the segment state from the primary one).
+ * When multiple hugepages are mapped from the same file,
+ * whether they will be dirty depends on the part that is mapped.
+ */
+ if (!internal_conf->single_file_segments &&
+ internal_conf->hugepage_file.unlink_existing &&
+ rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ ret == 0) {
+ /* coverity[toctou] */
+ if (unlink(path) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): could not remove '%s': %s\n",
+ __func__, path, strerror(errno));
+ return -1;
}
}
+
+ /* coverity[toctou] */
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "%s(): open '%s' failed: %s\n",
+ __func__, path, strerror(errno));
+ return -1;
+ }
+ /* take out a read lock */
+ if (lock(fd, LOCK_SH) < 0) {
+ RTE_LOG(ERR, EAL, "%s(): lock '%s' failed: %s\n",
+ __func__, path, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ *out_fd = fd;
return fd;
}
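/*
 * Downstream effect of the new "dirty" flag: alloc_seg() below tags reused
 * segments with RTE_MEMSEG_FLAG_DIRTY, so any code that needs zeroed pages
 * must clear them explicitly. A hypothetical sketch of such a consumer
 * (in DPDK itself the malloc layer consumes the flag, not an application
 * walk):
 */
#include <string.h>
#include <rte_common.h>
#include <rte_memory.h>

static int
scrub_dirty_seg(const struct rte_memseg *ms, void *arg __rte_unused)
{
	if (ms->flags & RTE_MEMSEG_FLAG_DIRTY)
		memset(ms->addr, 0, ms->len);
	return 0; /* continue the walk */
}

/* Usage: rte_memseg_walk(scrub_dirty_seg, NULL); */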
static int
resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
- bool grow)
+ bool grow, bool *dirty)
{
+ const struct internal_config *internal_conf =
+ eal_get_internal_configuration();
bool again = false;
do {
uint64_t cur_size = get_file_size(fd);
/* fallocate isn't supported, fall back to ftruncate */
+ if (dirty != NULL)
+ *dirty = new_size <= cur_size;
if (new_size > cur_size &&
ftruncate(fd, new_size) < 0) {
RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
strerror(errno));
return -1;
}
- } else
+ } else {
fallocate_supported = 1;
+ /*
+ * It is unknown which portions of an existing
+ * hugepage file were allocated previously,
+ * so all pages within the file are considered
+ * dirty, unless the file is a fresh one.
+ */
+ if (dirty != NULL)
+ *dirty &= !internal_conf->hugepage_file.unlink_existing;
+ }
}
} while (again);
}
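/*
 * For reference, the two fallocate() shapes the function above depends on:
 * plain allocation to grow a range, and hole punching (which requires
 * FALLOC_FL_KEEP_SIZE) to shrink it. Standalone sketch; on filesystems
 * without fallocate() support the call fails with ENOTSUP, which is what
 * triggers the ftruncate() fallback in the real code.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>

static int
fallocate_range(int fd, uint64_t offset, uint64_t len, bool grow)
{
	int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;

	return fallocate(fd, flags, offset, len);
}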
static int
-resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow)
+resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow,
+ bool *dirty)
{
/* in-memory mode is a special case, because we can be sure that
* fallocate() is supported.
const struct internal_config *internal_conf =
eal_get_internal_configuration();
- if (internal_conf->in_memory)
+ if (internal_conf->in_memory) {
+ if (dirty != NULL)
+ *dirty = false;
return resize_hugefile_in_memory(fd, fa_offset,
page_sz, grow);
+ }
return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
- grow);
+ grow, dirty);
}
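/*
 * Why in-memory mode can skip the fallback: segments there are backed by
 * memfds, and memfd/tmpfs always implements fallocate(). A hypothetical
 * sketch (EAL's memfd path additionally requests MFD_HUGETLB and a
 * page-size flag):
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

static int
make_in_memory_fd(size_t size)
{
	int fd = memfd_create("example_seg", MFD_CLOEXEC);

	if (fd < 0)
		return -1;
	if (fallocate(fd, 0, 0, size) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}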
static int
char path[PATH_MAX];
int ret = 0;
int fd;
+ bool dirty;
size_t alloc_sz;
int flags;
void *new_addr;
pagesz_flag = pagesz_flags(alloc_sz);
fd = -1;
+ dirty = false;
mmap_flags = in_memory_flags | pagesz_flag;
/* single-file segments codepath will never be active
map_offset = 0;
} else {
/* takes out a read lock on segment or segment list */
- fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx,
+ &dirty);
if (fd < 0) {
RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
return -1;
if (internal_conf->single_file_segments) {
map_offset = seg_idx * alloc_sz;
- ret = resize_hugefile(fd, map_offset, alloc_sz, true);
+ ret = resize_hugefile(fd, map_offset, alloc_sz, true,
+ &dirty);
if (ret < 0)
goto resized;
ms->nrank = rte_memory_get_nrank();
ms->iova = iova;
ms->socket_id = socket_id;
+ ms->flags = dirty ? RTE_MEMSEG_FLAG_DIRTY : 0;
return 0;
return -1;
if (internal_conf->single_file_segments) {
- resize_hugefile(fd, map_offset, alloc_sz, false);
+ resize_hugefile(fd, map_offset, alloc_sz, false, NULL);
/* ignore failure, can't make it any worse */
/* if refcount is at zero, close the file */
* segment and thus drop the lock on original fd, but hugepage dir is
* now locked so we can take out another one without races.
*/
- fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx, NULL);
if (fd < 0)
return -1;
if (internal_conf->single_file_segments) {
map_offset = seg_idx * ms->len;
- if (resize_hugefile(fd, map_offset, ms->len, false))
+ if (resize_hugefile(fd, map_offset, ms->len, false, NULL))
return -1;
if (--(fd_list[list_idx].count) == 0)
* holding onto this page.
*/
if (!internal_conf->in_memory &&
+ internal_conf->hugepage_file.unlink_existing &&
!internal_conf->hugepage_file.unlink_before_mapping) {
ret = lock(fd, LOCK_EX);
if (ret >= 0) {
RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
return -1;
}
+ /* safety net, should be impossible to configure */
+ if (internal_conf->hugepage_file.unlink_before_mapping &&
+ !internal_conf->hugepage_file.unlink_existing) {
+ RTE_LOG(ERR, EAL, "Unlinking existing hugepage files is prohibited, cannot unlink them before mapping.\n");
+ return -1;
+ }
}
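/*
 * Resulting matrix of hugepage file handling (field names as used above;
 * the command-line spelling of the corresponding options may differ):
 *
 *   unlink_existing  unlink_before_mapping  behavior
 *   true             false                  remove leftover files, keep in-use files (default)
 *   true             true                   unlink each file right after mapping it
 *   false            false                  reuse leftover files, mark contents dirty
 *   false            true                   rejected by the safety net above
 */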
/* initialize all of the fd lists */